mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-25 00:59:28 -04:00
Compare commits
4 Commits
fix/https-
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3c63431e46 | ||
|
|
3f647a2764 | ||
|
|
f88981cdce | ||
|
|
0d6de15ae9 |
@@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
|
IK_LLAMA_VERSION?=d5507e33ae7ee2b7b41475f08044d3bde3b839ee
|
||||||
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
||||||
|
|
||||||
CMAKE_ARGS?=
|
CMAKE_ARGS?=
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# omnivoice.cpp version
|
# omnivoice.cpp version
|
||||||
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
|
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
|
||||||
OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
|
OMNIVOICE_VERSION?=0f37401bebe9b20c0160a888e592108fc1d17607
|
||||||
SO_TARGET?=libgomnivoicecpp.so
|
SO_TARGET?=libgomnivoicecpp.so
|
||||||
|
|
||||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||||
|
|||||||
@@ -140,7 +140,7 @@ type RunCMD struct {
|
|||||||
OIDCIssuer string `env:"LOCALAI_OIDC_ISSUER" help:"OIDC issuer URL for auto-discovery" group:"auth"`
|
OIDCIssuer string `env:"LOCALAI_OIDC_ISSUER" help:"OIDC issuer URL for auto-discovery" group:"auth"`
|
||||||
OIDCClientID string `env:"LOCALAI_OIDC_CLIENT_ID" help:"OIDC Client ID (auto-enables auth)" group:"auth"`
|
OIDCClientID string `env:"LOCALAI_OIDC_CLIENT_ID" help:"OIDC Client ID (auto-enables auth)" group:"auth"`
|
||||||
OIDCClientSecret string `env:"LOCALAI_OIDC_CLIENT_SECRET" help:"OIDC Client Secret" group:"auth"`
|
OIDCClientSecret string `env:"LOCALAI_OIDC_CLIENT_SECRET" help:"OIDC Client Secret" group:"auth"`
|
||||||
ExternalBaseURL string `env:"LOCALAI_BASE_URL" help:"External base URL of this instance (e.g. https://localhost:8080). Used for OAuth callbacks and self-referential links (generated images/videos, job status). When unset, derived from X-Forwarded-Proto/Host or Forwarded headers." group:"api"`
|
AuthBaseURL string `env:"LOCALAI_BASE_URL" help:"Base URL for OAuth callbacks (e.g. http://localhost:8080)" group:"auth"`
|
||||||
AuthAdminEmail string `env:"LOCALAI_ADMIN_EMAIL" help:"Email address to auto-promote to admin role" group:"auth"`
|
AuthAdminEmail string `env:"LOCALAI_ADMIN_EMAIL" help:"Email address to auto-promote to admin role" group:"auth"`
|
||||||
AuthRegistrationMode string `env:"LOCALAI_REGISTRATION_MODE" default:"open" help:"Registration mode: 'open' (default), 'approval', or 'invite' (invite code required)" group:"auth"`
|
AuthRegistrationMode string `env:"LOCALAI_REGISTRATION_MODE" default:"open" help:"Registration mode: 'open' (default), 'approval', or 'invite' (invite code required)" group:"auth"`
|
||||||
DisableLocalAuth bool `env:"LOCALAI_DISABLE_LOCAL_AUTH" default:"false" help:"Disable local email/password registration and login (use with OAuth/OIDC-only setups)" group:"auth"`
|
DisableLocalAuth bool `env:"LOCALAI_DISABLE_LOCAL_AUTH" default:"false" help:"Disable local email/password registration and login (use with OAuth/OIDC-only setups)" group:"auth"`
|
||||||
@@ -503,6 +503,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
|||||||
opts = append(opts, config.WithAuthOIDCClientID(r.OIDCClientID))
|
opts = append(opts, config.WithAuthOIDCClientID(r.OIDCClientID))
|
||||||
opts = append(opts, config.WithAuthOIDCClientSecret(r.OIDCClientSecret))
|
opts = append(opts, config.WithAuthOIDCClientSecret(r.OIDCClientSecret))
|
||||||
}
|
}
|
||||||
|
if r.AuthBaseURL != "" {
|
||||||
|
opts = append(opts, config.WithAuthBaseURL(r.AuthBaseURL))
|
||||||
|
}
|
||||||
if r.AuthAdminEmail != "" {
|
if r.AuthAdminEmail != "" {
|
||||||
opts = append(opts, config.WithAuthAdminEmail(r.AuthAdminEmail))
|
opts = append(opts, config.WithAuthAdminEmail(r.AuthAdminEmail))
|
||||||
}
|
}
|
||||||
@@ -520,12 +523,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Applied unconditionally: the external base URL governs all self-referential
|
|
||||||
// links (not just OAuth callbacks), so it must take effect even when auth is off.
|
|
||||||
if r.ExternalBaseURL != "" {
|
|
||||||
opts = append(opts, config.WithExternalBaseURL(r.ExternalBaseURL))
|
|
||||||
}
|
|
||||||
|
|
||||||
if idleWatchDog || busyWatchDog {
|
if idleWatchDog || busyWatchDog {
|
||||||
opts = append(opts, config.EnableWatchDog)
|
opts = append(opts, config.EnableWatchDog)
|
||||||
if idleWatchDog {
|
if idleWatchDog {
|
||||||
|
|||||||
@@ -49,13 +49,6 @@ type ApplicationConfig struct {
|
|||||||
P2PNetworkID string
|
P2PNetworkID string
|
||||||
Federated bool
|
Federated bool
|
||||||
|
|
||||||
// ExternalBaseURL is the externally visible base URL of this instance
|
|
||||||
// (scheme+host[:port]), set via LOCALAI_BASE_URL. When non-empty it is
|
|
||||||
// authoritative for every self-referential URL LocalAI emits (OAuth
|
|
||||||
// callbacks, generated image/video links, async job StatusURLs),
|
|
||||||
// overriding proxy-header detection. Empty = derive from request headers.
|
|
||||||
ExternalBaseURL string
|
|
||||||
|
|
||||||
// DisableStats turns off per-request token tracking. By default the
|
// DisableStats turns off per-request token tracking. By default the
|
||||||
// routing module's billing recorder runs in every mode (including
|
// routing module's billing recorder runs in every mode (including
|
||||||
// no-auth single-user) so dashboards and `/api/usage` are immediately
|
// no-auth single-user) so dashboards and `/api/usage` are immediately
|
||||||
@@ -203,6 +196,7 @@ type AuthConfig struct {
|
|||||||
OIDCIssuer string // OIDC issuer URL for auto-discovery (e.g. https://accounts.google.com)
|
OIDCIssuer string // OIDC issuer URL for auto-discovery (e.g. https://accounts.google.com)
|
||||||
OIDCClientID string
|
OIDCClientID string
|
||||||
OIDCClientSecret string
|
OIDCClientSecret string
|
||||||
|
BaseURL string // for OAuth callback URLs (e.g. "http://localhost:8080")
|
||||||
AdminEmail string // auto-promote to admin on login
|
AdminEmail string // auto-promote to admin on login
|
||||||
RegistrationMode string // "open", "approval" (default when empty), "invite"
|
RegistrationMode string // "open", "approval" (default when empty), "invite"
|
||||||
DisableLocalAuth bool // disable local email/password registration and login
|
DisableLocalAuth bool // disable local email/password registration and login
|
||||||
@@ -956,9 +950,9 @@ func WithAuthGitHubClientSecret(clientSecret string) AppOption {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func WithExternalBaseURL(url string) AppOption {
|
func WithAuthBaseURL(baseURL string) AppOption {
|
||||||
return func(o *ApplicationConfig) {
|
return func(o *ApplicationConfig) {
|
||||||
o.ExternalBaseURL = url
|
o.Auth.BaseURL = baseURL
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -54,8 +54,35 @@ func (g GPU) IsNVIDIABlackwell() bool {
|
|||||||
return maj >= 12
|
return maj >= 12
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compute-buffer headroom guard for the raised physical batch.
|
||||||
|
//
|
||||||
|
// Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward
|
||||||
|
// graph), which is allocated PER DEVICE — it does not benefit from a second GPU
|
||||||
|
// the way weights or KV (which are split across devices) do. The buffer scales
|
||||||
|
// ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned
|
||||||
|
// ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a
|
||||||
|
// 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485),
|
||||||
|
// even though the GB10 it was measured on (128 GiB unified memory) had room.
|
||||||
|
//
|
||||||
|
// These constants size a conservative guard: only raise the batch when the
|
||||||
|
// extra scratch fits the per-device VRAM ceiling.
|
||||||
|
const (
|
||||||
|
// computeBufferBytesPerCell approximates the CUDA compute-buffer cost of one
|
||||||
|
// (n_ubatch * n_ctx) cell. Derived from an observed allocation (ub2048 *
|
||||||
|
// ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for margin, since
|
||||||
|
// the real cost also grows with model width (heads / embedding dim) which we
|
||||||
|
// don't know at config time.
|
||||||
|
computeBufferBytesPerCell = 16
|
||||||
|
// blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the
|
||||||
|
// physical batch at VRAM/divisor. /4 keeps the bulk of a device for weights +
|
||||||
|
// KV, which already dominate VRAM use.
|
||||||
|
blackwellBatchHeadroomDivisor = 4
|
||||||
|
)
|
||||||
|
|
||||||
// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
|
// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
|
||||||
// given hardware, used when the model config leaves batch unset.
|
// given hardware class, ignoring context/VRAM headroom. Use
|
||||||
|
// PhysicalBatchForContext when a model context and per-device VRAM are known
|
||||||
|
// (the load paths) so the raised batch can't overflow a single device.
|
||||||
func PhysicalBatch(g GPU) int {
|
func PhysicalBatch(g GPU) int {
|
||||||
if g.IsNVIDIABlackwell() {
|
if g.IsNVIDIABlackwell() {
|
||||||
return BlackwellPhysicalBatch
|
return BlackwellPhysicalBatch
|
||||||
@@ -63,6 +90,32 @@ func PhysicalBatch(g GPU) int {
|
|||||||
return DefaultPhysicalBatch
|
return DefaultPhysicalBatch
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PhysicalBatchForContext is PhysicalBatch gated on per-device VRAM headroom for
|
||||||
|
// the given context: it only raises the batch above the conservative default
|
||||||
|
// when the extra compute buffer (which is allocated on a single device and grows
|
||||||
|
// with n_ubatch * n_ctx) fits within blackwellBatchHeadroomDivisor of the GPU's
|
||||||
|
// VRAM. g.VRAM must be the PER-DEVICE ceiling (the smallest device on a
|
||||||
|
// multi-GPU host), not the summed total — the compute buffer can't be split.
|
||||||
|
//
|
||||||
|
// VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the
|
||||||
|
// GB10 / unified-memory path reports system RAM, so it still clears the guard.
|
||||||
|
func PhysicalBatchForContext(g GPU, ctx int) int {
|
||||||
|
if !g.IsNVIDIABlackwell() {
|
||||||
|
return DefaultPhysicalBatch
|
||||||
|
}
|
||||||
|
if ctx <= 0 {
|
||||||
|
ctx = DefaultContextSize
|
||||||
|
}
|
||||||
|
if g.VRAM == 0 {
|
||||||
|
return DefaultPhysicalBatch
|
||||||
|
}
|
||||||
|
extra := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch) * computeBufferBytesPerCell
|
||||||
|
if extra <= g.VRAM/blackwellBatchHeadroomDivisor {
|
||||||
|
return BlackwellPhysicalBatch
|
||||||
|
}
|
||||||
|
return DefaultPhysicalBatch
|
||||||
|
}
|
||||||
|
|
||||||
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
||||||
// Callers that re-tune a value chosen by an upstream host (the distributed
|
// Callers that re-tune a value chosen by an upstream host (the distributed
|
||||||
// router correcting the frontend's guess) use this to avoid clobbering an
|
// router correcting the frontend's guess) use this to avoid clobbering an
|
||||||
@@ -122,7 +175,12 @@ func hasParallelOption(opts []string) bool {
|
|||||||
// deterministic device — detection does a live nvidia-smi call.
|
// deterministic device — detection does a live nvidia-smi call.
|
||||||
var localGPU = func() GPU {
|
var localGPU = func() GPU {
|
||||||
vendor, _ := xsysinfo.DetectGPUVendor()
|
vendor, _ := xsysinfo.DetectGPUVendor()
|
||||||
vram, _ := xsysinfo.TotalAvailableVRAM()
|
// Use the SMALLEST device's VRAM, not the summed total: the parallel-slot
|
||||||
|
// tier and the batch headroom guard both reason about what fits on a single
|
||||||
|
// card, and per-device compute buffers can't be split across GPUs. Summing
|
||||||
|
// two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts
|
||||||
|
// into OOM (issue #10485).
|
||||||
|
vram, _ := xsysinfo.MinPerGPUVRAM()
|
||||||
return GPU{
|
return GPU{
|
||||||
Vendor: vendor,
|
Vendor: vendor,
|
||||||
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
||||||
@@ -137,10 +195,20 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
|||||||
if cfg == nil {
|
if cfg == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
|
// Raise the physical batch on Blackwell only when the resulting compute
|
||||||
cfg.Batch = BlackwellPhysicalBatch
|
// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
|
||||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
// (rather than writing the default 512) preserves the downstream single-pass
|
||||||
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
|
// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
|
||||||
|
if cfg.Batch == 0 {
|
||||||
|
ctx := DefaultContextSize
|
||||||
|
if cfg.ContextSize != nil {
|
||||||
|
ctx = *cfg.ContextSize
|
||||||
|
}
|
||||||
|
if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
|
||||||
|
cfg.Batch = BlackwellPhysicalBatch
|
||||||
|
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||||
|
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enable concurrent serving by default on a capable GPU: without this the
|
// Enable concurrent serving by default on a capable GPU: without this the
|
||||||
|
|||||||
@@ -9,26 +9,37 @@ import (
|
|||||||
// GPU. The detection seam (localGPU) is injected so the path is deterministic
|
// GPU. The detection seam (localGPU) is injected so the path is deterministic
|
||||||
// without a real GPU.
|
// without a real GPU.
|
||||||
var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
|
var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
|
||||||
|
const gib = uint64(1) << 30
|
||||||
|
|
||||||
var orig func() GPU
|
var orig func() GPU
|
||||||
BeforeEach(func() { orig = localGPU })
|
BeforeEach(func() { orig = localGPU })
|
||||||
AfterEach(func() { localGPU = orig })
|
AfterEach(func() { localGPU = orig })
|
||||||
|
|
||||||
It("sets the physical batch on a local Blackwell GPU", func() {
|
It("sets the physical batch on a local Blackwell GPU with headroom", func() {
|
||||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
|
||||||
cfg := &ModelConfig{}
|
cfg := &ModelConfig{}
|
||||||
cfg.SetDefaults()
|
cfg.SetDefaults()
|
||||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||||
})
|
})
|
||||||
|
|
||||||
|
It("leaves batch unset when a large context would overflow the device", func() {
|
||||||
|
// Regression guard for issue #10485: 16 GiB consumer Blackwell + ~200k ctx.
|
||||||
|
localGPU = func() GPU { return GPU{ComputeCapability: "12.0", VRAM: 16 * gib} }
|
||||||
|
ctx := 204800
|
||||||
|
cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
|
||||||
|
cfg.SetDefaults()
|
||||||
|
Expect(cfg.Batch).To(Equal(0))
|
||||||
|
})
|
||||||
|
|
||||||
It("leaves batch unset on a non-Blackwell local GPU", func() {
|
It("leaves batch unset on a non-Blackwell local GPU", func() {
|
||||||
localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
|
localGPU = func() GPU { return GPU{ComputeCapability: "8.9", VRAM: 119 * gib} }
|
||||||
cfg := &ModelConfig{}
|
cfg := &ModelConfig{}
|
||||||
cfg.SetDefaults()
|
cfg.SetDefaults()
|
||||||
Expect(cfg.Batch).To(Equal(0))
|
Expect(cfg.Batch).To(Equal(0))
|
||||||
})
|
})
|
||||||
|
|
||||||
It("never overrides an explicit batch", func() {
|
It("never overrides an explicit batch", func() {
|
||||||
localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
|
localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
|
||||||
cfg := &ModelConfig{}
|
cfg := &ModelConfig{}
|
||||||
cfg.Batch = 1024
|
cfg.Batch = 1024
|
||||||
cfg.SetDefaults()
|
cfg.SetDefaults()
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var _ = Describe("Hardware-driven config defaults", func() {
|
var _ = Describe("Hardware-driven config defaults", func() {
|
||||||
|
const gib = uint64(1) << 30
|
||||||
|
|
||||||
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
|
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
|
||||||
func(cc string, want bool) {
|
func(cc string, want bool) {
|
||||||
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
|
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
|
||||||
@@ -35,21 +37,54 @@ var _ = Describe("Hardware-driven config defaults", func() {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
|
||||||
|
It("raises the batch when the compute buffer fits the device", func() {
|
||||||
|
// 16 GiB Blackwell with a small context: the extra scratch is tiny.
|
||||||
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)).
|
||||||
|
To(Equal(BlackwellPhysicalBatch))
|
||||||
|
})
|
||||||
|
It("keeps the default batch when a large context would overflow one device", func() {
|
||||||
|
// The issue #10485 case: 16 GiB consumer Blackwell, ~200k context.
|
||||||
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)).
|
||||||
|
To(Equal(DefaultPhysicalBatch))
|
||||||
|
})
|
||||||
|
It("still raises the batch on a large unified-memory device (GB10)", func() {
|
||||||
|
// GB10 reports system RAM (~119 GiB) as its single device's VRAM.
|
||||||
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)).
|
||||||
|
To(Equal(BlackwellPhysicalBatch))
|
||||||
|
})
|
||||||
|
It("stays conservative when VRAM is unknown", func() {
|
||||||
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)).
|
||||||
|
To(Equal(DefaultPhysicalBatch))
|
||||||
|
})
|
||||||
|
It("never raises the batch on non-Blackwell", func() {
|
||||||
|
Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)).
|
||||||
|
To(Equal(DefaultPhysicalBatch))
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
Describe("ApplyHardwareDefaults", func() {
|
Describe("ApplyHardwareDefaults", func() {
|
||||||
It("raises an unset batch to 2048 on Blackwell", func() {
|
It("raises an unset batch to 2048 on Blackwell with headroom", func() {
|
||||||
cfg := &ModelConfig{}
|
cfg := &ModelConfig{}
|
||||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||||
})
|
})
|
||||||
|
It("leaves batch unset when a large context would overflow one device", func() {
|
||||||
|
// Regression guard for issue #10485: 16 GiB card + ~200k context.
|
||||||
|
ctx := 204800
|
||||||
|
cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
|
||||||
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
|
||||||
|
Expect(cfg.Batch).To(Equal(0))
|
||||||
|
})
|
||||||
It("leaves batch unset on non-Blackwell", func() {
|
It("leaves batch unset on non-Blackwell", func() {
|
||||||
cfg := &ModelConfig{}
|
cfg := &ModelConfig{}
|
||||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib})
|
||||||
Expect(cfg.Batch).To(Equal(0))
|
Expect(cfg.Batch).To(Equal(0))
|
||||||
})
|
})
|
||||||
It("never overrides an explicit batch", func() {
|
It("never overrides an explicit batch", func() {
|
||||||
cfg := &ModelConfig{}
|
cfg := &ModelConfig{}
|
||||||
cfg.Batch = 1024
|
cfg.Batch = 1024
|
||||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||||
Expect(cfg.Batch).To(Equal(1024))
|
Expect(cfg.Batch).To(Equal(1024))
|
||||||
})
|
})
|
||||||
It("no-ops on nil", func() {
|
It("no-ops on nil", func() {
|
||||||
@@ -57,8 +92,6 @@ var _ = Describe("Hardware-driven config defaults", func() {
|
|||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
const gib = uint64(1) << 30
|
|
||||||
|
|
||||||
DescribeTable("DefaultParallelSlots (by VRAM)",
|
DescribeTable("DefaultParallelSlots (by VRAM)",
|
||||||
func(vramGiB uint64, want int) {
|
func(vramGiB uint64, want int) {
|
||||||
Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
|
Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
|
||||||
|
|||||||
@@ -1204,11 +1204,6 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
|||||||
// This ensures gallery-installed and runtime-loaded models get optimal parameters.
|
// This ensures gallery-installed and runtime-loaded models get optimal parameters.
|
||||||
ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
|
ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
|
||||||
|
|
||||||
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
|
|
||||||
// Uses the local GPU here; in distributed mode the router re-applies the same
|
|
||||||
// heuristics for the selected node's GPU before loading. Explicit config wins.
|
|
||||||
ApplyHardwareDefaults(cfg, localGPU())
|
|
||||||
|
|
||||||
// Apply serving-policy defaults (device-independent): cross-request prefix
|
// Apply serving-policy defaults (device-independent): cross-request prefix
|
||||||
// caching. Propagates to distributed nodes via the model options.
|
// caching. Propagates to distributed nodes via the model options.
|
||||||
ApplyServingDefaults(cfg)
|
ApplyServingDefaults(cfg)
|
||||||
@@ -1247,6 +1242,16 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
|||||||
cfg.ContextSize = &ctx
|
cfg.ContextSize = &ctx
|
||||||
}
|
}
|
||||||
runBackendHooks(cfg, lo.modelPath)
|
runBackendHooks(cfg, lo.modelPath)
|
||||||
|
|
||||||
|
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell)
|
||||||
|
// LAST, after the context size is fully resolved (explicit config, LoadOptions,
|
||||||
|
// then the GGUF guess inside runBackendHooks): the Blackwell batch guard sizes
|
||||||
|
// the per-device compute buffer against this model's context, so it must see
|
||||||
|
// the final value, not a pre-guess nil. Uses the local GPU here; in distributed
|
||||||
|
// mode the router re-applies the same heuristics for the selected node's GPU
|
||||||
|
// before loading. Explicit config always wins.
|
||||||
|
ApplyHardwareDefaults(cfg, localGPU())
|
||||||
|
|
||||||
cfg.syncKnownUsecasesFromString()
|
cfg.syncKnownUsecasesFromString()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -149,18 +149,6 @@ func API(application *application.Application) (*echo.Echo, error) {
|
|||||||
// Middleware - StripPathPrefix must be registered early as it uses Rewrite which runs before routing
|
// Middleware - StripPathPrefix must be registered early as it uses Rewrite which runs before routing
|
||||||
e.Pre(httpMiddleware.StripPathPrefix())
|
e.Pre(httpMiddleware.StripPathPrefix())
|
||||||
|
|
||||||
// Stamp the configured external base URL into each request context so
|
|
||||||
// middleware.BaseURL can treat it as authoritative for self-referential
|
|
||||||
// links. Registered as Pre so it runs before routing and handlers.
|
|
||||||
if extBaseURL := application.ApplicationConfig().ExternalBaseURL; extBaseURL != "" {
|
|
||||||
e.Pre(func(next echo.HandlerFunc) echo.HandlerFunc {
|
|
||||||
return func(c echo.Context) error {
|
|
||||||
c.Set("_external_base_url", extBaseURL)
|
|
||||||
return next(c)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
e.Pre(middleware.RemoveTrailingSlash())
|
e.Pre(middleware.RemoveTrailingSlash())
|
||||||
|
|
||||||
if application.ApplicationConfig().MachineTag != "" {
|
if application.ApplicationConfig().MachineTag != "" {
|
||||||
|
|||||||
@@ -55,70 +55,17 @@ func BasePathPrefix(c echo.Context) string {
|
|||||||
// The returned URL is guaranteed to end with `/`.
|
// The returned URL is guaranteed to end with `/`.
|
||||||
// The method should be used in conjunction with the StripPathPrefix middleware.
|
// The method should be used in conjunction with the StripPathPrefix middleware.
|
||||||
func BaseURL(c echo.Context) string {
|
func BaseURL(c echo.Context) string {
|
||||||
// An explicit external base URL (LOCALAI_BASE_URL) is authoritative for
|
|
||||||
// the origin. The proxy-derived path prefix is still appended so a
|
|
||||||
// reverse-proxy mount point keeps working. Trailing slashes are
|
|
||||||
// normalized via BasePathPrefix, which always starts and ends with "/".
|
|
||||||
if ext, ok := c.Get("_external_base_url").(string); ok && ext != "" {
|
|
||||||
return strings.TrimRight(ext, "/") + BasePathPrefix(c)
|
|
||||||
}
|
|
||||||
|
|
||||||
fwdProto, fwdHost := parseForwarded(c.Request().Header.Get("Forwarded"))
|
|
||||||
|
|
||||||
scheme := "http"
|
scheme := "http"
|
||||||
switch {
|
if c.Request().Header.Get("X-Forwarded-Proto") == "https" {
|
||||||
case c.Request().TLS != nil:
|
|
||||||
scheme = "https"
|
scheme = "https"
|
||||||
case strings.EqualFold(firstToken(c.Request().Header.Get("X-Forwarded-Proto")), "https"):
|
} else if c.Request().TLS != nil {
|
||||||
scheme = "https"
|
|
||||||
case strings.EqualFold(fwdProto, "https"):
|
|
||||||
scheme = "https"
|
scheme = "https"
|
||||||
}
|
}
|
||||||
|
|
||||||
host := c.Request().Host
|
host := c.Request().Host
|
||||||
if forwardedHost := c.Request().Header.Get("X-Forwarded-Host"); forwardedHost != "" {
|
if forwardedHost := c.Request().Header.Get("X-Forwarded-Host"); forwardedHost != "" {
|
||||||
host = forwardedHost
|
host = forwardedHost
|
||||||
} else if fwdHost != "" {
|
|
||||||
host = fwdHost
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return scheme + "://" + host + BasePathPrefix(c)
|
return scheme + "://" + host + BasePathPrefix(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
// firstToken returns the first comma-separated token of v, trimmed of spaces.
|
|
||||||
// Reverse-proxy chains can emit X-Forwarded-Proto as "https,http"; only the
|
|
||||||
// first hop (closest to the client) is meaningful for scheme detection.
|
|
||||||
func firstToken(v string) string {
|
|
||||||
if i := strings.IndexByte(v, ','); i >= 0 {
|
|
||||||
v = v[:i]
|
|
||||||
}
|
|
||||||
return strings.TrimSpace(v)
|
|
||||||
}
|
|
||||||
|
|
||||||
// parseForwarded extracts the proto and host directives from the first element
|
|
||||||
// of an RFC 7239 Forwarded header (e.g. `for=x;proto=https;host=h, for=y`).
|
|
||||||
// Values may be quoted. Returns empty strings when absent or malformed so the
|
|
||||||
// caller can fall through to other signals.
|
|
||||||
func parseForwarded(header string) (proto, host string) {
|
|
||||||
if header == "" {
|
|
||||||
return "", ""
|
|
||||||
}
|
|
||||||
// Only the first element (closest proxy to the client) matters here.
|
|
||||||
if i := strings.IndexByte(header, ','); i >= 0 {
|
|
||||||
header = header[:i]
|
|
||||||
}
|
|
||||||
for _, directive := range strings.Split(header, ";") {
|
|
||||||
key, value, ok := strings.Cut(strings.TrimSpace(directive), "=")
|
|
||||||
if !ok {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
value = strings.Trim(strings.TrimSpace(value), `"`)
|
|
||||||
switch strings.ToLower(strings.TrimSpace(key)) {
|
|
||||||
case "proto":
|
|
||||||
proto = value
|
|
||||||
case "host":
|
|
||||||
host = value
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return proto, host
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -135,138 +135,4 @@ var _ = Describe("BaseURL", func() {
|
|||||||
Entry("missing leading slash", "evil"),
|
Entry("missing leading slash", "evil"),
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|
||||||
Context("scheme detection hardening", func() {
|
|
||||||
It("treats comma-separated X-Forwarded-Proto as https when first token is https", func() {
|
|
||||||
app := echo.New()
|
|
||||||
actualURL := ""
|
|
||||||
app.GET("/x", func(c echo.Context) error {
|
|
||||||
actualURL = BaseURL(c)
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
req := httptest.NewRequest("GET", "/x", nil)
|
|
||||||
req.Header.Set("X-Forwarded-Proto", "https,http")
|
|
||||||
rec := httptest.NewRecorder()
|
|
||||||
app.ServeHTTP(rec, req)
|
|
||||||
Expect(actualURL).To(Equal("https://example.com/"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("derives https from the RFC 7239 Forwarded proto directive", func() {
|
|
||||||
app := echo.New()
|
|
||||||
actualURL := ""
|
|
||||||
app.GET("/x", func(c echo.Context) error {
|
|
||||||
actualURL = BaseURL(c)
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
req := httptest.NewRequest("GET", "/x", nil)
|
|
||||||
req.Header.Set("Forwarded", "for=192.0.2.1;proto=https;host=proxy.example")
|
|
||||||
rec := httptest.NewRecorder()
|
|
||||||
app.ServeHTTP(rec, req)
|
|
||||||
Expect(actualURL).To(Equal("https://proxy.example/"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("prefers X-Forwarded-Host over the Forwarded host directive", func() {
|
|
||||||
app := echo.New()
|
|
||||||
actualURL := ""
|
|
||||||
app.GET("/x", func(c echo.Context) error {
|
|
||||||
actualURL = BaseURL(c)
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
req := httptest.NewRequest("GET", "/x", nil)
|
|
||||||
req.Header.Set("X-Forwarded-Host", "xfh.example")
|
|
||||||
req.Header.Set("Forwarded", "host=fwd.example;proto=https")
|
|
||||||
rec := httptest.NewRecorder()
|
|
||||||
app.ServeHTTP(rec, req)
|
|
||||||
Expect(actualURL).To(Equal("https://xfh.example/"))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Context("explicit external base URL override", func() {
|
|
||||||
It("uses the configured origin over conflicting forwarded headers", func() {
|
|
||||||
app := echo.New()
|
|
||||||
actualURL := ""
|
|
||||||
app.GET("/x", func(c echo.Context) error {
|
|
||||||
c.Set("_external_base_url", "https://192.168.0.13:34567")
|
|
||||||
actualURL = BaseURL(c)
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
req := httptest.NewRequest("GET", "/x", nil)
|
|
||||||
req.Header.Set("X-Forwarded-Proto", "http")
|
|
||||||
req.Header.Set("X-Forwarded-Host", "internal:8080")
|
|
||||||
rec := httptest.NewRecorder()
|
|
||||||
app.ServeHTTP(rec, req)
|
|
||||||
Expect(actualURL).To(Equal("https://192.168.0.13:34567/"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("combines the configured origin with a detected path prefix", func() {
|
|
||||||
app := echo.New()
|
|
||||||
actualURL := ""
|
|
||||||
app.GET("/hello", func(c echo.Context) error {
|
|
||||||
c.Set("_original_path", "/localai/hello")
|
|
||||||
c.Set("_external_base_url", "https://ext.example")
|
|
||||||
actualURL = BaseURL(c)
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
req := httptest.NewRequest("GET", "/hello", nil)
|
|
||||||
rec := httptest.NewRecorder()
|
|
||||||
app.ServeHTTP(rec, req)
|
|
||||||
Expect(actualURL).To(Equal("https://ext.example/localai/"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("ignores an empty override", func() {
|
|
||||||
app := echo.New()
|
|
||||||
actualURL := ""
|
|
||||||
app.GET("/x", func(c echo.Context) error {
|
|
||||||
c.Set("_external_base_url", "")
|
|
||||||
actualURL = BaseURL(c)
|
|
||||||
return nil
|
|
||||||
})
|
|
||||||
req := httptest.NewRequest("GET", "/x", nil)
|
|
||||||
rec := httptest.NewRecorder()
|
|
||||||
app.ServeHTTP(rec, req)
|
|
||||||
Expect(actualURL).To(Equal("http://example.com/"))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Context("parseForwarded helper", func() {
|
|
||||||
It("parses unquoted proto and host", func() {
|
|
||||||
proto, host := parseForwarded("for=192.0.2.1;proto=https;host=h.example")
|
|
||||||
Expect(proto).To(Equal("https"))
|
|
||||||
Expect(host).To(Equal("h.example"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("strips quotes around values", func() {
|
|
||||||
proto, host := parseForwarded(`proto="https";host="h.example"`)
|
|
||||||
Expect(proto).To(Equal("https"))
|
|
||||||
Expect(host).To(Equal("h.example"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("uses only the first element of a multi-element header", func() {
|
|
||||||
proto, host := parseForwarded("proto=https;host=first.example, proto=http;host=second.example")
|
|
||||||
Expect(proto).To(Equal("https"))
|
|
||||||
Expect(host).To(Equal("first.example"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("returns empty strings for an empty header", func() {
|
|
||||||
proto, host := parseForwarded("")
|
|
||||||
Expect(proto).To(BeEmpty())
|
|
||||||
Expect(host).To(BeEmpty())
|
|
||||||
})
|
|
||||||
|
|
||||||
It("skips directives without a value", func() {
|
|
||||||
proto, host := parseForwarded("proto;host=h.example")
|
|
||||||
Expect(proto).To(BeEmpty())
|
|
||||||
Expect(host).To(Equal("h.example"))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
Context("firstToken helper", func() {
|
|
||||||
It("returns the whole trimmed string when there is no comma", func() {
|
|
||||||
Expect(firstToken(" https ")).To(Equal("https"))
|
|
||||||
})
|
|
||||||
|
|
||||||
It("returns the first trimmed token when there is a comma", func() {
|
|
||||||
Expect(firstToken("https , http")).To(Equal("https"))
|
|
||||||
})
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -82,6 +82,7 @@
|
|||||||
"tier": {
|
"tier": {
|
||||||
"cpu": "CPU-only",
|
"cpu": "CPU-only",
|
||||||
"gpu-small": "GPU",
|
"gpu-small": "GPU",
|
||||||
|
"gpu-mid": "GPU",
|
||||||
"gpu-large": "GPU"
|
"gpu-large": "GPU"
|
||||||
},
|
},
|
||||||
"cpuNote": "No GPU detected — these small models stay responsive on CPU.",
|
"cpuNote": "No GPU detected — these small models stay responsive on CPU.",
|
||||||
|
|||||||
@@ -2,6 +2,16 @@
|
|||||||
"title": "Install Models",
|
"title": "Install Models",
|
||||||
"subtitle": "Browse and install AI models from the gallery",
|
"subtitle": "Browse and install AI models from the gallery",
|
||||||
"models": "Models",
|
"models": "Models",
|
||||||
|
"recommended": {
|
||||||
|
"title": "Recommended for your hardware",
|
||||||
|
"cpuNote": "No GPU detected - small models that stay responsive on CPU.",
|
||||||
|
"gpuNote": "Sized to fit your available VRAM with room for context.",
|
||||||
|
"install": "Install",
|
||||||
|
"installing": "Installing",
|
||||||
|
"installStarted": "Installing {{model}}…",
|
||||||
|
"installFailed": "Install failed: {{message}}",
|
||||||
|
"dismiss": "Dismiss recommendations"
|
||||||
|
},
|
||||||
"stats": {
|
"stats": {
|
||||||
"available": "Available",
|
"available": "Available",
|
||||||
"installed": "Installed"
|
"installed": "Installed"
|
||||||
|
|||||||
@@ -6409,6 +6409,9 @@ select.input {
|
|||||||
font-size: 0.875rem;
|
font-size: 0.875rem;
|
||||||
word-break: break-all;
|
word-break: break-all;
|
||||||
}
|
}
|
||||||
|
.home-starters-badge {
|
||||||
|
font-size: 0.625rem;
|
||||||
|
}
|
||||||
.home-starters-size {
|
.home-starters-size {
|
||||||
margin-left: auto;
|
margin-left: auto;
|
||||||
font-size: 0.75rem;
|
font-size: 0.75rem;
|
||||||
@@ -6416,6 +6419,74 @@ select.input {
|
|||||||
white-space: nowrap;
|
white-space: nowrap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ──────────────────── Models gallery: recommended-for-your-hardware strip ──────────────────── */
|
||||||
|
|
||||||
|
.rec-models {
|
||||||
|
margin-bottom: var(--spacing-md);
|
||||||
|
padding: var(--spacing-md) var(--spacing-lg);
|
||||||
|
}
|
||||||
|
.rec-models-head {
|
||||||
|
display: flex;
|
||||||
|
align-items: flex-start;
|
||||||
|
justify-content: space-between;
|
||||||
|
gap: var(--spacing-md);
|
||||||
|
}
|
||||||
|
.rec-models-title {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: var(--spacing-sm);
|
||||||
|
flex-wrap: wrap;
|
||||||
|
}
|
||||||
|
.rec-models-title i {
|
||||||
|
color: var(--color-primary);
|
||||||
|
}
|
||||||
|
.rec-models-note {
|
||||||
|
font-size: 0.8125rem;
|
||||||
|
color: var(--color-text-secondary);
|
||||||
|
}
|
||||||
|
.rec-models-dismiss {
|
||||||
|
background: none;
|
||||||
|
border: none;
|
||||||
|
color: var(--color-text-muted);
|
||||||
|
cursor: pointer;
|
||||||
|
padding: 4px;
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.rec-models-dismiss:hover {
|
||||||
|
color: var(--color-text-primary);
|
||||||
|
}
|
||||||
|
.rec-models-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
|
||||||
|
gap: var(--spacing-sm);
|
||||||
|
margin-top: var(--spacing-md);
|
||||||
|
}
|
||||||
|
.rec-models-item {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: var(--spacing-xs);
|
||||||
|
padding: var(--spacing-sm) var(--spacing-md);
|
||||||
|
border: 1px solid var(--color-border-subtle);
|
||||||
|
border-radius: var(--radius-md);
|
||||||
|
background: var(--color-bg-primary);
|
||||||
|
}
|
||||||
|
.rec-models-item-name {
|
||||||
|
font-weight: 500;
|
||||||
|
font-size: 0.8125rem;
|
||||||
|
word-break: break-all;
|
||||||
|
}
|
||||||
|
.rec-models-item-meta {
|
||||||
|
display: flex;
|
||||||
|
gap: var(--spacing-sm);
|
||||||
|
font-size: 0.75rem;
|
||||||
|
color: var(--color-text-muted);
|
||||||
|
}
|
||||||
|
.rec-models-item-fit {
|
||||||
|
display: inline-flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 4px;
|
||||||
|
}
|
||||||
|
|
||||||
/* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */
|
/* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */
|
||||||
|
|
||||||
.home-connect {
|
.home-connect {
|
||||||
|
|||||||
86
core/http/react-ui/src/components/RecommendedModels.jsx
Normal file
86
core/http/react-ui/src/components/RecommendedModels.jsx
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
import { useState } from 'react'
|
||||||
|
import { useTranslation } from 'react-i18next'
|
||||||
|
import { modelsApi } from '../utils/api'
|
||||||
|
import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
|
||||||
|
|
||||||
|
const DISMISS_KEY = 'localai_rec_models_dismissed'
|
||||||
|
|
||||||
|
// "Recommended for your hardware" strip at the top of the Models gallery. Shares
|
||||||
|
// the hardware-fit ranking with the empty-state starter widget via
|
||||||
|
// useRecommendedModels, but styled for the gallery page and dismissible (the
|
||||||
|
// gallery is a repeat-visit surface, so it shouldn't nag).
|
||||||
|
export default function RecommendedModels({ addToast }) {
|
||||||
|
const { t } = useTranslation('models')
|
||||||
|
const { recommended, tier, loading } = useRecommendedModels({ count: 4 })
|
||||||
|
const [installing, setInstalling] = useState(() => new Set())
|
||||||
|
const [dismissed, setDismissed] = useState(() => {
|
||||||
|
try { return localStorage.getItem(DISMISS_KEY) === '1' } catch { return false }
|
||||||
|
})
|
||||||
|
|
||||||
|
if (loading || dismissed) return null
|
||||||
|
if (!recommended || recommended.length === 0) return null
|
||||||
|
|
||||||
|
const dismiss = () => {
|
||||||
|
try { localStorage.setItem(DISMISS_KEY, '1') } catch { /* ignore */ }
|
||||||
|
setDismissed(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
const install = async (name) => {
|
||||||
|
setInstalling(prev => new Set(prev).add(name))
|
||||||
|
try {
|
||||||
|
await modelsApi.install(name)
|
||||||
|
addToast?.(t('recommended.installStarted', { model: name }), 'success')
|
||||||
|
} catch (err) {
|
||||||
|
addToast?.(t('recommended.installFailed', { message: err.message }), 'error')
|
||||||
|
setInstalling(prev => {
|
||||||
|
const next = new Set(prev)
|
||||||
|
next.delete(name)
|
||||||
|
return next
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const isGpu = tier.id !== 'cpu'
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="rec-models card">
|
||||||
|
<div className="rec-models-head">
|
||||||
|
<div className="rec-models-title">
|
||||||
|
<i className={`fas ${isGpu ? 'fa-microchip' : 'fa-memory'}`} aria-hidden="true" />
|
||||||
|
<strong>{t('recommended.title')}</strong>
|
||||||
|
<span className="rec-models-note">{isGpu ? t('recommended.gpuNote') : t('recommended.cpuNote')}</span>
|
||||||
|
</div>
|
||||||
|
<button type="button" className="rec-models-dismiss" onClick={dismiss} aria-label={t('recommended.dismiss')} title={t('recommended.dismiss')}>
|
||||||
|
<i className="fas fa-times" aria-hidden="true" />
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div className="rec-models-grid">
|
||||||
|
{recommended.map(m => {
|
||||||
|
const busy = installing.has(m.name)
|
||||||
|
return (
|
||||||
|
<div key={m.name} className="rec-models-item">
|
||||||
|
<div className="rec-models-item-name">{m.name}</div>
|
||||||
|
<div className="rec-models-item-meta">
|
||||||
|
{isNvfp4Name(m.name) && <span className="badge badge-info">NVFP4</span>}
|
||||||
|
{m.sizeDisplay && <span>{m.sizeDisplay}</span>}
|
||||||
|
{isGpu && m.vramDisplay && (
|
||||||
|
<span className="rec-models-item-fit"><i className="fas fa-microchip" aria-hidden="true" /> {m.vramDisplay}</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
className="btn btn-primary btn-sm"
|
||||||
|
disabled={busy}
|
||||||
|
onClick={() => install(m.name)}
|
||||||
|
>
|
||||||
|
{busy
|
||||||
|
? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('recommended.installing')}</>)
|
||||||
|
: (<><i className="fas fa-download" aria-hidden="true" /> {t('recommended.install')}</>)}
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
})}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
@@ -1,79 +1,78 @@
|
|||||||
import { useState, useEffect, useMemo } from 'react'
|
import { useState } from 'react'
|
||||||
import { useTranslation } from 'react-i18next'
|
import { useTranslation } from 'react-i18next'
|
||||||
import { modelsApi } from '../utils/api'
|
import { modelsApi } from '../utils/api'
|
||||||
import { useResources } from '../hooks/useResources'
|
import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
|
||||||
|
|
||||||
// Curated, hardware-tiered starter models for the empty-state onboarding. Names
|
// Static fallback used only when the live gallery / estimates can't be reached
|
||||||
// are real gallery entries (gallery/index.yaml); we intersect them against the
|
// (offline, trimmed gallery). The hook is the primary, data-driven path; these
|
||||||
// live gallery at render time so a custom/trimmed gallery degrades gracefully
|
// are real gallery names kept as a safety net so onboarding never shows nothing.
|
||||||
// (unmatched entries simply don't render).
|
// Gemma picks use the QAT (quantization-aware-trained) Q4 builds. NVIDIA boxes
|
||||||
//
|
// get NVFP4 + MTP variants at the mid/large tiers (see NVIDIA below).
|
||||||
// The guiding rule the maintainer asked for: CPU-only machines should be
|
const BASE = {
|
||||||
// steered to genuinely small models (1-4B, Q4) that stay responsive without a
|
cpu: [
|
||||||
// GPU. GPU tiers scale the suggestion up with available VRAM.
|
{ name: 'gemma-4-e2b-it-qat-q4_0', size: '~1.5 GB' },
|
||||||
const SMALL = [
|
{ name: 'qwen3.5-4b-claude-4.6-opus-reasoning-distilled', size: '~2.5 GB' },
|
||||||
{ name: 'llama-3.2-1b-instruct:q4_k_m', size: '~0.8 GB' },
|
{ name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
|
||||||
{ name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
|
{ name: 'lfm2.5-1.2b-instruct', size: '~0.8 GB' },
|
||||||
{ name: 'qwen3-1.7b', size: '~1.4 GB' },
|
],
|
||||||
{ name: 'gemma-3-1b-it', size: '~0.8 GB' },
|
'gpu-small': [
|
||||||
]
|
{ name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
|
||||||
const MID = [
|
{ name: 'lfm2.5-8b-a1b', size: '~5 GB' },
|
||||||
{ name: 'qwen3-4b', size: '~2.5 GB' },
|
{ name: 'qwen3.5-9b', size: '~5.5 GB' },
|
||||||
{ name: 'gemma-3-4b-it', size: '~3 GB' },
|
{ name: 'gemma-4-12b-it-qat-q4_0', size: '~7 GB' },
|
||||||
{ name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
|
],
|
||||||
]
|
'gpu-mid': [
|
||||||
const LARGE = [
|
{ name: 'qwen3.6-27b', size: '~16 GB' },
|
||||||
{ name: 'meta-llama-3.1-8b-instruct', size: '~5 GB' },
|
{ name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
|
||||||
{ name: 'qwen3-4b', size: '~2.5 GB' },
|
{ name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
|
||||||
{ name: 'mistral-7b-instruct-v0.3', size: '~4 GB' },
|
{ name: 'qwen3.5-27b', size: '~16 GB' },
|
||||||
]
|
],
|
||||||
|
'gpu-large': [
|
||||||
|
{ name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
|
||||||
|
{ name: 'qwen3.6-35b-a3b-claude-4.6-opus-reasoning-distilled', size: '~20 GB' },
|
||||||
|
{ name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
|
||||||
|
{ name: 'qwen3.5-35b-a3b-apex', size: '~20 GB' },
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
const GB = 1024 * 1024 * 1024
|
// NVIDIA-only overrides: NVFP4 is a Blackwell-optimised 4-bit format paired with
|
||||||
|
// MTP (multi-token prediction) for speed. Only the mid/large tiers have these.
|
||||||
|
const NVIDIA = {
|
||||||
|
'gpu-mid': [
|
||||||
|
{ name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
|
||||||
|
{ name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
|
||||||
|
{ name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
|
||||||
|
{ name: 'qwen3.6-27b', size: '~16 GB' },
|
||||||
|
],
|
||||||
|
'gpu-large': [
|
||||||
|
{ name: 'qwen3.6-35b-a3b-nvfp4-mtp', size: '~18 GB' },
|
||||||
|
{ name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
|
||||||
|
{ name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
|
||||||
|
{ name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
// Pick a tier from detected hardware. total_memory is GPU VRAM in bytes (0 when
|
function fallbackFor(tierId, isNvidia) {
|
||||||
// CPU-only). Thresholds are deliberately conservative so a suggestion that
|
if (isNvidia && NVIDIA[tierId]) return NVIDIA[tierId]
|
||||||
// "fits" really does.
|
return BASE[tierId] || BASE.cpu
|
||||||
function pickTier(resources) {
|
|
||||||
const isGpu = resources?.type === 'gpu'
|
|
||||||
const vram = resources?.aggregate?.total_memory || 0
|
|
||||||
if (!isGpu || vram <= 0) return { id: 'cpu', list: SMALL }
|
|
||||||
if (vram < 8 * GB) return { id: 'gpu-small', list: MID }
|
|
||||||
return { id: 'gpu-large', list: LARGE }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export default function StarterModels({ addToast, onInstallStarted }) {
|
export default function StarterModels({ addToast, onInstallStarted }) {
|
||||||
const { t } = useTranslation('home')
|
const { t } = useTranslation('home')
|
||||||
const { resources } = useResources()
|
const { recommended, tier, isNvidia, loading } = useRecommendedModels({ count: 4 })
|
||||||
const [available, setAvailable] = useState(null) // Set of gallery names, or null while loading
|
|
||||||
const [installing, setInstalling] = useState(() => new Set())
|
const [installing, setInstalling] = useState(() => new Set())
|
||||||
|
|
||||||
const tier = useMemo(() => pickTier(resources), [resources])
|
// While the hardware probe + gallery query are in flight, render nothing
|
||||||
const candidates = tier.list
|
// rather than flashing fallback content that may be replaced a moment later.
|
||||||
|
if (loading) return null
|
||||||
|
|
||||||
// Verify candidates exist in the live gallery. One search per name (the tier
|
// Prefer live recommendations; fall back to the static list only when the
|
||||||
// has at most a handful) keeps this resilient to gallery customization.
|
// gallery yielded nothing.
|
||||||
useEffect(() => {
|
const items = (recommended && recommended.length > 0)
|
||||||
let cancelled = false
|
? recommended.map(r => ({ name: r.name, size: r.sizeDisplay }))
|
||||||
const names = [...new Set(candidates.map(c => c.name))]
|
: fallbackFor(tier.id, isNvidia)
|
||||||
Promise.all(names.map(name =>
|
|
||||||
modelsApi.list({ search: name, page: 1 })
|
|
||||||
.then(data => (data?.models || []).some(m => (m.name || m.id) === name) ? name : null)
|
|
||||||
.catch(() => null)
|
|
||||||
)).then(found => {
|
|
||||||
if (cancelled) return
|
|
||||||
const hits = found.filter(Boolean)
|
|
||||||
// If verification yielded nothing (e.g. gallery unreachable), fall back to
|
|
||||||
// showing the curated list rather than an empty widget.
|
|
||||||
setAvailable(hits.length > 0 ? new Set(hits) : null)
|
|
||||||
})
|
|
||||||
return () => { cancelled = true }
|
|
||||||
}, [candidates])
|
|
||||||
|
|
||||||
const visible = available === null
|
if (items.length === 0) return null
|
||||||
? candidates
|
|
||||||
: candidates.filter(c => available.has(c.name))
|
|
||||||
|
|
||||||
if (visible.length === 0) return null
|
|
||||||
|
|
||||||
const install = async (name) => {
|
const install = async (name) => {
|
||||||
setInstalling(prev => new Set(prev).add(name))
|
setInstalling(prev => new Set(prev).add(name))
|
||||||
@@ -104,12 +103,13 @@ export default function StarterModels({ addToast, onInstallStarted }) {
|
|||||||
{tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
|
{tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
|
||||||
</p>
|
</p>
|
||||||
<ul className="home-starters-list">
|
<ul className="home-starters-list">
|
||||||
{visible.map(c => {
|
{items.map(c => {
|
||||||
const busy = installing.has(c.name)
|
const busy = installing.has(c.name)
|
||||||
return (
|
return (
|
||||||
<li key={c.name} className="home-starters-item">
|
<li key={c.name} className="home-starters-item">
|
||||||
<span className="home-starters-name">{c.name}</span>
|
<span className="home-starters-name">{c.name}</span>
|
||||||
<span className="home-starters-size">{c.size}</span>
|
{isNvfp4Name(c.name) && <span className="badge badge-info home-starters-badge">NVFP4</span>}
|
||||||
|
{c.size && <span className="home-starters-size">{c.size}</span>}
|
||||||
<button
|
<button
|
||||||
type="button"
|
type="button"
|
||||||
className="btn btn-primary btn-sm"
|
className="btn btn-primary btn-sm"
|
||||||
|
|||||||
108
core/http/react-ui/src/hooks/useRecommendedModels.js
vendored
Normal file
108
core/http/react-ui/src/hooks/useRecommendedModels.js
vendored
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
import { useState, useEffect } from 'react'
|
||||||
|
import { modelsApi } from '../utils/api'
|
||||||
|
import { useResources } from './useResources'
|
||||||
|
|
||||||
|
// Data-driven "recommended for your hardware" model picks. The gallery exposes
|
||||||
|
// no popularity/download signal and the list response carries no size, so we:
|
||||||
|
// 1. ask the server for chat-capable models in their natural (curated) order,
|
||||||
|
// 2. estimate size/VRAM for the top candidates (same endpoint the Models page
|
||||||
|
// uses), and
|
||||||
|
// 3. rank by hardware fit — smallest on CPU-only boxes, largest-that-fits on
|
||||||
|
// GPUs (bigger == better quality while still fitting VRAM).
|
||||||
|
//
|
||||||
|
// Returns `recommended === null` while loading, `[]` when nothing could be
|
||||||
|
// resolved (gallery/estimates unavailable) so callers can fall back.
|
||||||
|
|
||||||
|
const GB = 1024 * 1024 * 1024
|
||||||
|
const DEFAULT_CTX = 4096
|
||||||
|
|
||||||
|
// True when a model name contains "nvfp4" (any letter case). NVFP4 is a
// Blackwell/NVIDIA-specific 4-bit format, so callers use this to suggest such
// builds only on NVIDIA hardware and to filter them out elsewhere. A missing
// or empty name is treated as the empty string and yields false.
export const isNvfp4Name = (name) => {
  const label = name || ''
  return /nvfp4/i.test(label)
}
|
||||||
|
|
||||||
|
/**
 * Reports whether the resource snapshot lists at least one GPU whose `vendor`
 * equals "nvidia", compared case-insensitively.
 *
 * @param {object|null|undefined} resources Snapshot with an optional `gpus` array.
 * @returns {boolean} false when `gpus` is absent or not an array.
 */
export function hasNvidiaGpu(resources) {
  const gpus = resources?.gpus
  if (!Array.isArray(gpus)) return false
  return gpus.some((gpu) => {
    const vendor = gpu?.vendor || ''
    return vendor.toLowerCase() === 'nvidia'
  })
}
|
||||||
|
|
||||||
|
/**
 * Maps a resource snapshot to a hardware tier.
 *
 * Reads `resources.type` and `resources.aggregate.total_memory` (compared
 * against multiples of GB). Anything that is not a GPU snapshot with a
 * positive memory figure is the 'cpu' tier with vram 0; otherwise the tier is
 * 'gpu-small' below 8 GB, 'gpu-mid' below 24 GB, and 'gpu-large' from there up.
 *
 * @param {object|null|undefined} resources
 * @returns {{ id: string, vram: number }}
 */
export function recommendTier(resources) {
  const vram = resources?.aggregate?.total_memory || 0
  if (resources?.type !== 'gpu' || vram <= 0) {
    return { id: 'cpu', vram: 0 }
  }

  let id = 'gpu-large'
  if (vram < 8 * GB) {
    id = 'gpu-small'
  } else if (vram < 24 * GB) {
    id = 'gpu-mid'
  }
  return { id, vram }
}
|
||||||
|
|
||||||
|
/**
 * Picks up to `count` candidates ordered by hardware fit.
 *
 * Candidates without a size estimate are dropped, as are NVFP4 builds unless
 * `isNvidia` is set. On the 'cpu' tier the smallest models win. On GPU tiers
 * the candidates whose VRAM estimate is within 95% of `tier.vram` are ordered
 * largest first; when none fit, the whole pool is ordered smallest first. On
 * NVIDIA, NVFP4 builds sort ahead of everything else in either GPU ordering.
 *
 * @param {Array<object>} candidates Entries with name, sizeBytes, vramBytes.
 * @param {{ id: string, vram: number }} tier
 * @param {number} count Maximum number of results.
 * @param {boolean} isNvidia
 * @returns {Array<object>} A new array; the input is not reordered.
 */
function rank(candidates, tier, count, isNvidia) {
  const usable = candidates.filter((c) => {
    if (c.sizeBytes == null) return false
    return isNvidia || !isNvfp4Name(c.name)
  })

  const smallestFirst = (a, b) => a.sizeBytes - b.sizeBytes
  const largestFirst = (a, b) => b.sizeBytes - a.sizeBytes

  if (tier.id === 'cpu') {
    return usable.slice().sort(smallestFirst).slice(0, count)
  }

  const ceiling = tier.vram * 0.95
  const fitting = usable.filter(c => c.vramBytes != null && c.vramBytes <= ceiling)
  const anyFit = fitting.length > 0
  // Nothing fits (tiny GPU): fall back to the whole pool, smallest first.
  const source = anyFit ? fitting : usable
  const bySize = anyFit ? largestFirst : smallestFirst

  const ordered = source.slice().sort((a, b) => {
    if (isNvidia) {
      const aFp4 = isNvfp4Name(a.name)
      const bFp4 = isNvfp4Name(b.name)
      if (aFp4 !== bFp4) return aFp4 ? -1 : 1
    }
    return bySize(a, b)
  })
  return ordered.slice(0, count)
}
|
||||||
|
|
||||||
|
/**
 * React hook producing hardware-matched model recommendations.
 *
 * Once useResources() has delivered a non-null snapshot, it lists up to
 * `candidatePool` gallery models tagged 'chat', drops the ones already
 * installed, asks the estimate endpoint for each remaining model at
 * DEFAULT_CTX, and ranks the results for the detected tier.
 *
 * @param {object} [options]
 * @param {number} [options.count=4] Maximum number of recommendations.
 * @param {number} [options.candidatePool=10] Number of gallery entries to request.
 * @returns {{ recommended: Array<object>|null, tier: object, isNvidia: boolean,
 *   error: string|null, loading: boolean }} `recommended` is null while
 *   loading and [] when the gallery request failed.
 */
export function useRecommendedModels({ count = 4, candidatePool = 10 } = {}) {
  const { resources } = useResources()
  const [recommended, setRecommended] = useState(null)
  const [error, setError] = useState(null)

  const resReady = resources !== null
  const tier = recommendTier(resources)
  const isNvidia = hasNvidiaGpu(resources)

  useEffect(() => {
    if (!resReady) return

    // Flipped by the cleanup so a superseded request never writes state.
    let active = true
    setRecommended(null)
    setError(null)

    // Size/VRAM estimate for one gallery entry. A failed estimate yields an
    // entry without a size rather than failing the whole batch.
    const describe = async (model) => {
      const name = model.name || model.id
      try {
        const estimate = await modelsApi.estimate(name, [DEFAULT_CTX])
        const atCtx = estimate?.estimates?.[String(DEFAULT_CTX)]
        return {
          name,
          description: model.description,
          sizeBytes: estimate?.sizeBytes ?? null,
          sizeDisplay: estimate?.sizeDisplay ?? null,
          vramBytes: atCtx?.vramBytes ?? null,
          vramDisplay: atCtx?.vramDisplay ?? null,
        }
      } catch {
        return { name, sizeBytes: null }
      }
    }

    const load = async () => {
      try {
        const data = await modelsApi.list({ tag: 'chat', items: candidatePool, page: 1 })
        // Only recommend models that are not installed yet.
        const notInstalled = (data?.models || []).filter(m => !m.installed)
        const estimated = await Promise.all(notInstalled.map(m => describe(m)))
        if (!active) return
        setRecommended(rank(estimated, tier, count, isNvidia))
      } catch (e) {
        if (!active) return
        setError(e.message)
        setRecommended([])
      }
    }
    load()

    return () => { active = false }
    // The dependencies are primitives (tier.id, tier.vram, isNvidia), so a new
    // resources object with the same values does not re-run the effect.
  }, [resReady, tier.id, tier.vram, isNvidia, count, candidatePool])

  const loading = recommended === null
  return { recommended, tier, isNvidia, error, loading }
}
|
||||||
@@ -13,6 +13,7 @@ import ConfirmDialog from '../components/ConfirmDialog'
|
|||||||
import GalleryLoader from '../components/GalleryLoader'
|
import GalleryLoader from '../components/GalleryLoader'
|
||||||
import Toggle from '../components/Toggle'
|
import Toggle from '../components/Toggle'
|
||||||
import ResponsiveTable from '../components/ResponsiveTable'
|
import ResponsiveTable from '../components/ResponsiveTable'
|
||||||
|
import RecommendedModels from '../components/RecommendedModels'
|
||||||
import React from 'react'
|
import React from 'react'
|
||||||
|
|
||||||
|
|
||||||
@@ -301,6 +302,8 @@ export default function Models() {
|
|||||||
}
|
}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
|
<RecommendedModels addToast={addToast} />
|
||||||
|
|
||||||
{/* Search */}
|
{/* Search */}
|
||||||
<div className="search-bar" style={{ marginBottom: 'var(--spacing-md)' }}>
|
<div className="search-bar" style={{ marginBottom: 'var(--spacing-md)' }}>
|
||||||
<i className="fas fa-search search-icon" />
|
<i className="fas fa-search search-icon" />
|
||||||
|
|||||||
@@ -268,7 +268,7 @@ func RegisterAuthRoutes(e *echo.Echo, app *application.Application) {
|
|||||||
// Set up OAuth manager when any OAuth/OIDC provider is configured
|
// Set up OAuth manager when any OAuth/OIDC provider is configured
|
||||||
if appConfig.Auth.GitHubClientID != "" || appConfig.Auth.OIDCClientID != "" {
|
if appConfig.Auth.GitHubClientID != "" || appConfig.Auth.OIDCClientID != "" {
|
||||||
oauthMgr, err := auth.NewOAuthManager(
|
oauthMgr, err := auth.NewOAuthManager(
|
||||||
appConfig.ExternalBaseURL,
|
appConfig.Auth.BaseURL,
|
||||||
auth.OAuthParams{
|
auth.OAuthParams{
|
||||||
GitHubClientID: appConfig.Auth.GitHubClientID,
|
GitHubClientID: appConfig.Auth.GitHubClientID,
|
||||||
GitHubClientSecret: appConfig.Auth.GitHubClientSecret,
|
GitHubClientSecret: appConfig.Auth.GitHubClientSecret,
|
||||||
|
|||||||
@@ -156,7 +156,10 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
|
|||||||
VRAM: node.TotalVRAM,
|
VRAM: node.TotalVRAM,
|
||||||
}
|
}
|
||||||
if config.IsManagedPhysicalBatch(int(opts.NBatch)) {
|
if config.IsManagedPhysicalBatch(int(opts.NBatch)) {
|
||||||
opts.NBatch = int32(config.PhysicalBatch(gpu))
|
// Gate the raised batch on the selected node's per-device VRAM at this
|
||||||
|
// model's context, so a large context can't overflow the node's compute
|
||||||
|
// buffer (issue #10485). node.TotalVRAM is the node's reported ceiling.
|
||||||
|
opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
|
||||||
}
|
}
|
||||||
// Default concurrent serving for the selected node (the frontend that built
|
// Default concurrent serving for the selected node (the frontend that built
|
||||||
// the options may have no GPU). Only adds when no parallel option is set.
|
// the options may have no GPU). Only adds when no parallel option is set.
|
||||||
|
|||||||
@@ -8,12 +8,19 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var _ = Describe("applyNodeHardwareDefaults", func() {
|
var _ = Describe("applyNodeHardwareDefaults", func() {
|
||||||
It("raises a managed default batch on a Blackwell node", func() {
|
It("raises a managed default batch on a Blackwell node with headroom", func() {
|
||||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
|
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 8192}
|
||||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
|
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
|
||||||
Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
|
Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
|
||||||
})
|
})
|
||||||
|
|
||||||
|
It("keeps the default batch when a large context would overflow the node", func() {
|
||||||
|
// Regression guard for issue #10485 on the distributed path.
|
||||||
|
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
|
||||||
|
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
|
||||||
|
Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
|
||||||
|
})
|
||||||
|
|
||||||
It("resets a Blackwell guess on a non-Blackwell node", func() {
|
It("resets a Blackwell guess on a non-Blackwell node", func() {
|
||||||
// frontend (Blackwell) guessed high, but the selected node is not Blackwell
|
// frontend (Blackwell) guessed high, but the selected node is not Blackwell
|
||||||
opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
|
opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
|
||||||
|
|||||||
@@ -14,26 +14,6 @@ When running LocalAI behind a TLS termination reverse proxy, the Web UI may fail
|
|||||||
|
|
||||||
LocalAI uses the `X-Forwarded-Proto` HTTP header to determine the protocol used by clients. When this header is set to `https`, LocalAI will generate HTTPS URLs for static assets in the Web UI.
|
LocalAI uses the `X-Forwarded-Proto` HTTP header to determine the protocol used by clients. When this header is set to `https`, LocalAI will generate HTTPS URLs for static assets in the Web UI.
|
||||||
|
|
||||||
## Running behind a reverse proxy (HTTPS / subpath)
|
|
||||||
|
|
||||||
LocalAI does not terminate TLS itself, so HTTPS is provided by a reverse
|
|
||||||
proxy in front of it. Self-referential links (generated image and video
|
|
||||||
URLs, async job status URLs, OAuth callbacks) need the externally visible
|
|
||||||
scheme, host and port.
|
|
||||||
|
|
||||||
LocalAI determines these in this order:
|
|
||||||
|
|
||||||
1. `LOCALAI_BASE_URL` - if set, it is authoritative for the origin. Set it to
|
|
||||||
the externally visible base URL, e.g. `LOCALAI_BASE_URL=https://localai.example.com`
|
|
||||||
or `https://192.168.0.13:34567`. Recommended whenever links come back with
|
|
||||||
the wrong scheme or host.
|
|
||||||
2. Otherwise, the `X-Forwarded-Proto` and `X-Forwarded-Host` headers (or the
|
|
||||||
RFC 7239 `Forwarded` header) sent by the proxy. Ensure your proxy forwards
|
|
||||||
`X-Forwarded-Proto: https`.
|
|
||||||
|
|
||||||
A reverse-proxy subpath mount is supported via `X-Forwarded-Prefix`; it is
|
|
||||||
appended to `LOCALAI_BASE_URL` when both are present.
|
|
||||||
|
|
||||||
## Required Headers
|
## Required Headers
|
||||||
|
|
||||||
Your reverse proxy must forward these headers to LocalAI:
|
Your reverse proxy must forward these headers to LocalAI:
|
||||||
|
|||||||
@@ -129,6 +129,61 @@ func TotalAvailableVRAM() (uint64, error) {
|
|||||||
return 0, nil
|
return 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MinPerGPUVRAM returns the total VRAM of the SMALLEST GPU on the host (in
|
||||||
|
// bytes), or 0 when no per-device VRAM is known. Unlike TotalAvailableVRAM
|
||||||
|
// (which sums across devices) this reports a single device's ceiling, which is
|
||||||
|
// the right figure for decisions about what must fit on one card: the compute
|
||||||
|
// buffer (sized by n_ubatch) and the parallel-slot tier. Summing a multi-GPU
|
||||||
|
// host's VRAM over-provisions those into a per-device OOM (issue #10485).
|
||||||
|
//
|
||||||
|
// Unified-memory devices (GB10, Apple) report system RAM as their single
|
||||||
|
// device's VRAM, so they are unaffected.
|
||||||
|
func MinPerGPUVRAM() (uint64, error) {
|
||||||
|
// Prefer per-device binary detection (nvidia-smi/rocm-smi report true
|
||||||
|
// per-card VRAM); ghw's per-card memory can reflect NUMA node RAM on some
|
||||||
|
// hosts, which is why TotalAvailableVRAM treats it as a sum.
|
||||||
|
if infos := GetGPUMemoryUsage(); len(infos) > 0 {
|
||||||
|
if v := minNonZeroVRAM(infos); v > 0 {
|
||||||
|
return v, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: ghw per-card memory, taking the minimum non-zero card.
|
||||||
|
if gpus, err := GPUs(); err == nil {
|
||||||
|
var min uint64
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
if gpu == nil || gpu.Node == nil || gpu.Node.Memory == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if b := gpu.Node.Memory.TotalUsableBytes; b > 0 {
|
||||||
|
if u := uint64(b); min == 0 || u < min {
|
||||||
|
min = u
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if min > 0 {
|
||||||
|
return min, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// minNonZeroVRAM returns the smallest non-zero TotalVRAM across the given GPUs,
|
||||||
|
// or 0 when none report VRAM.
|
||||||
|
func minNonZeroVRAM(infos []GPUMemoryInfo) uint64 {
|
||||||
|
var min uint64
|
||||||
|
for _, g := range infos {
|
||||||
|
if g.TotalVRAM == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if min == 0 || g.TotalVRAM < min {
|
||||||
|
min = g.TotalVRAM
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return min
|
||||||
|
}
|
||||||
|
|
||||||
func HasGPU(vendor string) bool {
|
func HasGPU(vendor string) bool {
|
||||||
gpus, err := GPUs()
|
gpus, err := GPUs()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
37
pkg/xsysinfo/minvram_internal_test.go
Normal file
37
pkg/xsysinfo/minvram_internal_test.go
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
package xsysinfo
|
||||||
|
|
||||||
|
import (
|
||||||
|
. "github.com/onsi/ginkgo/v2"
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _ = Describe("minNonZeroVRAM", func() {
|
||||||
|
const gib = uint64(1) << 30
|
||||||
|
|
||||||
|
It("returns the smallest device on a multi-GPU host", func() {
|
||||||
|
// Two unequal cards (e.g. RTX 5070 Ti + 5060 Ti, both 16 GiB, or a
|
||||||
|
// mixed pair): the smallest device is the per-card allocation ceiling.
|
||||||
|
infos := []GPUMemoryInfo{
|
||||||
|
{TotalVRAM: 16 * gib},
|
||||||
|
{TotalVRAM: 12 * gib},
|
||||||
|
}
|
||||||
|
Expect(minNonZeroVRAM(infos)).To(Equal(12 * gib))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("ignores devices that report zero VRAM", func() {
|
||||||
|
infos := []GPUMemoryInfo{
|
||||||
|
{TotalVRAM: 0},
|
||||||
|
{TotalVRAM: 24 * gib},
|
||||||
|
}
|
||||||
|
Expect(minNonZeroVRAM(infos)).To(Equal(24 * gib))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("returns the single device's VRAM on a one-GPU host", func() {
|
||||||
|
Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 16 * gib}})).To(Equal(16 * gib))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("returns 0 when no device reports VRAM", func() {
|
||||||
|
Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 0}})).To(BeZero())
|
||||||
|
Expect(minNonZeroVRAM(nil)).To(BeZero())
|
||||||
|
})
|
||||||
|
})
|
||||||
Reference in New Issue
Block a user