chore: ⬆️ Update ServeurpersoCom/omnivoice.cpp to 0f37401bebe9b20c0160a888e592108fc1d17607 (#10492)

⬆️ Update ServeurpersoCom/omnivoice.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
chore: ⬆️ Update ikawrakow/ik_llama.cpp to d5507e33ae7ee2b7b41475f08044d3bde3b839ee (#10498)
2026-06-25 00:59:28 -04:00 · 2026-06-25 00:57:58 +02:00 · 2026-06-25 00:57:42 +02:00 · 2026-06-25 00:22:45 +02:00 · 2026-06-25 00:07:48 +02:00
24 changed files with 600 additions and 330 deletions
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@
-IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
+IK_LLAMA_VERSION?=d5507e33ae7ee2b7b41475f08044d3bde3b839ee
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
 CMAKE_ARGS?=
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
 # omnivoice.cpp version
 OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
-OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
+OMNIVOICE_VERSION?=0f37401bebe9b20c0160a888e592108fc1d17607
 SO_TARGET?=libgomnivoicecpp.so
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -140,7 +140,7 @@ type RunCMD struct {
 	OIDCIssuer           string `env:"LOCALAI_OIDC_ISSUER" help:"OIDC issuer URL for auto-discovery" group:"auth"`
 	OIDCClientID         string `env:"LOCALAI_OIDC_CLIENT_ID" help:"OIDC Client ID (auto-enables auth)" group:"auth"`
 	OIDCClientSecret     string `env:"LOCALAI_OIDC_CLIENT_SECRET" help:"OIDC Client Secret" group:"auth"`
-	ExternalBaseURL      string `env:"LOCALAI_BASE_URL" help:"External base URL of this instance (e.g. https://localhost:8080). Used for OAuth callbacks and self-referential links (generated images/videos, job status). When unset, derived from X-Forwarded-Proto/Host or Forwarded headers." group:"api"`
+	AuthBaseURL          string `env:"LOCALAI_BASE_URL" help:"Base URL for OAuth callbacks (e.g. http://localhost:8080)" group:"auth"`
 	AuthAdminEmail       string `env:"LOCALAI_ADMIN_EMAIL" help:"Email address to auto-promote to admin role" group:"auth"`
 	AuthRegistrationMode string `env:"LOCALAI_REGISTRATION_MODE" default:"open" help:"Registration mode: 'open' (default), 'approval', or 'invite' (invite code required)" group:"auth"`
 	DisableLocalAuth     bool   `env:"LOCALAI_DISABLE_LOCAL_AUTH" default:"false" help:"Disable local email/password registration and login (use with OAuth/OIDC-only setups)" group:"auth"`
@@ -503,6 +503,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 			opts = append(opts, config.WithAuthOIDCClientID(r.OIDCClientID))
 			opts = append(opts, config.WithAuthOIDCClientSecret(r.OIDCClientSecret))
 		}
 		if r.AuthBaseURL != "" {
 			opts = append(opts, config.WithAuthBaseURL(r.AuthBaseURL))
 		}
 		if r.AuthAdminEmail != "" {
 			opts = append(opts, config.WithAuthAdminEmail(r.AuthAdminEmail))
 		}
@@ -520,12 +523,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		}
 	}
 	// Applied unconditionally: the external base URL governs all self-referential
 	// links (not just OAuth callbacks), so it must take effect even when auth is off.
 	if r.ExternalBaseURL != "" {
 		opts = append(opts, config.WithExternalBaseURL(r.ExternalBaseURL))
 	}
 	if idleWatchDog || busyWatchDog {
 		opts = append(opts, config.EnableWatchDog)
 		if idleWatchDog {
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -49,13 +49,6 @@ type ApplicationConfig struct {
 	P2PNetworkID                  string
 	Federated                     bool
 	// ExternalBaseURL is the externally visible base URL of this instance
 	// (scheme+host[:port]), set via LOCALAI_BASE_URL. When non-empty it is
 	// authoritative for every self-referential URL LocalAI emits (OAuth
 	// callbacks, generated image/video links, async job StatusURLs),
 	// overriding proxy-header detection. Empty = derive from request headers.
 	ExternalBaseURL string
 	// DisableStats turns off per-request token tracking. By default the
 	// routing module's billing recorder runs in every mode (including
 	// no-auth single-user) so dashboards and `/api/usage` are immediately
@@ -203,6 +196,7 @@ type AuthConfig struct {
 	OIDCIssuer          string // OIDC issuer URL for auto-discovery (e.g. https://accounts.google.com)
 	OIDCClientID        string
 	OIDCClientSecret    string
 	BaseURL             string // for OAuth callback URLs (e.g. "http://localhost:8080")
 	AdminEmail          string // auto-promote to admin on login
 	RegistrationMode    string // "open", "approval" (default when empty), "invite"
 	DisableLocalAuth    bool   // disable local email/password registration and login
@@ -956,9 +950,9 @@ func WithAuthGitHubClientSecret(clientSecret string) AppOption {
 	}
 }
-func WithExternalBaseURL(url string) AppOption {
+func WithAuthBaseURL(baseURL string) AppOption {
 	return func(o *ApplicationConfig) {
-		o.ExternalBaseURL = url
+		o.Auth.BaseURL = baseURL
 	}
 }
--- a/core/config/hardware_defaults.go
+++ b/core/config/hardware_defaults.go
@@ -54,8 +54,35 @@ func (g GPU) IsNVIDIABlackwell() bool {
 	return maj >= 12
 }
 // Compute-buffer headroom guard for the raised physical batch.
 //
 // Raising n_ubatch grows the CUDA *compute buffer* (the scratch for the forward
 // graph), which is allocated PER DEVICE — it does not benefit from a second GPU
 // the way weights or KV (which are split across devices) do. The buffer scales
 // ~linearly with n_ubatch * n_ctx, so a large context turns the GB10-tuned
 // ub2048 into multi-GiB of extra scratch that must fit on a SINGLE card. On a
 // 16 GiB consumer Blackwell with a 200k context that overflows (issue #10485),
 // even though the GB10 it was measured on (128 GiB unified memory) had room.
 //
 // These constants size a conservative guard: only raise the batch when the
 // extra scratch fits the per-device VRAM ceiling.
 const (
 	// computeBufferBytesPerCell approximates the CUDA compute-buffer cost of one
 	// (n_ubatch * n_ctx) cell. Derived from an observed allocation (ub2048 *
 	// ctx204800 ~= 4.5 GiB => ~11 B/cell) and rounded up to 16 for margin, since
 	// the real cost also grows with model width (heads / embedding dim) which we
 	// don't know at config time.
 	computeBufferBytesPerCell = 16
 	// blackwellBatchHeadroomDivisor caps the extra compute buffer from raising the
 	// physical batch at VRAM/divisor. /4 keeps the bulk of a device for weights +
 	// KV, which already dominate VRAM use.
 	blackwellBatchHeadroomDivisor = 4
 )
 // PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
-// given hardware, used when the model config leaves batch unset.
+// given hardware class, ignoring context/VRAM headroom. Use
 // PhysicalBatchForContext when a model context and per-device VRAM are known
 // (the load paths) so the raised batch can't overflow a single device.
 func PhysicalBatch(g GPU) int {
 	if g.IsNVIDIABlackwell() {
 		return BlackwellPhysicalBatch
@@ -63,6 +90,32 @@ func PhysicalBatch(g GPU) int {
 	return DefaultPhysicalBatch
 }
 // PhysicalBatchForContext chooses the physical batch for g when the model's
 // context size is known. It returns BlackwellPhysicalBatch only if g is an
 // NVIDIA Blackwell device, its VRAM is known (non-zero), and the estimated
 // extra compute buffer for the raised batch at this context fits within
 // VRAM/blackwellBatchHeadroomDivisor. In every other case it returns
 // DefaultPhysicalBatch.
 //
 // g.VRAM must be the PER-DEVICE ceiling (the smallest device on a multi-GPU
 // host), not the summed total, because the compute buffer can't be split
 // across devices. A ctx of zero or less is treated as DefaultContextSize.
 //
 // VRAM 0 (unknown) stays conservative rather than risk a per-device OOM; the
 // GB10 / unified-memory path reports system RAM, so it still clears the guard.
 func PhysicalBatchForContext(g GPU, ctx int) int {
 	// Neither a non-Blackwell device nor one with unknown VRAM gets the raise.
 	if !g.IsNVIDIABlackwell() || g.VRAM == 0 {
 		return DefaultPhysicalBatch
 	}
 	if ctx <= 0 {
 		ctx = DefaultContextSize
 	}
 	// Extra scratch is estimated as (context * additional batch rows) cells,
 	// each costing computeBufferBytesPerCell bytes.
 	extraCells := uint64(ctx) * uint64(BlackwellPhysicalBatch-DefaultPhysicalBatch)
 	budget := g.VRAM / blackwellBatchHeadroomDivisor
 	if extraCells*computeBufferBytesPerCell > budget {
 		return DefaultPhysicalBatch
 	}
 	return BlackwellPhysicalBatch
 }
 // IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
 // Callers that re-tune a value chosen by an upstream host (the distributed
 // router correcting the frontend's guess) use this to avoid clobbering an
@@ -122,7 +175,12 @@ func hasParallelOption(opts []string) bool {
 // deterministic device — detection does a live nvidia-smi call.
 var localGPU = func() GPU {
 	vendor, _ := xsysinfo.DetectGPUVendor()
-	vram, _ := xsysinfo.TotalAvailableVRAM()
+	// Use the SMALLEST device's VRAM, not the summed total: the parallel-slot
 	// tier and the batch headroom guard both reason about what fits on a single
 	// card, and per-device compute buffers can't be split across GPUs. Summing
 	// two 16 GiB cards into "32 GiB" is what over-provisioned multi-GPU hosts
 	// into OOM (issue #10485).
 	vram, _ := xsysinfo.MinPerGPUVRAM()
 	return GPU{
 		Vendor:            vendor,
 		ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
@@ -137,10 +195,20 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
 	if cfg == nil {
 		return
 	}
-	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
+	// Raise the physical batch on Blackwell only when the resulting compute
-		cfg.Batch = BlackwellPhysicalBatch
+	// buffer fits the per-device VRAM at THIS model's context. Leaving Batch at 0
-		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
+	// (rather than writing the default 512) preserves the downstream single-pass
-			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
+	// sizing in core/backend.EffectiveBatchSize for embedding/score/rerank.
 	if cfg.Batch == 0 {
 		ctx := DefaultContextSize
 		if cfg.ContextSize != nil {
 			ctx = *cfg.ContextSize
 		}
 		if PhysicalBatchForContext(gpu, ctx) == BlackwellPhysicalBatch {
 			cfg.Batch = BlackwellPhysicalBatch
 			xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
 				"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability, "context", ctx, "vram_gib", gpu.VRAM>>30)
 		}
 	}
 	// Enable concurrent serving by default on a capable GPU: without this the
--- a/core/config/hardware_defaults_internal_test.go
+++ b/core/config/hardware_defaults_internal_test.go
@@ -9,26 +9,37 @@ import (
 // GPU. The detection seam (localGPU) is injected so the path is deterministic
 // without a real GPU.
 var _ = Describe("SetDefaults hardware defaults (single-instance)", func() {
 	const gib = uint64(1) << 30
 	var orig func() GPU
 	BeforeEach(func() { orig = localGPU })
 	AfterEach(func() { localGPU = orig })
-	It("sets the physical batch on a local Blackwell GPU", func() {
+	It("sets the physical batch on a local Blackwell GPU with headroom", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 	})
 	It("leaves batch unset when a large context would overflow the device", func() {
 		// Regression guard for issue #10485: 16 GiB consumer Blackwell + ~200k ctx.
 		localGPU = func() GPU { return GPU{ComputeCapability: "12.0", VRAM: 16 * gib} }
 		ctx := 204800
 		cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(0))
 	})
 	It("leaves batch unset on a non-Blackwell local GPU", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "8.9", VRAM: 119 * gib} }
 		cfg := &ModelConfig{}
 		cfg.SetDefaults()
 		Expect(cfg.Batch).To(Equal(0))
 	})
 	It("never overrides an explicit batch", func() {
-		localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} }
+		localGPU = func() GPU { return GPU{ComputeCapability: "12.1", VRAM: 119 * gib} }
 		cfg := &ModelConfig{}
 		cfg.Batch = 1024
 		cfg.SetDefaults()
--- a/core/config/hardware_defaults_test.go
+++ b/core/config/hardware_defaults_test.go
@@ -7,6 +7,8 @@ import (
 )
 var _ = Describe("Hardware-driven config defaults", func() {
 	const gib = uint64(1) << 30
 	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
 		func(cc string, want bool) {
 			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
@@ -35,21 +37,54 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		})
 	})
 	Describe("PhysicalBatchForContext (per-device VRAM headroom)", func() {
 		It("raises the batch when the compute buffer fits the device", func() {
 			// 16 GiB Blackwell with a small context: the extra scratch is tiny.
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 8192)).
 				To(Equal(BlackwellPhysicalBatch))
 		})
 		It("keeps the default batch when a large context would overflow one device", func() {
 			// The issue #10485 case: 16 GiB consumer Blackwell, ~200k context.
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.0", VRAM: 16 * gib}, 204800)).
 				To(Equal(DefaultPhysicalBatch))
 		})
 		It("still raises the batch on a large unified-memory device (GB10)", func() {
 			// GB10 reports system RAM (~119 GiB) as its single device's VRAM.
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1", VRAM: 119 * gib}, 204800)).
 				To(Equal(BlackwellPhysicalBatch))
 		})
 		It("stays conservative when VRAM is unknown", func() {
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "12.1"}, 8192)).
 				To(Equal(DefaultPhysicalBatch))
 		})
 		It("never raises the batch on non-Blackwell", func() {
 			Expect(PhysicalBatchForContext(GPU{ComputeCapability: "9.0", VRAM: 80 * gib}, 8192)).
 				To(Equal(DefaultPhysicalBatch))
 		})
 	})
 	Describe("ApplyHardwareDefaults", func() {
-		It("raises an unset batch to 2048 on Blackwell", func() {
+		It("raises an unset batch to 2048 on Blackwell with headroom", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
 		})
 		It("leaves batch unset when a large context would overflow one device", func() {
 			// Regression guard for issue #10485: 16 GiB card + ~200k context.
 			ctx := 204800
 			cfg := &ModelConfig{LLMConfig: LLMConfig{ContextSize: &ctx}}
 			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.0", VRAM: 16 * gib})
 			Expect(cfg.Batch).To(Equal(0))
 		})
 		It("leaves batch unset on non-Blackwell", func() {
 			cfg := &ModelConfig{}
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0", VRAM: 119 * gib})
 			Expect(cfg.Batch).To(Equal(0))
 		})
 		It("never overrides an explicit batch", func() {
 			cfg := &ModelConfig{}
 			cfg.Batch = 1024
-			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
 			Expect(cfg.Batch).To(Equal(1024))
 		})
 		It("no-ops on nil", func() {
@@ -57,8 +92,6 @@ var _ = Describe("Hardware-driven config defaults", func() {
 		})
 	})
 	const gib = uint64(1) << 30
 	DescribeTable("DefaultParallelSlots (by VRAM)",
 		func(vramGiB uint64, want int) {
 			Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1204,11 +1204,6 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
 	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
 	// Uses the local GPU here; in distributed mode the router re-applies the same
 	// heuristics for the selected node's GPU before loading. Explicit config wins.
 	ApplyHardwareDefaults(cfg, localGPU())
 	// Apply serving-policy defaults (device-independent): cross-request prefix
 	// caching. Propagates to distributed nodes via the model options.
 	ApplyServingDefaults(cfg)
@@ -1247,6 +1242,16 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.ContextSize = &ctx
 	}
 	runBackendHooks(cfg, lo.modelPath)
 	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell)
 	// LAST, after the context size is fully resolved (explicit config, LoadOptions,
 	// then the GGUF guess inside runBackendHooks): the Blackwell batch guard sizes
 	// the per-device compute buffer against this model's context, so it must see
 	// the final value, not a pre-guess nil. Uses the local GPU here; in distributed
 	// mode the router re-applies the same heuristics for the selected node's GPU
 	// before loading. Explicit config always wins.
 	ApplyHardwareDefaults(cfg, localGPU())
 	cfg.syncKnownUsecasesFromString()
 }
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -149,18 +149,6 @@ func API(application *application.Application) (*echo.Echo, error) {
 	// Middleware - StripPathPrefix must be registered early as it uses Rewrite which runs before routing
 	e.Pre(httpMiddleware.StripPathPrefix())
 	// Stamp the configured external base URL into each request context so
 	// middleware.BaseURL can treat it as authoritative for self-referential
 	// links. Registered as Pre so it runs before routing and handlers.
 	if extBaseURL := application.ApplicationConfig().ExternalBaseURL; extBaseURL != "" {
 		e.Pre(func(next echo.HandlerFunc) echo.HandlerFunc {
 			return func(c echo.Context) error {
 				c.Set("_external_base_url", extBaseURL)
 				return next(c)
 			}
 		})
 	}
 	e.Pre(middleware.RemoveTrailingSlash())
 	if application.ApplicationConfig().MachineTag != "" {
--- a/core/http/middleware/baseurl.go
+++ b/core/http/middleware/baseurl.go
@@ -55,70 +55,17 @@ func BasePathPrefix(c echo.Context) string {
 // The returned URL is guaranteed to end with `/`.
 // The method should be used in conjunction with the StripPathPrefix middleware.
 func BaseURL(c echo.Context) string {
 	// An explicit external base URL (LOCALAI_BASE_URL) is authoritative for
 	// the origin. The proxy-derived path prefix is still appended so a
 	// reverse-proxy mount point keeps working. Trailing slashes are
 	// normalized via BasePathPrefix, which always starts and ends with "/".
 	if ext, ok := c.Get("_external_base_url").(string); ok && ext != "" {
 		return strings.TrimRight(ext, "/") + BasePathPrefix(c)
 	}
 	fwdProto, fwdHost := parseForwarded(c.Request().Header.Get("Forwarded"))
 	scheme := "http"
-	switch {
+	if c.Request().Header.Get("X-Forwarded-Proto") == "https" {
 	case c.Request().TLS != nil:
 		scheme = "https"
-	case strings.EqualFold(firstToken(c.Request().Header.Get("X-Forwarded-Proto")), "https"):
+	} else if c.Request().TLS != nil {
 		scheme = "https"
 	case strings.EqualFold(fwdProto, "https"):
 		scheme = "https"
 	}
 	host := c.Request().Host
 	if forwardedHost := c.Request().Header.Get("X-Forwarded-Host"); forwardedHost != "" {
 		host = forwardedHost
 	} else if fwdHost != "" {
 		host = fwdHost
 	}
 	return scheme + "://" + host + BasePathPrefix(c)
 }
 // firstToken yields the text before the first comma in v with surrounding
 // whitespace removed; when v has no comma the whole trimmed string is returned.
 // Reverse-proxy chains can emit X-Forwarded-Proto as "https,http"; only the
 // first hop (closest to the client) is meaningful for scheme detection.
 func firstToken(v string) string {
 	head, _, _ := strings.Cut(v, ",")
 	return strings.TrimSpace(head)
 }
 // parseForwarded reads the proto and host directives out of the first element
 // of an RFC 7239 Forwarded header (e.g. `for=x;proto=https;host=h, for=y`).
 // Directive names are matched case-insensitively and surrounding double quotes
 // are stripped from values. Directives without an `=` are ignored. Both
 // results are empty when the header is empty or carries neither directive, so
 // the caller can fall through to other signals.
 func parseForwarded(header string) (proto, host string) {
 	if header == "" {
 		return "", ""
 	}
 	// Later elements describe proxies further from the client; drop them.
 	first, _, _ := strings.Cut(header, ",")
 	for _, part := range strings.Split(first, ";") {
 		name, raw, found := strings.Cut(strings.TrimSpace(part), "=")
 		if !found {
 			continue
 		}
 		name = strings.ToLower(strings.TrimSpace(name))
 		val := strings.Trim(strings.TrimSpace(raw), `"`)
 		if name == "proto" {
 			proto = val
 		} else if name == "host" {
 			host = val
 		}
 	}
 	return proto, host
 }
--- a/core/http/middleware/baseurl_test.go
+++ b/core/http/middleware/baseurl_test.go
@@ -135,138 +135,4 @@ var _ = Describe("BaseURL", func() {
 			Entry("missing leading slash", "evil"),
 		)
 	})
 	Context("scheme detection hardening", func() {
 		It("treats comma-separated X-Forwarded-Proto as https when first token is https", func() {
 			app := echo.New()
 			actualURL := ""
 			app.GET("/x", func(c echo.Context) error {
 				actualURL = BaseURL(c)
 				return nil
 			})
 			req := httptest.NewRequest("GET", "/x", nil)
 			req.Header.Set("X-Forwarded-Proto", "https,http")
 			rec := httptest.NewRecorder()
 			app.ServeHTTP(rec, req)
 			Expect(actualURL).To(Equal("https://example.com/"))
 		})
 		It("derives https from the RFC 7239 Forwarded proto directive", func() {
 			app := echo.New()
 			actualURL := ""
 			app.GET("/x", func(c echo.Context) error {
 				actualURL = BaseURL(c)
 				return nil
 			})
 			req := httptest.NewRequest("GET", "/x", nil)
 			req.Header.Set("Forwarded", "for=192.0.2.1;proto=https;host=proxy.example")
 			rec := httptest.NewRecorder()
 			app.ServeHTTP(rec, req)
 			Expect(actualURL).To(Equal("https://proxy.example/"))
 		})
 		It("prefers X-Forwarded-Host over the Forwarded host directive", func() {
 			app := echo.New()
 			actualURL := ""
 			app.GET("/x", func(c echo.Context) error {
 				actualURL = BaseURL(c)
 				return nil
 			})
 			req := httptest.NewRequest("GET", "/x", nil)
 			req.Header.Set("X-Forwarded-Host", "xfh.example")
 			req.Header.Set("Forwarded", "host=fwd.example;proto=https")
 			rec := httptest.NewRecorder()
 			app.ServeHTTP(rec, req)
 			Expect(actualURL).To(Equal("https://xfh.example/"))
 		})
 	})
 	Context("explicit external base URL override", func() {
 		It("uses the configured origin over conflicting forwarded headers", func() {
 			app := echo.New()
 			actualURL := ""
 			app.GET("/x", func(c echo.Context) error {
 				c.Set("_external_base_url", "https://192.168.0.13:34567")
 				actualURL = BaseURL(c)
 				return nil
 			})
 			req := httptest.NewRequest("GET", "/x", nil)
 			req.Header.Set("X-Forwarded-Proto", "http")
 			req.Header.Set("X-Forwarded-Host", "internal:8080")
 			rec := httptest.NewRecorder()
 			app.ServeHTTP(rec, req)
 			Expect(actualURL).To(Equal("https://192.168.0.13:34567/"))
 		})
 		It("combines the configured origin with a detected path prefix", func() {
 			app := echo.New()
 			actualURL := ""
 			app.GET("/hello", func(c echo.Context) error {
 				c.Set("_original_path", "/localai/hello")
 				c.Set("_external_base_url", "https://ext.example")
 				actualURL = BaseURL(c)
 				return nil
 			})
 			req := httptest.NewRequest("GET", "/hello", nil)
 			rec := httptest.NewRecorder()
 			app.ServeHTTP(rec, req)
 			Expect(actualURL).To(Equal("https://ext.example/localai/"))
 		})
 		It("ignores an empty override", func() {
 			app := echo.New()
 			actualURL := ""
 			app.GET("/x", func(c echo.Context) error {
 				c.Set("_external_base_url", "")
 				actualURL = BaseURL(c)
 				return nil
 			})
 			req := httptest.NewRequest("GET", "/x", nil)
 			rec := httptest.NewRecorder()
 			app.ServeHTTP(rec, req)
 			Expect(actualURL).To(Equal("http://example.com/"))
 		})
 	})
 	Context("parseForwarded helper", func() {
 		It("parses unquoted proto and host", func() {
 			proto, host := parseForwarded("for=192.0.2.1;proto=https;host=h.example")
 			Expect(proto).To(Equal("https"))
 			Expect(host).To(Equal("h.example"))
 		})
 		It("strips quotes around values", func() {
 			proto, host := parseForwarded(`proto="https";host="h.example"`)
 			Expect(proto).To(Equal("https"))
 			Expect(host).To(Equal("h.example"))
 		})
 		It("uses only the first element of a multi-element header", func() {
 			proto, host := parseForwarded("proto=https;host=first.example, proto=http;host=second.example")
 			Expect(proto).To(Equal("https"))
 			Expect(host).To(Equal("first.example"))
 		})
 		It("returns empty strings for an empty header", func() {
 			proto, host := parseForwarded("")
 			Expect(proto).To(BeEmpty())
 			Expect(host).To(BeEmpty())
 		})
 		It("skips directives without a value", func() {
 			proto, host := parseForwarded("proto;host=h.example")
 			Expect(proto).To(BeEmpty())
 			Expect(host).To(Equal("h.example"))
 		})
 	})
 	Context("firstToken helper", func() {
 		It("returns the whole trimmed string when there is no comma", func() {
 			Expect(firstToken("  https  ")).To(Equal("https"))
 		})
 		It("returns the first trimmed token when there is a comma", func() {
 			Expect(firstToken("https , http")).To(Equal("https"))
 		})
 	})
 })
--- a/core/http/react-ui/public/locales/en/home.json
+++ b/core/http/react-ui/public/locales/en/home.json
@@ -82,6 +82,7 @@
    "tier": {
      "cpu": "CPU-only",
      "gpu-small": "GPU",
      "gpu-mid": "GPU",
      "gpu-large": "GPU"
    },
    "cpuNote": "No GPU detected — these small models stay responsive on CPU.",
--- a/core/http/react-ui/public/locales/en/models.json
+++ b/core/http/react-ui/public/locales/en/models.json
@@ -2,6 +2,16 @@
  "title": "Install Models",
  "subtitle": "Browse and install AI models from the gallery",
  "models": "Models",
  "recommended": {
    "title": "Recommended for your hardware",
    "cpuNote": "No GPU detected - small models that stay responsive on CPU.",
    "gpuNote": "Sized to fit your available VRAM with room for context.",
    "install": "Install",
    "installing": "Installing",
    "installStarted": "Installing {{model}}…",
    "installFailed": "Install failed: {{message}}",
    "dismiss": "Dismiss recommendations"
  },
  "stats": {
    "available": "Available",
    "installed": "Installed"
--- a/core/http/react-ui/src/App.css
+++ b/core/http/react-ui/src/App.css
@@ -6409,6 +6409,9 @@ select.input {
  font-size: 0.875rem;
  word-break: break-all;
 }
 .home-starters-badge {
  font-size: 0.625rem;
 }
 .home-starters-size {
  margin-left: auto;
  font-size: 0.75rem;
@@ -6416,6 +6419,74 @@ select.input {
  white-space: nowrap;
 }
 /* ──────────────────── Models gallery: recommended-for-your-hardware strip ──────────────────── */
 .rec-models {
  margin-bottom: var(--spacing-md);
  padding: var(--spacing-md) var(--spacing-lg);
 }
 .rec-models-head {
  display: flex;
  align-items: flex-start;
  justify-content: space-between;
  gap: var(--spacing-md);
 }
 .rec-models-title {
  display: flex;
  align-items: center;
  gap: var(--spacing-sm);
  flex-wrap: wrap;
 }
 .rec-models-title i {
  color: var(--color-primary);
 }
 .rec-models-note {
  font-size: 0.8125rem;
  color: var(--color-text-secondary);
 }
 .rec-models-dismiss {
  background: none;
  border: none;
  color: var(--color-text-muted);
  cursor: pointer;
  padding: 4px;
  flex-shrink: 0;
 }
 .rec-models-dismiss:hover {
  color: var(--color-text-primary);
 }
 .rec-models-grid {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
  gap: var(--spacing-sm);
  margin-top: var(--spacing-md);
 }
 .rec-models-item {
  display: flex;
  flex-direction: column;
  gap: var(--spacing-xs);
  padding: var(--spacing-sm) var(--spacing-md);
  border: 1px solid var(--color-border-subtle);
  border-radius: var(--radius-md);
  background: var(--color-bg-primary);
 }
 .rec-models-item-name {
  font-weight: 500;
  font-size: 0.8125rem;
  word-break: break-all;
 }
 .rec-models-item-meta {
  display: flex;
  gap: var(--spacing-sm);
  font-size: 0.75rem;
  color: var(--color-text-muted);
 }
 .rec-models-item-fit {
  display: inline-flex;
  align-items: center;
  gap: 4px;
 }
 /* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */
 .home-connect {
--- a/core/http/react-ui/src/components/RecommendedModels.jsx
+++ b/core/http/react-ui/src/components/RecommendedModels.jsx
@@ -0,0 +1,86 @@
 import { useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import { modelsApi } from '../utils/api'
 import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
 const DISMISS_KEY = 'localai_rec_models_dismissed'
 // "Recommended for your hardware" strip at the top of the Models gallery. Shares
 // the hardware-fit ranking with the empty-state starter widget via
 // useRecommendedModels, but styled for the gallery page and dismissible (the
 // gallery is a repeat-visit surface, so it shouldn't nag).
 export default function RecommendedModels({ addToast }) {
  const { t } = useTranslation('models')
  const { recommended, tier, loading } = useRecommendedModels({ count: 4 })
  const [installing, setInstalling] = useState(() => new Set())
  const [dismissed, setDismissed] = useState(() => {
    try { return localStorage.getItem(DISMISS_KEY) === '1' } catch { return false }
  })
  if (loading || dismissed) return null
  if (!recommended || recommended.length === 0) return null
  const dismiss = () => {
    try { localStorage.setItem(DISMISS_KEY, '1') } catch { /* ignore */ }
    setDismissed(true)
  }
  const install = async (name) => {
    setInstalling(prev => new Set(prev).add(name))
    try {
      await modelsApi.install(name)
      addToast?.(t('recommended.installStarted', { model: name }), 'success')
    } catch (err) {
      addToast?.(t('recommended.installFailed', { message: err.message }), 'error')
      setInstalling(prev => {
        const next = new Set(prev)
        next.delete(name)
        return next
      })
    }
  }
  const isGpu = tier.id !== 'cpu'
  return (
    <div className="rec-models card">
      <div className="rec-models-head">
        <div className="rec-models-title">
          <i className={`fas ${isGpu ? 'fa-microchip' : 'fa-memory'}`} aria-hidden="true" />
          <strong>{t('recommended.title')}</strong>
          <span className="rec-models-note">{isGpu ? t('recommended.gpuNote') : t('recommended.cpuNote')}</span>
        </div>
        <button type="button" className="rec-models-dismiss" onClick={dismiss} aria-label={t('recommended.dismiss')} title={t('recommended.dismiss')}>
          <i className="fas fa-times" aria-hidden="true" />
        </button>
      </div>
      <div className="rec-models-grid">
        {recommended.map(m => {
          const busy = installing.has(m.name)
          return (
            <div key={m.name} className="rec-models-item">
              <div className="rec-models-item-name">{m.name}</div>
              <div className="rec-models-item-meta">
                {isNvfp4Name(m.name) && <span className="badge badge-info">NVFP4</span>}
                {m.sizeDisplay && <span>{m.sizeDisplay}</span>}
                {isGpu && m.vramDisplay && (
                  <span className="rec-models-item-fit"><i className="fas fa-microchip" aria-hidden="true" /> {m.vramDisplay}</span>
                )}
              </div>
              <button
                type="button"
                className="btn btn-primary btn-sm"
                disabled={busy}
                onClick={() => install(m.name)}
              >
                {busy
                  ? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('recommended.installing')}</>)
                  : (<><i className="fas fa-download" aria-hidden="true" /> {t('recommended.install')}</>)}
              </button>
            </div>
          )
        })}
      </div>
    </div>
  )
 }
--- a/core/http/react-ui/src/components/StarterModels.jsx
+++ b/core/http/react-ui/src/components/StarterModels.jsx
@@ -1,79 +1,78 @@
-import { useState, useEffect, useMemo } from 'react'
+import { useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import { modelsApi } from '../utils/api'
-import { useResources } from '../hooks/useResources'
+import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
-// Curated, hardware-tiered starter models for the empty-state onboarding. Names
+// Static fallback used only when the live gallery / estimates can't be reached
-// are real gallery entries (gallery/index.yaml); we intersect them against the
+// (offline, trimmed gallery). The hook is the primary, data-driven path; these
-// live gallery at render time so a custom/trimmed gallery degrades gracefully
+// are real gallery names kept as a safety net so onboarding never shows nothing.
-// (unmatched entries simply don't render).
+// Gemma picks use the QAT (quantization-aware-trained) Q4 builds. NVIDIA boxes
-//
+// get NVFP4 + MTP variants at the mid/large tiers (see NVIDIA below).
-// The guiding rule the maintainer asked for: CPU-only machines should be
+const BASE = {
-// steered to genuinely small models (1-4B, Q4) that stay responsive without a
+  cpu: [
-// GPU. GPU tiers scale the suggestion up with available VRAM.
+    { name: 'gemma-4-e2b-it-qat-q4_0', size: '~1.5 GB' },
-const SMALL = [
+    { name: 'qwen3.5-4b-claude-4.6-opus-reasoning-distilled', size: '~2.5 GB' },
-  { name: 'llama-3.2-1b-instruct:q4_k_m', size: '~0.8 GB' },
+    { name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
-  { name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
+    { name: 'lfm2.5-1.2b-instruct', size: '~0.8 GB' },
-  { name: 'qwen3-1.7b', size: '~1.4 GB' },
+  ],
-  { name: 'gemma-3-1b-it', size: '~0.8 GB' },
+  'gpu-small': [
-]
+    { name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
-const MID = [
+    { name: 'lfm2.5-8b-a1b', size: '~5 GB' },
-  { name: 'qwen3-4b', size: '~2.5 GB' },
+    { name: 'qwen3.5-9b', size: '~5.5 GB' },
-  { name: 'gemma-3-4b-it', size: '~3 GB' },
+    { name: 'gemma-4-12b-it-qat-q4_0', size: '~7 GB' },
-  { name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
+  ],
-]
+  'gpu-mid': [
-const LARGE = [
+    { name: 'qwen3.6-27b', size: '~16 GB' },
-  { name: 'meta-llama-3.1-8b-instruct', size: '~5 GB' },
+    { name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
-  { name: 'qwen3-4b', size: '~2.5 GB' },
+    { name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
-  { name: 'mistral-7b-instruct-v0.3', size: '~4 GB' },
+    { name: 'qwen3.5-27b', size: '~16 GB' },
-]
+  ],
  'gpu-large': [
    { name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
    { name: 'qwen3.6-35b-a3b-claude-4.6-opus-reasoning-distilled', size: '~20 GB' },
    { name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
    { name: 'qwen3.5-35b-a3b-apex', size: '~20 GB' },
  ],
 }
-const GB = 1024 * 1024 * 1024
+// NVIDIA-only overrides: NVFP4 is a Blackwell-optimised 4-bit format paired with
 // MTP (multi-token prediction) for speed. Only the mid/large tiers have these.
 const NVIDIA = {
  'gpu-mid': [
    { name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
    { name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
    { name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
    { name: 'qwen3.6-27b', size: '~16 GB' },
  ],
  'gpu-large': [
    { name: 'qwen3.6-35b-a3b-nvfp4-mtp', size: '~18 GB' },
    { name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
    { name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
    { name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
  ],
 }
-// Pick a tier from detected hardware. total_memory is GPU VRAM in bytes (0 when
+function fallbackFor(tierId, isNvidia) {
-// CPU-only). Thresholds are deliberately conservative so a suggestion that
+  if (isNvidia && NVIDIA[tierId]) return NVIDIA[tierId]
-// "fits" really does.
+  return BASE[tierId] || BASE.cpu
 function pickTier(resources) {
  const isGpu = resources?.type === 'gpu'
  const vram = resources?.aggregate?.total_memory || 0
  if (!isGpu || vram <= 0) return { id: 'cpu', list: SMALL }
  if (vram < 8 * GB) return { id: 'gpu-small', list: MID }
  return { id: 'gpu-large', list: LARGE }
 }
 export default function StarterModels({ addToast, onInstallStarted }) {
  const { t } = useTranslation('home')
-  const { resources } = useResources()
+  const { recommended, tier, isNvidia, loading } = useRecommendedModels({ count: 4 })
  const [available, setAvailable] = useState(null) // Set of gallery names, or null while loading
  const [installing, setInstalling] = useState(() => new Set())
-  const tier = useMemo(() => pickTier(resources), [resources])
+  // While the hardware probe + gallery query are in flight, render nothing
-  const candidates = tier.list
+  // rather than flashing fallback content that may be replaced a moment later.
  if (loading) return null
-  // Verify candidates exist in the live gallery. One search per name (the tier
+  // Prefer live recommendations; fall back to the static list only when the
-  // has at most a handful) keeps this resilient to gallery customization.
+  // gallery yielded nothing.
-  useEffect(() => {
+  const items = (recommended && recommended.length > 0)
-    let cancelled = false
+    ? recommended.map(r => ({ name: r.name, size: r.sizeDisplay }))
-    const names = [...new Set(candidates.map(c => c.name))]
+    : fallbackFor(tier.id, isNvidia)
    Promise.all(names.map(name =>
      modelsApi.list({ search: name, page: 1 })
        .then(data => (data?.models || []).some(m => (m.name || m.id) === name) ? name : null)
        .catch(() => null)
    )).then(found => {
      if (cancelled) return
      const hits = found.filter(Boolean)
      // If verification yielded nothing (e.g. gallery unreachable), fall back to
      // showing the curated list rather than an empty widget.
      setAvailable(hits.length > 0 ? new Set(hits) : null)
    })
    return () => { cancelled = true }
  }, [candidates])
-  const visible = available === null
+  if (items.length === 0) return null
    ? candidates
    : candidates.filter(c => available.has(c.name))
  if (visible.length === 0) return null
  const install = async (name) => {
    setInstalling(prev => new Set(prev).add(name))
@@ -104,12 +103,13 @@ export default function StarterModels({ addToast, onInstallStarted }) {
        {tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
      </p>
      <ul className="home-starters-list">
-        {visible.map(c => {
+        {items.map(c => {
          const busy = installing.has(c.name)
          return (
            <li key={c.name} className="home-starters-item">
              <span className="home-starters-name">{c.name}</span>
-              <span className="home-starters-size">{c.size}</span>
+              {isNvfp4Name(c.name) && <span className="badge badge-info home-starters-badge">NVFP4</span>}
              {c.size && <span className="home-starters-size">{c.size}</span>}
              <button
                type="button"
                className="btn btn-primary btn-sm"
--- a/core/http/react-ui/src/hooks/useRecommendedModels.js
+++ b/core/http/react-ui/src/hooks/useRecommendedModels.js
@@ -0,0 +1,108 @@
 import { useState, useEffect } from 'react'
 import { modelsApi } from '../utils/api'
 import { useResources } from './useResources'
 // Data-driven "recommended for your hardware" model picks. The gallery exposes
 // no popularity/download signal and the list response carries no size, so we:
 //   1. ask the server for chat-capable models in their natural (curated) order,
 //   2. estimate size/VRAM for the top candidates (same endpoint the Models page
 //      uses), and
 //   3. rank by hardware fit — smallest on CPU-only boxes, largest-that-fits on
 //      GPUs (bigger == better quality while still fitting VRAM).
 //
 // Returns `recommended === null` while loading, `[]` when nothing could be
 // resolved (gallery/estimates unavailable) so callers can fall back.
 const GB = 1024 * 1024 * 1024
 const DEFAULT_CTX = 4096
 // True when a model name mentions NVFP4 (case-insensitive). NVFP4 is a
 // Blackwell/NVIDIA-specific 4-bit format, so callers use this to surface such
 // models on NVIDIA hardware and filter them out elsewhere. A missing name
 // (null/undefined/empty) is treated as "not NVFP4".
 export const isNvfp4Name = (name) => {
  const label = name || ''
  return /nvfp4/i.test(label)
 }
 // Reports whether the resources payload lists at least one GPU whose `vendor`
 // is "nvidia" (compared case-insensitively). Anything without a `gpus` array
 // yields false; entries that are null or lack a vendor are skipped.
 export function hasNvidiaGpu(resources) {
  const gpus = resources?.gpus
  if (!Array.isArray(gpus)) return false
  for (const gpu of gpus) {
    const vendor = gpu?.vendor || ''
    if (vendor.toLowerCase() === 'nvidia') return true
  }
  return false
 }
 // Maps the resources payload to a hardware tier: { id, vram }.
 //   - 'cpu'       when the payload is not of type 'gpu' or reports no memory
 //                 (vram is normalised to 0 in that case),
 //   - 'gpu-small' below 8 GB,
 //   - 'gpu-mid'   from 8 GB up to (but excluding) 24 GB,
 //   - 'gpu-large' otherwise.
 // `vram` is read from resources.aggregate.total_memory.
 export function recommendTier(resources) {
  const vram = resources?.aggregate?.total_memory || 0
  const cpuOnly = resources?.type !== 'gpu' || vram <= 0
  if (cpuOnly) return { id: 'cpu', vram: 0 }

  let id = 'gpu-large'
  if (vram < 8 * GB) {
    id = 'gpu-small'
  } else if (vram < 24 * GB) {
    id = 'gpu-mid'
  }
  return { id, vram }
 }
 // Orders estimated candidates by hardware fit and returns at most `count`.
 //   candidates: objects carrying name, sizeBytes and (optionally) vramBytes
 //   tier:       { id, vram } as produced by recommendTier
 //   isNvidia:   whether NVFP4 builds may be offered at all
 // Candidates without a size estimate are never returned. The input array is
 // not mutated.
 function rank(candidates, tier, count, isNvidia) {
  const smallestFirst = (a, b) => a.sizeBytes - b.sizeBytes
  const largestFirst = (a, b) => b.sizeBytes - a.sizeBytes

  // NVFP4 builds are only kept on NVIDIA hardware; everything needs a size.
  const eligible = candidates.filter(c => {
    if (c.sizeBytes == null) return false
    return isNvidia || !isNvfp4Name(c.name)
  })

  // CPU-only: the smallest models are the ones that stay responsive.
  if (tier.id === 'cpu') {
    return eligible.slice().sort(smallestFirst).slice(0, count)
  }

  // GPU: keep what fits within 95% of VRAM. When nothing fits (tiny GPU) fall
  // back to the whole eligible pool and prefer the smallest instead.
  const budget = tier.vram * 0.95
  const fitting = eligible.filter(c => c.vramBytes != null && c.vramBytes <= budget)
  const anyFit = fitting.length > 0
  const bySize = anyFit ? largestFirst : smallestFirst

  const ordered = (anyFit ? fitting : eligible).slice().sort((a, b) => {
    if (isNvidia) {
      // NVFP4 picks lead on NVIDIA; size only breaks ties within each group.
      const aIsNvfp4 = isNvfp4Name(a.name)
      const bIsNvfp4 = isNvfp4Name(b.name)
      if (aIsNvfp4 !== bIsNvfp4) return aIsNvfp4 ? -1 : 1
    }
    return bySize(a, b)
  })
  return ordered.slice(0, count)
 }
 // React hook returning hardware-aware model recommendations.
 //
 // Options:
 //   count         - maximum number of recommendations to return (default 4)
 //   candidatePool - how many gallery entries to request and estimate (default 10)
 //
 // Returns { recommended, tier, isNvidia, error, loading }:
 //   recommended - null while a lookup is pending, otherwise the ranked list
 //                 ([] when the gallery request failed or nothing could be ranked)
 //   tier        - result of recommendTier(resources)
 //   isNvidia    - result of hasNvidiaGpu(resources)
 //   error       - message of the failed gallery request, or null
 //   loading     - true exactly while `recommended` is null
 export function useRecommendedModels({ count = 4, candidatePool = 10 } = {}) {
  const { resources } = useResources()
  const [recommended, setRecommended] = useState(null)
  const [error, setError] = useState(null)
  // Nothing is fetched while `resources` is still null.
  const resReady = resources !== null
  const tier = recommendTier(resources)
  const isNvidia = hasNvidiaGpu(resources)
  useEffect(() => {
    if (!resReady) return
    // Set by the cleanup so a superseded run never writes stale state.
    let cancelled = false
    setRecommended(null)
    setError(null)
    ;(async () => {
      try {
        const data = await modelsApi.list({ tag: 'chat', items: candidatePool, page: 1 })
        // Recommend models the user hasn't installed yet.
        const models = (data?.models || []).filter(m => !m.installed)
        const estimated = await Promise.all(models.map(async (m) => {
          const name = m.name || m.id
          try {
            const e = await modelsApi.estimate(name, [DEFAULT_CTX])
            const ctx = e?.estimates?.[String(DEFAULT_CTX)]
            return {
              name,
              description: m.description,
              sizeBytes: e?.sizeBytes ?? null,
              sizeDisplay: e?.sizeDisplay ?? null,
              vramBytes: ctx?.vramBytes ?? null,
              vramDisplay: ctx?.vramDisplay ?? null,
            }
          } catch {
            // A failed estimate only disqualifies this model: rank() drops
            // entries whose sizeBytes is null.
            return { name, sizeBytes: null }
          }
        }))
        if (cancelled) return
        setRecommended(rank(estimated, tier, count, isNvidia))
      } catch (e) {
        if (cancelled) return
        // Settle the list first so `loading` always clears, then read the
        // message defensively: a rejection can carry any value (including
        // undefined), and dereferencing it here would throw inside the handler
        // and leave the hook stuck in the loading state.
        setRecommended([])
        setError(e?.message ?? String(e))
      }
    })()
    return () => { cancelled = true }
    // tier.id / tier.vram / isNvidia are primitives, so resource polling doesn't re-run this.
  }, [resReady, tier.id, tier.vram, isNvidia, count, candidatePool])
  return { recommended, tier, isNvidia, error, loading: recommended === null }
 }
--- a/core/http/react-ui/src/pages/Models.jsx
+++ b/core/http/react-ui/src/pages/Models.jsx
@@ -13,6 +13,7 @@ import ConfirmDialog from '../components/ConfirmDialog'
 import GalleryLoader from '../components/GalleryLoader'
 import Toggle from '../components/Toggle'
 import ResponsiveTable from '../components/ResponsiveTable'
 import RecommendedModels from '../components/RecommendedModels'
 import React from 'react'
@@ -301,6 +302,8 @@ export default function Models() {
        }
      />
      <RecommendedModels addToast={addToast} />
      {/* Search */}
      <div className="search-bar" style={{ marginBottom: 'var(--spacing-md)' }}>
        <i className="fas fa-search search-icon" />
--- a/core/http/routes/auth.go
+++ b/core/http/routes/auth.go
@@ -268,7 +268,7 @@ func RegisterAuthRoutes(e *echo.Echo, app *application.Application) {
 	// Set up OAuth manager when any OAuth/OIDC provider is configured
 	if appConfig.Auth.GitHubClientID != "" || appConfig.Auth.OIDCClientID != "" {
 		oauthMgr, err := auth.NewOAuthManager(
-			appConfig.ExternalBaseURL,
+			appConfig.Auth.BaseURL,
 			auth.OAuthParams{
 				GitHubClientID:     appConfig.Auth.GitHubClientID,
 				GitHubClientSecret: appConfig.Auth.GitHubClientSecret,
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -156,7 +156,10 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
 		VRAM:              node.TotalVRAM,
 	}
 	if config.IsManagedPhysicalBatch(int(opts.NBatch)) {
-		opts.NBatch = int32(config.PhysicalBatch(gpu))
+		// Gate the raised batch on the selected node's per-device VRAM at this
 		// model's context, so a large context can't overflow the node's compute
 		// buffer (issue #10485). node.TotalVRAM is the node's reported ceiling.
 		opts.NBatch = int32(config.PhysicalBatchForContext(gpu, int(opts.ContextSize)))
 	}
 	// Default concurrent serving for the selected node (the frontend that built
 	// the options may have no GPU). Only adds when no parallel option is set.
--- a/core/services/nodes/router_hardware_internal_test.go
+++ b/core/services/nodes/router_hardware_internal_test.go
@@ -8,12 +8,19 @@ import (
 )
 var _ = Describe("applyNodeHardwareDefaults", func() {
-	It("raises a managed default batch on a Blackwell node", func() {
+	It("raises a managed default batch on a Blackwell node with headroom", func() {
-		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 8192}
-		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
 		Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
 	})
 	It("keeps the default batch when a large context would overflow the node", func() {
 		// Regression guard for issue #10485 on the distributed path.
 		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, ContextSize: 204800}
 		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.0", TotalVRAM: 16 << 30})
 		Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
 	})
 	It("resets a Blackwell guess on a non-Blackwell node", func() {
 		// frontend (Blackwell) guessed high, but the selected node is not Blackwell
 		opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
--- a/docs/content/advanced/reverse-proxy-tls.md
+++ b/docs/content/advanced/reverse-proxy-tls.md
@@ -14,26 +14,6 @@ When running LocalAI behind a TLS termination reverse proxy, the Web UI may fail
 LocalAI uses the `X-Forwarded-Proto` HTTP header to determine the protocol used by clients. When this header is set to `https`, LocalAI will generate HTTPS URLs for static assets in the Web UI.
 ## Running behind a reverse proxy (HTTPS / subpath)
 LocalAI does not terminate TLS itself, so HTTPS is provided by a reverse
 proxy in front of it. Self-referential links (generated image and video
 URLs, async job status URLs, OAuth callbacks) need the externally visible
 scheme, host and port.
 LocalAI determines these in this order:
 1. `LOCALAI_BASE_URL` - if set, it is authoritative for the origin. Set it to
   the externally visible base URL, e.g. `LOCALAI_BASE_URL=https://localai.example.com`
   or `https://192.168.0.13:34567`. Recommended whenever links come back with
   the wrong scheme or host.
 2. Otherwise, the `X-Forwarded-Proto` and `X-Forwarded-Host` headers (or the
   RFC 7239 `Forwarded` header) sent by the proxy. Ensure your proxy forwards
   `X-Forwarded-Proto: https`.
 A reverse-proxy subpath mount is supported via `X-Forwarded-Prefix`; it is
 appended to `LOCALAI_BASE_URL` when both are present.
 ## Required Headers
 Your reverse proxy must forward these headers to LocalAI:
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -129,6 +129,61 @@ func TotalAvailableVRAM() (uint64, error) {
 	return 0, nil
 }
 // MinPerGPUVRAM returns the total VRAM of the SMALLEST GPU on the host (in
 // bytes), or 0 when no per-device VRAM is known. Unlike TotalAvailableVRAM
 // (which sums across devices) this reports a single device's ceiling, which is
 // the right figure for decisions about what must fit on one card: the compute
 // buffer (sized by n_ubatch) and the parallel-slot tier. Summing a multi-GPU
 // host's VRAM over-provisions those into a per-device OOM (issue #10485).
 //
 // Unified-memory devices (GB10, Apple) report system RAM as their single
 // device's VRAM, so they are unaffected.
 func MinPerGPUVRAM() (uint64, error) {
 	// Prefer per-device binary detection (nvidia-smi/rocm-smi report true
 	// per-card VRAM); ghw's per-card memory can reflect NUMA node RAM on some
 	// hosts, which is why TotalAvailableVRAM treats it as a sum.
 	if infos := GetGPUMemoryUsage(); len(infos) > 0 {
 		if v := minNonZeroVRAM(infos); v > 0 {
 			return v, nil
 		}
 	}
 	// Fallback: ghw per-card memory, taking the minimum non-zero card.
 	if gpus, err := GPUs(); err == nil {
 		var min uint64
 		for _, gpu := range gpus {
 			if gpu == nil || gpu.Node == nil || gpu.Node.Memory == nil {
 				continue
 			}
 			if b := gpu.Node.Memory.TotalUsableBytes; b > 0 {
 				if u := uint64(b); min == 0 || u < min {
 					min = u
 				}
 			}
 		}
 		if min > 0 {
 			return min, nil
 		}
 	}
 	return 0, nil
 }
 // minNonZeroVRAM scans infos and returns the lowest TotalVRAM that is not
 // zero. It returns 0 when infos is empty or every entry reports zero VRAM.
 func minNonZeroVRAM(infos []GPUMemoryInfo) uint64 {
 	var smallest uint64
 	for i := range infos {
 		vram := infos[i].TotalVRAM
 		switch {
 		case vram == 0:
 			// Device did not report VRAM; it must not win the minimum.
 		case smallest == 0, vram < smallest:
 			smallest = vram
 		}
 	}
 	return smallest
 }
 func HasGPU(vendor string) bool {
 	gpus, err := GPUs()
 	if err != nil {
--- a/pkg/xsysinfo/minvram_internal_test.go
+++ b/pkg/xsysinfo/minvram_internal_test.go
@@ -0,0 +1,37 @@
 package xsysinfo
 import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
 // Specs for minNonZeroVRAM: smallest non-zero device wins, zero-VRAM entries
 // are skipped, and 0 is returned when nothing reports VRAM.
 var _ = Describe("minNonZeroVRAM", func() {
 	// One GiB in bytes, so fixtures read in the same unit as GPU spec sheets.
 	const gib = uint64(1) << 30
 	It("returns the smallest device on a multi-GPU host", func() {
 		// Two cards of different sizes (here a 16 GiB card paired with a
 		// 12 GiB one): the smallest device is the per-card allocation ceiling.
 		infos := []GPUMemoryInfo{
 			{TotalVRAM: 16 * gib},
 			{TotalVRAM: 12 * gib},
 		}
 		Expect(minNonZeroVRAM(infos)).To(Equal(12 * gib))
 	})
 	It("ignores devices that report zero VRAM", func() {
 		// A zero entry must not be mistaken for the minimum.
 		infos := []GPUMemoryInfo{
 			{TotalVRAM: 0},
 			{TotalVRAM: 24 * gib},
 		}
 		Expect(minNonZeroVRAM(infos)).To(Equal(24 * gib))
 	})
 	It("returns the single device's VRAM on a one-GPU host", func() {
 		Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 16 * gib}})).To(Equal(16 * gib))
 	})
 	It("returns 0 when no device reports VRAM", func() {
 		// Covers both an all-zero slice and a nil slice.
 		Expect(minNonZeroVRAM([]GPUMemoryInfo{{TotalVRAM: 0}})).To(BeZero())
 		Expect(minNonZeroVRAM(nil)).To(BeZero())
 	})
 })
Author	SHA1	Message	Date
LocalAI [bot]	3c63431e46	chore: ⬆️ Update ServeurpersoCom/omnivoice.cpp to `0f37401bebe9b20c0160a888e592108fc1d17607` (#10492 ) ⬆️ Update ServeurpersoCom/omnivoice.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-25 00:57:58 +02:00
LocalAI [bot]	3f647a2764	chore: ⬆️ Update ikawrakow/ik_llama.cpp to `d5507e33ae7ee2b7b41475f08044d3bde3b839ee` (#10498 ) ⬆️ Update ikawrakow/ik_llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-25 00:57:42 +02:00
LocalAI [bot]	f88981cdce	feat(ui): data-driven hardware model recommendations + gallery surfacing (#10500 ) * feat(ui): make hardware starter models data-driven The empty-state starter widget recommended from a hardcoded list, which drifts as the gallery evolves. Add useRecommendedModels: it queries the live gallery for chat-capable models (their natural curated order, since the gallery exposes no popularity signal), estimates size/VRAM for the top candidates via the existing estimate endpoint, and ranks by hardware fit - smallest on CPU-only boxes, largest-that-fits on GPUs. StarterModels now renders those live picks and keeps the curated static list only as an offline/trimmed-gallery fallback. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(ui): recommend models for your hardware in the gallery Hardware-aware recommendations were only shown on the first-run empty state. Surface them on the main Models gallery too: a dismissible "Recommended for your hardware" strip at the top, sharing the useRecommendedModels fit-ranking with the starter widget. CPU-only boxes get small models; GPUs get the largest picks that fit VRAM, with size and VRAM shown per card. One-click install; dismissal persists per browser. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] * feat(ui): gpu-mid tier + NVIDIA NVFP4 model recommendations Refine the hardware recommendation tiers and curated picks: - Add a gpu-mid tier (8-24GB VRAM) between gpu-small and gpu-large, so ~27B-class models are suggested separately from the 30B+ large tier. - Detect NVIDIA GPUs (resources.gpus[].vendor) and, on NVIDIA only, prefer NVFP4 + MTP variants (Blackwell-optimised); NVFP4 models are filtered out of recommendations on non-NVIDIA hardware where they can't run. This applies to both the live ranking and the static fallback, with an NVFP4 badge shown on those picks. 
- Refresh the curated fallback to current models: Gemma-4 QAT Q4 builds at every tier, low qwen3.5 (4B distilled / 9B) on CPU/small, qwen3.6-27b and MTP variants at mid, qwen3.6/qwen3.5 35B-A3B apex/distilled at large. All names verified against gallery/index.yaml. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code] --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-25 00:22:45 +02:00
LocalAI [bot]	0d6de15ae9	fix(config): per-device VRAM headroom for Blackwell defaults (#10485 ) (#10494 ) The hardware-tuned defaults from #10411 were measured on a GB10 / DGX Spark (128 GiB unified memory) and over-provisioned multi-GPU consumer Blackwell (e.g. 2x16 GiB RTX 50-series) into CUDA OOM during model init: - The Blackwell physical batch (512 -> 2048) sets both n_batch and n_ubatch. The compute buffer scales ~n_ubatch * n_ctx and is allocated PER DEVICE (it can't be split across GPUs), so a large context turns ub2048 into multi-GiB of scratch that must fit one 16 GiB card. - The VRAM-scaled parallel-slot default tiered off TotalAvailableVRAM(), which SUMS all GPUs (2x16 -> "32 GiB" -> 8 slots), but the allocations are per-device. Make both decisions per-device and context-aware: - xsysinfo.MinPerGPUVRAM() reports the smallest device's VRAM; localGPU() uses it so the parallel tier and batch guard reason about one card. - PhysicalBatchForContext(gpu, ctx) raises the batch only when the extra compute buffer fits VRAM/4 at this model's context (16 GiB crosses over ~174k ctx, 32 GiB ~349k; GB10 reports system RAM so it still clears it). - Apply hardware defaults AFTER runBackendHooks in SetDefaults so the GGUF-guessed context is resolved before the batch decision. - The distributed router gates the node batch the same way. Unified-memory devices (GB10, Apple) report system RAM as their single device's VRAM, so they keep the prefill win. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-25 00:07:48 +02:00