fix(gallery): match mmproj/model quant as a whole token so F16 no longer selects BF16 (#10559 )

pickPreferredGroup matched a quant preference against the shard base filename with strings.Contains. Because `f16` is a substring of `bf16`, asking for the `F16` mmproj quant would wrongly satisfy a `BF16` file and select it when its group came first. Match the preference as a whole token instead: it must be delimited by a non-alphanumeric character (or the string start/end) on both outer edges. Separators inside the preference itself (e.g. `ud-q4_k_xl`) are left untouched, and all occurrences are scanned before rejecting. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]
2026-06-27 18:06:58 -04:00 · 2026-06-27 21:57:23 +00:00
9 changed files with 136 additions and 149 deletions
--- a/core/application/distributed.go
+++ b/core/application/distributed.go
@@ -355,7 +355,6 @@ func initDistributed(cfg *config.ApplicationConfig, authDB *gorm.DB, configLoade
 		PrefixProvider:   prefixProvider,
 		PrefixConfig:     prefixCfg,
 		Pressure:         pressure,
-		SharedModels:     cfg.Distributed.SharedModels,
 	})

 	// Wire staging-progress broadcasting so file-staging shows up on every
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -160,7 +160,6 @@ type RunCMD struct {
 	RegistrationRequireAuth   bool   `env:"LOCALAI_REGISTRATION_REQUIRE_AUTH" default:"false" help:"Fail startup when distributed mode is enabled but LOCALAI_REGISTRATION_TOKEN is empty (node endpoints and worker file-transfer server would otherwise be unauthenticated)" group:"distributed"`
 	DistributedRequireAuth    bool   `env:"LOCALAI_DISTRIBUTED_REQUIRE_AUTH" default:"false" help:"Umbrella switch: require BOTH NATS JWT credentials and a registration token when distributed mode is enabled (implies --nats-require-auth and --registration-require-auth)" group:"distributed"`
 	AutoApproveNodes          bool   `env:"LOCALAI_AUTO_APPROVE_NODES" default:"false" help:"Auto-approve new worker nodes (skip admin approval)" group:"distributed"`
-	DistributedSharedModels   bool   `env:"LOCALAI_DISTRIBUTED_SHARED_MODELS" default:"false" help:"Assert that every node mounts the SAME models directory at the SAME path (shared volume). When true, the router skips staging model files to workers and loads them directly from the shared path, avoiding re-downloads." group:"distributed"`
 	DistributedPrefixCache    bool   `env:"LOCALAI_DISTRIBUTED_PREFIX_CACHE" default:"true" help:"Enable prefix-cache-aware routing in distributed mode (default true). When false, routing falls back to round-robin." group:"distributed"`
 	DistributedPrefixCacheTTL string `env:"LOCALAI_DISTRIBUTED_PREFIX_CACHE_TTL" help:"Idle-timeout for prefix-cache index entries; also drives the background eviction cadence (every TTL/2). Default 5m." group:"distributed"`
 	BackendInstallTimeout     string `env:"LOCALAI_NATS_BACKEND_INSTALL_TIMEOUT" help:"NATS round-trip timeout for backend.install requests sent to worker nodes (default 15m). Increase for slow links pulling multi-GB images." group:"distributed"`
@@ -311,9 +310,6 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	if r.DistributedRequireAuth {
 		opts = append(opts, config.EnableDistributedRequireAuth)
 	}
-	if r.DistributedSharedModels {
-		opts = append(opts, config.EnableDistributedSharedModels)
-	}
 	if r.NatsAccountSeed != "" {
 		opts = append(opts, config.WithNatsAccountSeed(r.NatsAccountSeed))
 	}
--- a/core/config/distributed_config.go
+++ b/core/config/distributed_config.go
@@ -31,14 +31,6 @@ type DistributedConfig struct {
 	// available to enforce just one layer.
 	RequireAuth      bool // LOCALAI_DISTRIBUTED_REQUIRE_AUTH
 	AutoApproveNodes bool // --auto-approve-nodes / LOCALAI_AUTO_APPROVE_NODES (skip admin approval for new workers)
-	// SharedModels asserts that every node (frontend and workers) mounts the
-	// SAME models directory at the SAME path (e.g. a shared volume, as in
-	// docker-compose.distributed.yaml). When true, the router skips staging
-	// model files to workers entirely: the frontend's absolute model paths are
-	// already valid on the worker, so re-uploading them into a per-model
-	// subdirectory only re-downloads what is already present (#10556). Default
-	// false preserves the historical per-node staging behavior.
-	SharedModels bool // --distributed-shared-models / LOCALAI_DISTRIBUTED_SHARED_MODELS

 	// NATS JWT auth (optional; see pkg/natsauth and docs/features/distributed-mode.md)
 	NatsAccountSeed  string        // LOCALAI_NATS_ACCOUNT_SEED — account signing seed to mint per-node worker JWTs
@@ -290,13 +282,6 @@ var EnableAutoApproveNodes = func(o *ApplicationConfig) {
 	o.Distributed.AutoApproveNodes = true
 }

-// EnableDistributedSharedModels marks the cluster as sharing one models
-// directory across all nodes, so the router skips staging model files to
-// workers (see DistributedConfig.SharedModels).
-var EnableDistributedSharedModels = func(o *ApplicationConfig) {
-	o.Distributed.SharedModels = true
-}
-
 // DisablePrefixCache turns off prefix-cache-aware routing (falls back to
 // round-robin). Prefix-cache routing is enabled by default in distributed mode.
 var DisablePrefixCache = func(o *ApplicationConfig) {
--- a/core/gallery/importers/llama-cpp.go
+++ b/core/gallery/importers/llama-cpp.go
@@ -25,8 +25,8 @@ var (

 type LlamaCPPImporter struct{}

-func (i *LlamaCPPImporter) Name() string     { return "llama-cpp" }
-func (i *LlamaCPPImporter) Modality() string { return "text" }
+func (i *LlamaCPPImporter) Name() string      { return "llama-cpp" }
+func (i *LlamaCPPImporter) Modality() string  { return "text" }
 func (i *LlamaCPPImporter) AutoDetects() bool { return true }

 // AdditionalBackends advertises drop-in replacements that share the
@@ -293,7 +293,7 @@ func pickPreferredGroup(groups []hfapi.ShardGroup, prefs []string) *hfapi.ShardG
 	for _, pref := range prefs {
 		lower := strings.ToLower(pref)
 		for i := range groups {
-			if strings.Contains(strings.ToLower(groups[i].Base), lower) {
+			if quantTokenMatches(strings.ToLower(groups[i].Base), lower) {
 				return &groups[i]
 			}
 		}
@@ -301,6 +301,39 @@ func pickPreferredGroup(groups []hfapi.ShardGroup, prefs []string) *hfapi.ShardG
 	return &groups[len(groups)-1]
 }

+// quantTokenMatches reports whether pref appears in base as a whole token
+// rather than as a substring of a larger alphanumeric run. Both arguments
+// must already be lowercased.
+//
+// A plain strings.Contains is wrong here: `f16` is a substring of `bf16`, so
+// asking for the `F16` quant used to wrongly select a `BF16` file (#10559).
+// Only the OUTER edges of the matched preference must hit a boundary — a
+// non-alphanumeric char (or the start/end of base). Separators inside the
+// preference itself (e.g. `ud-q4_k_xl`) are intentionally left untouched.
+func quantTokenMatches(base, pref string) bool {
+	if pref == "" {
+		return false
+	}
+	for start := strings.Index(base, pref); start != -1; {
+		end := start + len(pref)
+		leftOK := start == 0 || !isAlphaNum(base[start-1])
+		rightOK := end == len(base) || !isAlphaNum(base[end])
+		if leftOK && rightOK {
+			return true
+		}
+		next := strings.Index(base[start+1:], pref)
+		if next == -1 {
+			break
+		}
+		start += next + 1
+	}
+	return false
+}
+
+func isAlphaNum(b byte) bool {
+	return (b >= 'a' && b <= 'z') || (b >= '0' && b <= '9')
+}
+
 // maybeApplyMTPDefaults parses the picked GGUF header (range-fetched over
 // HTTP for HF/URL imports) and, if the file declares a Multi-Token Prediction
 // head, appends the auto-MTP option keys to modelConfig.Options. Failures
--- a/core/gallery/importers/llama-cpp_test.go
+++ b/core/gallery/importers/llama-cpp_test.go
@@ -374,6 +374,104 @@ var _ = Describe("LlamaCPPImporter", func() {
 		})
 	})

+	Context("quant token boundary matching", func() {
+		// Regression for #10559: the quant preference must match as a whole
+		// token, not as a substring. Asking for `F16` used to select a
+		// `BF16` mmproj because strings.Contains("...bf16.gguf", "f16") is
+		// true — the leading `b` was ignored.
+
+		const repoBase = "https://huggingface.co/acme/example-GGUF/resolve/main/"
+
+		hfFile := func(path, sha string) hfapi.ModelFile {
+			return hfapi.ModelFile{
+				Path:   path,
+				SHA256: sha,
+				URL:    repoBase + path,
+			}
+		}
+
+		withHF := func(preferences string, files ...hfapi.ModelFile) Details {
+			d := Details{
+				URI: "https://huggingface.co/acme/example-GGUF",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID: "acme/example-GGUF",
+					Files:   files,
+				},
+			}
+			if preferences != "" {
+				d.Preferences = json.RawMessage(preferences)
+			}
+			return d
+		}
+
+		It("selects the F16 mmproj over BF16 (BF16 listed first)", func() {
+			details := withHF(`{"name":"VL","mmproj_quantizations":"F16"}`,
+				hfFile("model-Q4_K_M.gguf", "model"),
+				hfFile("mmproj-x-BF16.gguf", "bf16"),
+				hfFile("mmproj-x-F16.gguf", "f16"),
+			)
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/VL/mmproj-x-F16.gguf"), fmt.Sprintf("%+v", modelConfig))
+			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("BF16"), fmt.Sprintf("%+v", modelConfig))
+		})
+
+		It("selects the F16 mmproj over BF16 (F16 listed first)", func() {
+			details := withHF(`{"name":"VL","mmproj_quantizations":"F16"}`,
+				hfFile("model-Q4_K_M.gguf", "model"),
+				hfFile("mmproj-x-F16.gguf", "f16"),
+				hfFile("mmproj-x-BF16.gguf", "bf16"),
+			)
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/VL/mmproj-x-F16.gguf"), fmt.Sprintf("%+v", modelConfig))
+			Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("BF16"), fmt.Sprintf("%+v", modelConfig))
+		})
+
+		It("selects BF16 when BF16 is the requested mmproj quant", func() {
+			details := withHF(`{"name":"VL","mmproj_quantizations":"BF16"}`,
+				hfFile("model-Q4_K_M.gguf", "model"),
+				hfFile("mmproj-x-F16.gguf", "f16"),
+				hfFile("mmproj-x-BF16.gguf", "bf16"),
+			)
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("mmproj: llama-cpp/mmproj/VL/mmproj-x-BF16.gguf"), fmt.Sprintf("%+v", modelConfig))
+		})
+
+		It("still matches a normal model quant with internal separators", func() {
+			// ud-q4_k_xl contains `-`/`_` internally; only the outer edges
+			// must hit a token boundary.
+			details := withHF(`{"name":"M","quantizations":"ud-q4_k_xl"}`,
+				hfFile("model-UD-Q4_K_XL.gguf", "xl"),
+				hfFile("model-Q3_K_M.gguf", "q3"),
+			)
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/M/model-UD-Q4_K_XL.gguf"), fmt.Sprintf("%+v", modelConfig))
+		})
+
+		It("falls back to the last group when no preference matches", func() {
+			details := withHF(`{"name":"M","quantizations":"Q2_K"}`,
+				hfFile("model-Q8_0.gguf", "q8"),
+				hfFile("model-Q3_K_M.gguf", "q3"),
+			)
+
+			modelConfig, err := importer.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("model: llama-cpp/models/M/model-Q3_K_M.gguf"), fmt.Sprintf("%+v", modelConfig))
+		})
+	})
+
 	Context("AdditionalBackends", func() {
 		It("advertises ik-llama-cpp and turboquant as drop-in replacements", func() {
 			entries := importer.AdditionalBackends()
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -63,11 +63,6 @@ type SmartRouterOptions struct {
 	// The reconciler reads the same instance to autoscale a saturated cache-warm
 	// replica. nil disables recording (the disabled path stays a no-op).
 	Pressure *prefixcache.Pressure
-	// SharedModels asserts that every node mounts the same models directory at
-	// the same path. When true, stageModelFiles skips all uploading and leaves
-	// the absolute model paths untouched so the worker loads them directly from
-	// the shared volume (#10556). See config.DistributedConfig.SharedModels.
-	SharedModels bool
 }

 // SmartRouter routes inference requests to the best available backend node.
@@ -98,9 +93,6 @@ type SmartRouter struct {
 	// per-request routing doesn't stall behind a busy backend's serialized
 	// HealthCheck/Predict. See probe_cache.go for the rationale.
 	probeCache *probeCache
-	// sharedModels skips file staging when all nodes mount the same models
-	// directory at the same path (see SmartRouterOptions.SharedModels).
-	sharedModels bool
 }

 // probeCacheTTL is how long a successful gRPC HealthCheck on a backend is
@@ -130,7 +122,6 @@ func NewSmartRouter(registry ModelRouter, opts SmartRouterOptions) *SmartRouter
 		prefixProvider:   opts.PrefixProvider,
 		prefixConfig:     opts.PrefixConfig,
 		pressure:         opts.Pressure,
-		sharedModels:     opts.SharedModels,
 	}
 }

@@ -956,19 +947,6 @@ func (r *SmartRouter) buildClientForAddr(node *BackendNode, addr string, paralle
 // simply remove the {ModelsPath}/{trackingKey}/ directory.
 func (r *SmartRouter) stageModelFiles(ctx context.Context, node *BackendNode, opts *pb.ModelOptions, trackingKey string) (*pb.ModelOptions, error) {
 	opts = proto.Clone(opts).(*pb.ModelOptions)
-
-	// Shared-models mode: every node mounts the same models directory at the
-	// same path, so the frontend's absolute model paths are already valid on the
-	// worker. Staging would only re-upload files that already exist on the shared
-	// volume (under a tracking-key subdir the probe never reuses), re-downloading
-	// the model on every load (#10556). Return the clone untouched: no upload, no
-	// path rewrite, no staging tracker.
-	if r.sharedModels {
-		xlog.Info("Skipping model file staging: shared-models mode is on (LOCALAI_DISTRIBUTED_SHARED_MODELS); worker loads directly from the shared volume",
-			"node", node.Name, "modelFile", opts.ModelFile, "trackingKey", trackingKey)
-		return opts, nil
-	}
-
 	xlog.Info("Staging model files for remote node", "node", node.Name, "modelFile", opts.ModelFile, "trackingKey", trackingKey)

 	// Derive the frontend models directory from ModelFile and Model.
--- a/core/services/nodes/router_sharedmodels_test.go
+++ b/core/services/nodes/router_sharedmodels_test.go
@@ -1,85 +0,0 @@
-package nodes
-
-import (
-	"context"
-	"os"
-	"path/filepath"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-// These tests cover shared-models mode (LOCALAI_DISTRIBUTED_SHARED_MODELS): when
-// every node mounts the same models directory at the same path, the router must
-// NOT stage model files to workers. The canonical absolute path is already valid
-// on the worker, so staging would only re-download what is already present
-// (#10556).
-var _ = Describe("stageModelFiles shared-models mode", func() {
-	var (
-		stager  *fakeFileStager
-		node    *BackendNode
-		tmp     string
-		gguf    string
-		modelID = "ornith-1.0-35b"
-	)
-
-	BeforeEach(func() {
-		stager = &fakeFileStager{}
-		node = &BackendNode{ID: "node-1", Name: "node-1", Address: "10.0.0.1:50051"}
-		tmp = GinkgoT().TempDir()
-
-		modelDir := filepath.Join(tmp, "models", "llama-cpp", "models")
-		Expect(os.MkdirAll(modelDir, 0o755)).To(Succeed())
-		gguf = filepath.Join(modelDir, "ornith.gguf")
-		Expect(os.WriteFile(gguf, []byte("weights"), 0o644)).To(Succeed())
-	})
-
-	It("does not stage and keeps the canonical absolute ModelFile when shared-models is enabled", func() {
-		router := &SmartRouter{
-			fileStager:     stager,
-			stagingTracker: NewStagingTracker(),
-			sharedModels:   true,
-		}
-
-		opts := &pb.ModelOptions{
-			Model:     "llama-cpp/models/ornith.gguf",
-			ModelFile: gguf,
-		}
-
-		staged, err := router.stageModelFiles(context.Background(), node, opts, modelID)
-		Expect(err).ToNot(HaveOccurred())
-
-		// The file stager must never be touched: no upload, no re-download.
-		Expect(stager.ensureCalls).To(BeEmpty())
-		// The worker loads directly from the shared volume, so the path is unchanged.
-		Expect(staged.ModelFile).To(Equal(gguf))
-	})
-
-	It("stages files (existing behavior) when shared-models is disabled", func() {
-		router := &SmartRouter{
-			fileStager:     stager,
-			stagingTracker: NewStagingTracker(),
-			sharedModels:   false,
-		}
-
-		opts := &pb.ModelOptions{
-			Model:     "llama-cpp/models/ornith.gguf",
-			ModelFile: gguf,
-		}
-
-		staged, err := router.stageModelFiles(context.Background(), node, opts, modelID)
-		Expect(err).ToNot(HaveOccurred())
-
-		// Default mode uploads the model file to the worker.
-		Expect(stager.ensureCalls).ToNot(BeEmpty())
-		stagedLocals := make([]string, 0, len(stager.ensureCalls))
-		for _, c := range stager.ensureCalls {
-			stagedLocals = append(stagedLocals, c.localPath)
-		}
-		Expect(stagedLocals).To(ContainElement(gguf))
-		// ModelFile is rewritten to the remote (tracking-key namespaced) path.
-		Expect(staged.ModelFile).ToNot(Equal(gguf))
-	})
-})
--- a/docker-compose.distributed.yaml
+++ b/docker-compose.distributed.yaml
@@ -57,11 +57,6 @@ services:
      LOCALAI_AGENT_POOL_VECTOR_ENGINE: "postgres"
      LOCALAI_AGENT_POOL_DATABASE_URL: "postgresql://localai:localai@postgres:5432/localai?sslmode=disable"
      LOCALAI_REGISTRATION_TOKEN: "changeme"  # Change this in production!
-      # Shared-models mode (optional): set when every node mounts the SAME
-      # models directory at the SAME path (see "Shared Volume Mode" below).
-      # The router then skips gRPC file staging and workers load models
-      # directly from the shared volume instead of re-downloading them.
-      # LOCALAI_DISTRIBUTED_SHARED_MODELS: "true"
      # Auth (required for distributed mode — must use PostgreSQL)
      LOCALAI_AUTH: "true"
      LOCALAI_AUTH_DATABASE_URL: "postgresql://localai:localai@postgres:5432/localai?sslmode=disable"
@@ -162,11 +157,8 @@ services:
  # Then add to the volumes section:
  #   shared_models:
  #
-  # With shared volumes the model files are already present on every worker at
-  # the same path. Set LOCALAI_DISTRIBUTED_SHARED_MODELS=true on the frontend
-  # (see its environment above) so the router skips gRPC file staging and the
-  # worker loads the model directly from the shared path instead of
-  # re-downloading it into a per-model subdirectory.
+  # With shared volumes, model files are already available on the backend —
+  # gRPC file staging becomes a no-op (paths match).

  # --- Adding More Workers ---
  # Copy the worker-1 service above and change:
--- a/docs/content/features/distributed-mode.md
+++ b/docs/content/features/distributed-mode.md
@@ -67,7 +67,6 @@ The frontend is a standard LocalAI instance with distributed mode enabled. These
 | `--registration-require-auth` | `LOCALAI_REGISTRATION_REQUIRE_AUTH` | `false` | Fail startup when distributed mode is enabled but the registration token is empty (node endpoints and worker file-transfer would otherwise be unauthenticated) |
 | `--distributed-require-auth` | `LOCALAI_DISTRIBUTED_REQUIRE_AUTH` | `false` | **Umbrella switch.** Implies both `--nats-require-auth` and `--registration-require-auth` — one knob to lock down the NATS bus *and* the registration/file-transfer layer. Set this in production instead of the two granular flags. |
 | `--auto-approve-nodes` | `LOCALAI_AUTO_APPROVE_NODES` | `false` | Auto-approve new worker nodes (skip admin approval) |
-| `--distributed-shared-models` | `LOCALAI_DISTRIBUTED_SHARED_MODELS` | `false` | Assert that every node mounts the **same** models directory at the **same** path (a shared volume). When `true`, the router skips file staging entirely and workers load models directly from the shared path instead of re-downloading them. See [Shared models directory](#shared-models-directory). |
 | `--auth` | `LOCALAI_AUTH` | `false` | **Must be `true`** for distributed mode |
 | `--auth-database-url` | `LOCALAI_AUTH_DATABASE_URL` | *(required)* | PostgreSQL connection URL |
 | `--backend-install-timeout` | `LOCALAI_NATS_BACKEND_INSTALL_TIMEOUT` | `15m` | How long the frontend waits for a worker to acknowledge a backend install before considering the request stalled. Raise it when workers pull large backend images over slow links. If a worker takes longer than this, the operation shows as "still installing in background" in the admin UI and clears once the worker finishes. |
@@ -134,14 +133,6 @@ When S3 is not configured, model files are transferred directly from the fronten

 For high-throughput or very large model files, S3 can be more efficient since it avoids streaming through the frontend.

-### Shared models directory
-
-If every node (frontend and workers) mounts the **same** models directory at the **same** path - for example a shared volume or network filesystem, as shown in the "Shared Volume Mode" section of `docker-compose.distributed.yaml` - the model files are already present on each worker at their canonical path. In that case staging is wasted work: it copies files that already exist into a per-model subdirectory the worker then loads from, which shows up as a re-download of a model you already have.
-
-Set `LOCALAI_DISTRIBUTED_SHARED_MODELS=true` (or `--distributed-shared-models`) on the frontend to skip staging entirely. The router then leaves the model's absolute paths untouched and the worker loads them directly from the shared volume.
-
-This flag is a contract you assert: all nodes must mount identical paths. Leave it off (the default) when workers have independent models directories - the frontend stages files to them over HTTP (or S3) as described above.
-
 {{% notice warning %}}
 The worker HTTP file transfer server is authenticated by `LOCALAI_REGISTRATION_TOKEN`. If the token is **empty**, the server **fails open** — anyone who can reach the port gets read/write access to the worker's models/staging/data directories (a remote model-poisoning / exfiltration vector). The worker logs a loud warning at startup in this case. Always set `LOCALAI_REGISTRATION_TOKEN` in distributed mode, and set `LOCALAI_DISTRIBUTED_REQUIRE_AUTH=true` (frontend **and** workers) to make a missing token *or* missing NATS credentials a hard startup error rather than a silent fail-open. Firewall the file-transfer port (gRPC base − 1) so only the frontend can reach it.
 {{% /notice %}}