fix(llama-cpp): default rms_norm_eps for Gemma 3 GGUFs missing the key

Some Gemma 3 GGUF files distributed via the Ollama registry omit the `gemma3.attention.layer_norm_rms_epsilon` metadata key. Both llama.cpp and ik_llama.cpp treat that key as required and abort the load with: `error loading model hyperparameters: key not found in model: gemma3.attention.layer_norm_rms_epsilon`. Ollama's loader silently falls back to ~1e-6 in the same situation, which is the canonical Gemma 3 default (see google/gemma_pytorch config.py and the Hugging Face Gemma3Config), and the model loads correctly. Add small build-time patches to both backends that pre-seed `hparams.f_norm_rms_eps` with 1e-6 and mark the metadata lookup as optional. GGUFs that already carry the key continue to use the embedded value unchanged. Closes #9414.
feat(distributed): sync state with frontends, better backend management reporting (#9426)
2026-05-19 22:29:54 -04:00 · 2026-04-19 16:15:26 +00:00 · 2026-04-19 17:55:53 +02:00 · 2026-04-19 13:29:58 +02:00
7 changed files with 86 additions and 94 deletions
--- a/backend/cpp/ik-llama-cpp/patches/0002-gemma3-default-rms-norm-eps.patch
+++ b/backend/cpp/ik-llama-cpp/patches/0002-gemma3-default-rms-norm-eps.patch
@@ -0,0 +1,38 @@
+From: LocalAI maintainers <noreply@localai.io>
+Subject: [PATCH] gemma3: default rms norm eps when GGUF metadata key is missing
+
+Some Gemma 3 GGUF files (notably those distributed via the Ollama
+registry) do not embed the `gemma3.attention.layer_norm_rms_epsilon`
+metadata key. ik_llama.cpp currently requires the key to be present and
+fails the entire model load with:
+
+    error loading model hyperparameters:
+    key not found in model: gemma3.attention.layer_norm_rms_epsilon
+
+Ollama's own loader silently falls back to ~1e-6 in the same situation,
+which is the canonical Gemma 3 default (see google/gemma_pytorch
+config.py and the Hugging Face Gemma3Config), so the model still loads
+and works correctly.
+
+Mirror that behavior here: pre-seed the field with the Gemma 3 default
+and mark the metadata key as optional. This unblocks Ollama-converted
+Gemma 3 models without affecting GGUFs that already carry the key.
+
+Refs: ggml-org/llama.cpp#12367, ollama/ollama#10262, mudler/LocalAI#9414
+---
+ src/llama-hparams.cpp | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
+--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
+@@ -679,7 +679,8 @@
+                 hparams.rope_freq_scale_train_swa = 1.0f;
+
+                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
+-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
++                hparams.f_norm_rms_eps = 1e-6f; // Gemma 3 canonical default; some Ollama GGUFs omit the key
++                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+
+                 switch (hparams.n_layer) {
+                     case 26: model.type = e_model::MODEL_2B; break;
--- a/backend/cpp/llama-cpp/patches/0001-gemma3-default-rms-norm-eps.patch
+++ b/backend/cpp/llama-cpp/patches/0001-gemma3-default-rms-norm-eps.patch
@@ -0,0 +1,38 @@
+From: LocalAI maintainers <noreply@localai.io>
+Subject: [PATCH] gemma3: default rms norm eps when GGUF metadata key is missing
+
+Some Gemma 3 GGUF files (notably those distributed via the Ollama
+registry) do not embed the `gemma3.attention.layer_norm_rms_epsilon`
+metadata key. llama.cpp currently requires the key to be present and
+fails the entire model load with:
+
+    error loading model hyperparameters:
+    key not found in model: gemma3.attention.layer_norm_rms_epsilon
+
+Ollama's own loader silently falls back to ~1e-6 in the same situation,
+which is the canonical Gemma 3 default (see google/gemma_pytorch
+config.py and the Hugging Face Gemma3Config), so the model still loads
+and works correctly.
+
+Mirror that behavior here: pre-seed the field with the Gemma 3 default
+and mark the metadata key as optional. This unblocks Ollama-converted
+Gemma 3 models without affecting GGUFs that already carry the key.
+
+Refs: ggml-org/llama.cpp#12367, ollama/ollama#10262, mudler/LocalAI#9414
+---
+ src/llama-model.cpp | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/src/llama-model.cpp b/src/llama-model.cpp
+--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
+@@ -1568,7 +1568,8 @@
+
+                 hparams.f_final_logit_softcapping = 0.0f;
+                 ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
++                hparams.f_norm_rms_eps = 1e-6f; // Gemma 3 canonical default; some Ollama GGUFs omit the key
++                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
+
+                 switch (hparams.n_layer) {
+                     case 18: type = LLM_TYPE_270M; break;
--- a/backend/rust/kokoros/src/service.rs
+++ b/backend/rust/kokoros/src/service.rs
@@ -341,6 +341,16 @@ impl Backend for KokorosService {
        Err(Status::unimplemented("Not supported"))
    }

+    type AudioTranscriptionStreamStream =
+        ReceiverStream<Result<backend::TranscriptStreamResponse, Status>>;
+
+    async fn audio_transcription_stream(
+        &self,
+        _: Request<backend::TranscriptRequest>,
+    ) -> Result<Response<Self::AudioTranscriptionStreamStream>, Status> {
+        Err(Status::unimplemented("Not supported"))
+    }
+
    async fn sound_generation(
        &self,
        _: Request<backend::SoundGenerationRequest>,
--- a/core/services/nodes/managers_distributed.go
+++ b/core/services/nodes/managers_distributed.go
@@ -106,13 +106,6 @@ func (d *DistributedBackendManager) enqueueAndDrainBackendOp(ctx context.Context
 		if node.Status == StatusPending {
 			continue
 		}
-		// Backend lifecycle ops only make sense on backend-type workers.
-		// Agent workers don't subscribe to backend.install/delete/list, so
-		// enqueueing for them guarantees a forever-retrying row that the
-		// reconciler can never drain. Silently skip — they aren't consumers.
-		if node.NodeType != "" && node.NodeType != NodeTypeBackend {
-			continue
-		}
 		if err := d.registry.UpsertPendingBackendOp(ctx, node.ID, backend, op, galleriesJSON); err != nil {
 			xlog.Warn("Failed to enqueue backend op", "op", op, "node", node.Name, "backend", backend, "error", err)
 			result.Nodes = append(result.Nodes, NodeOpStatus{
--- a/core/services/nodes/reconciler.go
+++ b/core/services/nodes/reconciler.go
@@ -3,14 +3,12 @@ package nodes
 import (
 	"context"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"time"

 	"github.com/mudler/LocalAI/core/services/advisorylock"
 	grpcclient "github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/mudler/xlog"
-	"github.com/nats-io/nats.go"
 	"gorm.io/gorm"
 )

@@ -208,47 +206,12 @@ func (rc *ReplicaReconciler) drainPendingBackendOps(ctx context.Context) {
 			}
 			continue
 		}
-
-		// ErrNoResponders means the node has no active NATS subscription for
-		// this subject. Either its connection dropped, or it's the wrong
-		// node type entirely. Mark unhealthy so the health monitor's
-		// heartbeat-only pass doesn't immediately flip it back — and so
-		// ListDuePendingBackendOps (which filters by status=healthy) stops
-		// picking the row until the node genuinely recovers.
-		if errors.Is(applyErr, nats.ErrNoResponders) {
-			xlog.Warn("Reconciler: no NATS responders — marking node unhealthy",
-				"op", op.Op, "backend", op.Backend, "node", op.NodeID)
-			_ = rc.registry.MarkUnhealthy(ctx, op.NodeID)
-		}
-
-		// Dead-letter cap: after maxAttempts the row is the reconciler
-		// equivalent of a poison message. Delete it loudly so the queue
-		// doesn't churn NATS every tick forever — operators can re-issue
-		// the op from the UI if they still want it applied.
-		if op.Attempts+1 >= maxPendingBackendOpAttempts {
-			xlog.Error("Reconciler: abandoning pending backend op after max attempts",
-				"op", op.Op, "backend", op.Backend, "node", op.NodeID,
-				"attempts", op.Attempts+1, "last_error", applyErr)
-			if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
-				xlog.Warn("Reconciler: failed to delete abandoned op row", "id", op.ID, "error", err)
-			}
-			continue
-		}
-
 		_ = rc.registry.RecordPendingBackendOpFailure(ctx, op.ID, applyErr.Error())
 		xlog.Warn("Reconciler: pending backend op retry failed",
 			"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1, "error", applyErr)
 	}
 }

-// maxPendingBackendOpAttempts caps how many times the reconciler retries a
-// failing row before dead-lettering it. Ten attempts at exponential backoff
-// (30s → 15m cap) is >1h of wall-clock patience — well past any transient
-// worker restart or network blip. Poisoned rows beyond that are almost
-// certainly structural (wrong node type, non-existent gallery entry) and no
-// amount of further retrying will help.
-const maxPendingBackendOpAttempts = 10
-
 // probeLoadedModels gRPC-health-checks model addresses that the DB says are
 // loaded. If a model's backend process is gone (OOM, crash, manual restart)
 // we remove the row so ghosts don't linger. Only probes rows older than
--- a/core/services/nodes/reconciler_test.go
+++ b/core/services/nodes/reconciler_test.go
@@ -373,30 +373,4 @@ var _ = Describe("ReplicaReconciler — state reconciliation", func() {
 			Expect(row.NextRetryAt).To(BeTemporally(">", before))
 		})
 	})
-
-	Describe("NewNodeRegistry malformed-row pruning", func() {
-		It("drops queue rows for agent nodes and non-existent nodes on startup", func() {
-			agent := &BackendNode{Name: "agent-1", NodeType: NodeTypeAgent, Address: "x"}
-			Expect(registry.Register(context.Background(), agent, true)).To(Succeed())
-			backend := &BackendNode{Name: "backend-1", NodeType: NodeTypeBackend, Address: "y"}
-			Expect(registry.Register(context.Background(), backend, true)).To(Succeed())
-
-			// Three rows: one for a valid backend node (should survive),
-			// one for an agent node (pruned), one for an empty backend name
-			// on the valid node (pruned).
-			Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "foo", OpBackendInstall, nil)).To(Succeed())
-			Expect(registry.UpsertPendingBackendOp(context.Background(), agent.ID, "foo", OpBackendInstall, nil)).To(Succeed())
-			Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "", OpBackendInstall, nil)).To(Succeed())
-
-			// Re-instantiating the registry runs the cleanup migration.
-			_, err := NewNodeRegistry(db)
-			Expect(err).ToNot(HaveOccurred())
-
-			var rows []PendingBackendOp
-			Expect(db.Find(&rows).Error).To(Succeed())
-			Expect(rows).To(HaveLen(1))
-			Expect(rows[0].NodeID).To(Equal(backend.ID))
-			Expect(rows[0].Backend).To(Equal("foo"))
-		})
-	})
 })
--- a/core/services/nodes/registry.go
+++ b/core/services/nodes/registry.go
@@ -148,30 +148,6 @@ func NewNodeRegistry(db *gorm.DB) (*NodeRegistry, error) {
 	}); err != nil {
 		return nil, fmt.Errorf("migrating node tables: %w", err)
 	}
-
-	// One-shot cleanup of queue rows that can never drain: ops targeted at
-	// agent workers (wrong subscription set), at non-existent nodes, or with
-	// an empty backend name. The guard in enqueueAndDrainBackendOp prevents
-	// new ones from being written, but rows persisted by earlier versions
-	// keep the reconciler busy retrying a permanently-failing NATS request
-	// every 30s. Guarded by the same migration advisory lock so only one
-	// frontend runs it.
-	_ = advisorylock.WithLockCtx(context.Background(), db, advisorylock.KeySchemaMigrate, func() error {
-		res := db.Exec(`
-			DELETE FROM pending_backend_ops
-			WHERE backend = ''
-			   OR node_id NOT IN (SELECT id FROM backend_nodes WHERE node_type = ? OR node_type = '')
-		`, NodeTypeBackend)
-		if res.Error != nil {
-			xlog.Warn("Failed to prune malformed pending_backend_ops rows", "error", res.Error)
-			return res.Error
-		}
-		if res.RowsAffected > 0 {
-			xlog.Info("Pruned pending_backend_ops rows (wrong node type or empty backend)", "count", res.RowsAffected)
-		}
-		return nil
-	})
-
 	return &NodeRegistry{db: db}, nil
 }