mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-19 14:17:21 -04:00
Compare commits
8 Commits
issue-9414
...
distribute
| Author | SHA1 | Date | |
|---|---|---|---|
| | 44e7d9806b | | |
| | 7a9d89fa54 | | |
| | ee34a52c5d | | |
| | 92b9e22dc9 | | |
| | f0ab68e352 | | |
| | 9373de9f9b | | |
| | 1b3c951c85 | | |
| | 1f43762655 | | |
@@ -1,38 +0,0 @@
|
||||
From: LocalAI maintainers <noreply@localai.io>
|
||||
Subject: [PATCH] gemma3: default rms norm eps when GGUF metadata key is missing
|
||||
|
||||
Some Gemma 3 GGUF files (notably those distributed via the Ollama
|
||||
registry) do not embed the `gemma3.attention.layer_norm_rms_epsilon`
|
||||
metadata key. ik_llama.cpp currently requires the key to be present and
|
||||
fails the entire model load with:
|
||||
|
||||
error loading model hyperparameters:
|
||||
key not found in model: gemma3.attention.layer_norm_rms_epsilon
|
||||
|
||||
Ollama's own loader silently falls back to ~1e-6 in the same situation,
|
||||
which is the canonical Gemma 3 default (see google/gemma_pytorch
|
||||
config.py and the Hugging Face Gemma3Config), so the model still loads
|
||||
and works correctly.
|
||||
|
||||
Mirror that behavior here: pre-seed the field with the Gemma 3 default
|
||||
and mark the metadata key as optional. This unblocks Ollama-converted
|
||||
Gemma 3 models without affecting GGUFs that already carry the key.
|
||||
|
||||
Refs: ggml-org/llama.cpp#12367, ollama/ollama#10262, mudler/LocalAI#9414
|
||||
---
|
||||
src/llama-hparams.cpp | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
|
||||
--- a/src/llama-hparams.cpp
|
||||
+++ b/src/llama-hparams.cpp
|
||||
@@ -679,7 +679,8 @@
|
||||
hparams.rope_freq_scale_train_swa = 1.0f;
|
||||
|
||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
+ hparams.f_norm_rms_eps = 1e-6f; // Gemma 3 canonical default; some Ollama GGUFs omit the key
|
||||
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 26: model.type = e_model::MODEL_2B; break;
|
||||
@@ -1,38 +0,0 @@
|
||||
From: LocalAI maintainers <noreply@localai.io>
|
||||
Subject: [PATCH] gemma3: default rms norm eps when GGUF metadata key is missing
|
||||
|
||||
Some Gemma 3 GGUF files (notably those distributed via the Ollama
|
||||
registry) do not embed the `gemma3.attention.layer_norm_rms_epsilon`
|
||||
metadata key. llama.cpp currently requires the key to be present and
|
||||
fails the entire model load with:
|
||||
|
||||
error loading model hyperparameters:
|
||||
key not found in model: gemma3.attention.layer_norm_rms_epsilon
|
||||
|
||||
Ollama's own loader silently falls back to ~1e-6 in the same situation,
|
||||
which is the canonical Gemma 3 default (see google/gemma_pytorch
|
||||
config.py and the Hugging Face Gemma3Config), so the model still loads
|
||||
and works correctly.
|
||||
|
||||
Mirror that behavior here: pre-seed the field with the Gemma 3 default
|
||||
and mark the metadata key as optional. This unblocks Ollama-converted
|
||||
Gemma 3 models without affecting GGUFs that already carry the key.
|
||||
|
||||
Refs: ggml-org/llama.cpp#12367, ollama/ollama#10262, mudler/LocalAI#9414
|
||||
---
|
||||
src/llama-model.cpp | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
|
||||
--- a/src/llama-model.cpp
|
||||
+++ b/src/llama-model.cpp
|
||||
@@ -1568,7 +1568,8 @@
|
||||
|
||||
hparams.f_final_logit_softcapping = 0.0f;
|
||||
ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
|
||||
- ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
+ hparams.f_norm_rms_eps = 1e-6f; // Gemma 3 canonical default; some Ollama GGUFs omit the key
|
||||
+ ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 18: type = LLM_TYPE_270M; break;
|
||||
@@ -341,16 +341,6 @@ impl Backend for KokorosService {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
type AudioTranscriptionStreamStream =
|
||||
ReceiverStream<Result<backend::TranscriptStreamResponse, Status>>;
|
||||
|
||||
async fn audio_transcription_stream(
|
||||
&self,
|
||||
_: Request<backend::TranscriptRequest>,
|
||||
) -> Result<Response<Self::AudioTranscriptionStreamStream>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn sound_generation(
|
||||
&self,
|
||||
_: Request<backend::SoundGenerationRequest>,
|
||||
|
||||
@@ -106,6 +106,13 @@ func (d *DistributedBackendManager) enqueueAndDrainBackendOp(ctx context.Context
|
||||
if node.Status == StatusPending {
|
||||
continue
|
||||
}
|
||||
// Backend lifecycle ops only make sense on backend-type workers.
|
||||
// Agent workers don't subscribe to backend.install/delete/list, so
|
||||
// enqueueing for them guarantees a forever-retrying row that the
|
||||
// reconciler can never drain. Silently skip — they aren't consumers.
|
||||
if node.NodeType != "" && node.NodeType != NodeTypeBackend {
|
||||
continue
|
||||
}
|
||||
if err := d.registry.UpsertPendingBackendOp(ctx, node.ID, backend, op, galleriesJSON); err != nil {
|
||||
xlog.Warn("Failed to enqueue backend op", "op", op, "node", node.Name, "backend", backend, "error", err)
|
||||
result.Nodes = append(result.Nodes, NodeOpStatus{
|
||||
|
||||
@@ -3,12 +3,14 @@ package nodes
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/mudler/LocalAI/core/services/advisorylock"
|
||||
grpcclient "github.com/mudler/LocalAI/pkg/grpc"
|
||||
"github.com/mudler/xlog"
|
||||
"github.com/nats-io/nats.go"
|
||||
"gorm.io/gorm"
|
||||
)
|
||||
|
||||
@@ -206,12 +208,47 @@ func (rc *ReplicaReconciler) drainPendingBackendOps(ctx context.Context) {
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// ErrNoResponders means the node has no active NATS subscription for
|
||||
// this subject. Either its connection dropped, or it's the wrong
|
||||
// node type entirely. Mark unhealthy so the health monitor's
|
||||
// heartbeat-only pass doesn't immediately flip it back — and so
|
||||
// ListDuePendingBackendOps (which filters by status=healthy) stops
|
||||
// picking the row until the node genuinely recovers.
|
||||
if errors.Is(applyErr, nats.ErrNoResponders) {
|
||||
xlog.Warn("Reconciler: no NATS responders — marking node unhealthy",
|
||||
"op", op.Op, "backend", op.Backend, "node", op.NodeID)
|
||||
_ = rc.registry.MarkUnhealthy(ctx, op.NodeID)
|
||||
}
|
||||
|
||||
// Dead-letter cap: after maxAttempts the row is the reconciler
|
||||
// equivalent of a poison message. Delete it loudly so the queue
|
||||
// doesn't churn NATS every tick forever — operators can re-issue
|
||||
// the op from the UI if they still want it applied.
|
||||
if op.Attempts+1 >= maxPendingBackendOpAttempts {
|
||||
xlog.Error("Reconciler: abandoning pending backend op after max attempts",
|
||||
"op", op.Op, "backend", op.Backend, "node", op.NodeID,
|
||||
"attempts", op.Attempts+1, "last_error", applyErr)
|
||||
if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
|
||||
xlog.Warn("Reconciler: failed to delete abandoned op row", "id", op.ID, "error", err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
_ = rc.registry.RecordPendingBackendOpFailure(ctx, op.ID, applyErr.Error())
|
||||
xlog.Warn("Reconciler: pending backend op retry failed",
|
||||
"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1, "error", applyErr)
|
||||
}
|
||||
}
|
||||
|
||||
// maxPendingBackendOpAttempts caps how many times the reconciler retries a
|
||||
// failing row before dead-lettering it. Ten attempts at exponential backoff
|
||||
// (30s → 15m cap) is >1h of wall-clock patience — well past any transient
|
||||
// worker restart or network blip. Poisoned rows beyond that are almost
|
||||
// certainly structural (wrong node type, non-existent gallery entry) and no
|
||||
// amount of further retrying will help.
|
||||
const maxPendingBackendOpAttempts = 10
|
||||
|
||||
// probeLoadedModels gRPC-health-checks model addresses that the DB says are
|
||||
// loaded. If a model's backend process is gone (OOM, crash, manual restart)
|
||||
// we remove the row so ghosts don't linger. Only probes rows older than
|
||||
|
||||
@@ -373,4 +373,30 @@ var _ = Describe("ReplicaReconciler — state reconciliation", func() {
|
||||
Expect(row.NextRetryAt).To(BeTemporally(">", before))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("NewNodeRegistry malformed-row pruning", func() {
|
||||
It("drops queue rows for agent nodes and non-existent nodes on startup", func() {
|
||||
agent := &BackendNode{Name: "agent-1", NodeType: NodeTypeAgent, Address: "x"}
|
||||
Expect(registry.Register(context.Background(), agent, true)).To(Succeed())
|
||||
backend := &BackendNode{Name: "backend-1", NodeType: NodeTypeBackend, Address: "y"}
|
||||
Expect(registry.Register(context.Background(), backend, true)).To(Succeed())
|
||||
|
||||
// Three rows: one for a valid backend node (should survive),
|
||||
// one for an agent node (pruned), one for an empty backend name
|
||||
// on the valid node (pruned).
|
||||
Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "foo", OpBackendInstall, nil)).To(Succeed())
|
||||
Expect(registry.UpsertPendingBackendOp(context.Background(), agent.ID, "foo", OpBackendInstall, nil)).To(Succeed())
|
||||
Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "", OpBackendInstall, nil)).To(Succeed())
|
||||
|
||||
// Re-instantiating the registry runs the cleanup migration.
|
||||
_, err := NewNodeRegistry(db)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
var rows []PendingBackendOp
|
||||
Expect(db.Find(&rows).Error).To(Succeed())
|
||||
Expect(rows).To(HaveLen(1))
|
||||
Expect(rows[0].NodeID).To(Equal(backend.ID))
|
||||
Expect(rows[0].Backend).To(Equal("foo"))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
@@ -148,6 +148,30 @@ func NewNodeRegistry(db *gorm.DB) (*NodeRegistry, error) {
|
||||
}); err != nil {
|
||||
return nil, fmt.Errorf("migrating node tables: %w", err)
|
||||
}
|
||||
|
||||
// One-shot cleanup of queue rows that can never drain: ops targeted at
|
||||
// agent workers (wrong subscription set), at non-existent nodes, or with
|
||||
// an empty backend name. The guard in enqueueAndDrainBackendOp prevents
|
||||
// new ones from being written, but rows persisted by earlier versions
|
||||
// keep the reconciler busy retrying a permanently-failing NATS request
|
||||
// every 30s. Guarded by the same migration advisory lock so only one
|
||||
// frontend runs it.
|
||||
_ = advisorylock.WithLockCtx(context.Background(), db, advisorylock.KeySchemaMigrate, func() error {
|
||||
res := db.Exec(`
|
||||
DELETE FROM pending_backend_ops
|
||||
WHERE backend = ''
|
||||
OR node_id NOT IN (SELECT id FROM backend_nodes WHERE node_type = ? OR node_type = '')
|
||||
`, NodeTypeBackend)
|
||||
if res.Error != nil {
|
||||
xlog.Warn("Failed to prune malformed pending_backend_ops rows", "error", res.Error)
|
||||
return res.Error
|
||||
}
|
||||
if res.RowsAffected > 0 {
|
||||
xlog.Info("Pruned pending_backend_ops rows (wrong node type or empty backend)", "count", res.RowsAffected)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
return &NodeRegistry{db: db}, nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user