mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-24 16:51:44 -04:00
Compare commits
8 Commits
docs/wan-g
...
distribute
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
44e7d9806b | ||
|
|
7a9d89fa54 | ||
|
|
ee34a52c5d | ||
|
|
92b9e22dc9 | ||
|
|
f0ab68e352 | ||
|
|
9373de9f9b | ||
|
|
1b3c951c85 | ||
|
|
1f43762655 |
@@ -1,7 +1,7 @@
|
|||||||
|
|
||||||
# Pinned to the HEAD of feature/turboquant-kv-cache on https://github.com/TheTom/llama-cpp-turboquant.
|
# Pinned to the HEAD of feature/turboquant-kv-cache on https://github.com/TheTom/llama-cpp-turboquant.
|
||||||
# Auto-bumped nightly by .github/workflows/bump_deps.yaml.
|
# Auto-bumped nightly by .github/workflows/bump_deps.yaml.
|
||||||
TURBOQUANT_VERSION?=627ebbc6e27727bd4f65422d8aa60b13404993c8
|
TURBOQUANT_VERSION?=45f8a066ed5f5bb38c695cec532f6cef9f4efa9d
|
||||||
LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
|
LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
|
||||||
|
|
||||||
CMAKE_ARGS?=
|
CMAKE_ARGS?=
|
||||||
|
|||||||
@@ -1,22 +1,13 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Patch the shared backend/cpp/llama-cpp/grpc-server.cpp *copy* used by the
|
# Augment the shared backend/cpp/llama-cpp/grpc-server.cpp allow-list of KV-cache
|
||||||
# turboquant build to account for two gaps between upstream and the fork:
|
# types so the gRPC `LoadModel` call accepts the TurboQuant-specific
|
||||||
|
# `turbo2` / `turbo3` / `turbo4` cache types.
|
||||||
#
|
#
|
||||||
# 1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the
|
# We do this on the *copy* sitting in turboquant-<flavor>-build/, never on the
|
||||||
# fork-specific `turbo2` / `turbo3` / `turbo4` cache types.
|
# original under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps
|
||||||
# 2. Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962,
|
# compiling against vanilla upstream which does not know about GGML_TYPE_TURBO*.
|
||||||
# server-side random per-instance marker) with the legacy "<__media__>"
|
|
||||||
# literal. The fork branched before that PR, so server-common.cpp has no
|
|
||||||
# get_media_marker symbol. The fork's mtmd_default_marker() still returns
|
|
||||||
# "<__media__>", and Go-side tooling falls back to that sentinel when the
|
|
||||||
# backend does not expose media_marker, so substituting the literal keeps
|
|
||||||
# behavior identical on the turboquant path.
|
|
||||||
#
|
#
|
||||||
# We patch the *copy* sitting in turboquant-<flavor>-build/, never the original
|
# Idempotent: skips the insertion if the marker is already present (so re-runs
|
||||||
# under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling
|
|
||||||
# against vanilla upstream.
|
|
||||||
#
|
|
||||||
# Idempotent: skips each insertion if its marker is already present (so re-runs
|
|
||||||
# of the same build dir don't double-insert).
|
# of the same build dir don't double-insert).
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
@@ -34,47 +25,33 @@ if [[ ! -f "$SRC" ]]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if grep -q 'GGML_TYPE_TURBO2_0' "$SRC"; then
|
if grep -q 'GGML_TYPE_TURBO2_0' "$SRC"; then
|
||||||
echo "==> $SRC already has TurboQuant cache types, skipping KV allow-list patch"
|
echo "==> $SRC already has TurboQuant cache types, skipping"
|
||||||
else
|
exit 0
|
||||||
echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types"
|
|
||||||
|
|
||||||
# Insert the three TURBO entries right after the first ` GGML_TYPE_Q5_1,`
|
|
||||||
# line (the kv_cache_types[] allow-list). Using awk because the builder image
|
|
||||||
# does not ship python3, and GNU sed's multi-line `a\` quoting is awkward.
|
|
||||||
awk '
|
|
||||||
/^ GGML_TYPE_Q5_1,$/ && !done {
|
|
||||||
print
|
|
||||||
print " // turboquant fork extras — added by patch-grpc-server.sh"
|
|
||||||
print " GGML_TYPE_TURBO2_0,"
|
|
||||||
print " GGML_TYPE_TURBO3_0,"
|
|
||||||
print " GGML_TYPE_TURBO4_0,"
|
|
||||||
done = 1
|
|
||||||
next
|
|
||||||
}
|
|
||||||
{ print }
|
|
||||||
END {
|
|
||||||
if (!done) {
|
|
||||||
print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr"
|
|
||||||
exit 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
' "$SRC" > "$SRC.tmp"
|
|
||||||
mv "$SRC.tmp" "$SRC"
|
|
||||||
|
|
||||||
echo "==> KV allow-list patch OK"
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if grep -q 'get_media_marker()' "$SRC"; then
|
echo "==> patching $SRC to allow turbo2/turbo3/turbo4 KV-cache types"
|
||||||
echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal"
|
|
||||||
# Only one call site today (ModelMetadata), but replace all occurrences to
|
|
||||||
# stay robust if upstream adds more. Use a temp file to avoid relying on
|
|
||||||
# sed -i portability (the builder image uses GNU sed, but keeping this
|
|
||||||
# consistent with the awk block above).
|
|
||||||
sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$SRC.tmp"
|
|
||||||
mv "$SRC.tmp" "$SRC"
|
|
||||||
echo "==> get_media_marker() substitution OK"
|
|
||||||
else
|
|
||||||
echo "==> $SRC has no get_media_marker() call, skipping media-marker patch"
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "==> all patches applied"
|
# Insert the three TURBO entries right after the first ` GGML_TYPE_Q5_1,`
|
||||||
|
# line (the kv_cache_types[] allow-list). Using awk because the builder image
|
||||||
|
# does not ship python3, and GNU sed's multi-line `a\` quoting is awkward.
|
||||||
|
awk '
|
||||||
|
/^ GGML_TYPE_Q5_1,$/ && !done {
|
||||||
|
print
|
||||||
|
print " // turboquant fork extras — added by patch-grpc-server.sh"
|
||||||
|
print " GGML_TYPE_TURBO2_0,"
|
||||||
|
print " GGML_TYPE_TURBO3_0,"
|
||||||
|
print " GGML_TYPE_TURBO4_0,"
|
||||||
|
done = 1
|
||||||
|
next
|
||||||
|
}
|
||||||
|
{ print }
|
||||||
|
END {
|
||||||
|
if (!done) {
|
||||||
|
print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
' "$SRC" > "$SRC.tmp"
|
||||||
|
mv "$SRC.tmp" "$SRC"
|
||||||
|
|
||||||
|
echo "==> patched OK"
|
||||||
|
|||||||
@@ -0,0 +1,83 @@
|
|||||||
|
From 660600081fb7b9b769ded5c805a2d39a419f0a0d Mon Sep 17 00:00:00 2001
|
||||||
|
From: Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
|
||||||
|
Date: Wed, 8 Apr 2026 11:12:15 -0400
|
||||||
|
Subject: [PATCH] server: respect the ignore eos flag (#21203)
|
||||||
|
|
||||||
|
---
|
||||||
|
tools/server/server-context.cpp | 3 +++
|
||||||
|
tools/server/server-context.h | 3 +++
|
||||||
|
tools/server/server-task.cpp | 3 ++-
|
||||||
|
tools/server/server-task.h | 1 +
|
||||||
|
4 files changed, 9 insertions(+), 1 deletion(-)
|
||||||
|
|
||||||
|
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
||||||
|
index 9d3ac538..b31981c5 100644
|
||||||
|
--- a/tools/server/server-context.cpp
|
||||||
|
+++ b/tools/server/server-context.cpp
|
||||||
|
@@ -3033,6 +3033,8 @@ server_context_meta server_context::get_meta() const {
|
||||||
|
/* fim_rep_token */ llama_vocab_fim_rep(impl->vocab),
|
||||||
|
/* fim_sep_token */ llama_vocab_fim_sep(impl->vocab),
|
||||||
|
|
||||||
|
+ /* logit_bias_eog */ impl->params_base.sampling.logit_bias_eog,
|
||||||
|
+
|
||||||
|
/* model_vocab_type */ llama_vocab_type(impl->vocab),
|
||||||
|
/* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab),
|
||||||
|
/* model_n_ctx_train */ llama_model_n_ctx_train(impl->model),
|
||||||
|
@@ -3117,6 +3119,7 @@ std::unique_ptr<server_res_generator> server_routes::handle_completions_impl(
|
||||||
|
ctx_server.vocab,
|
||||||
|
params,
|
||||||
|
meta->slot_n_ctx,
|
||||||
|
+ meta->logit_bias_eog,
|
||||||
|
data);
|
||||||
|
task.id_slot = json_value(data, "id_slot", -1);
|
||||||
|
|
||||||
|
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
|
||||||
|
index d7ce8735..6ea9afc0 100644
|
||||||
|
--- a/tools/server/server-context.h
|
||||||
|
+++ b/tools/server/server-context.h
|
||||||
|
@@ -39,6 +39,9 @@ struct server_context_meta {
|
||||||
|
llama_token fim_rep_token;
|
||||||
|
llama_token fim_sep_token;
|
||||||
|
|
||||||
|
+ // sampling
|
||||||
|
+ std::vector<llama_logit_bias> logit_bias_eog;
|
||||||
|
+
|
||||||
|
// model meta
|
||||||
|
enum llama_vocab_type model_vocab_type;
|
||||||
|
int32_t model_vocab_n_tokens;
|
||||||
|
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
|
||||||
|
index 4cc87bc5..856b3f0e 100644
|
||||||
|
--- a/tools/server/server-task.cpp
|
||||||
|
+++ b/tools/server/server-task.cpp
|
||||||
|
@@ -239,6 +239,7 @@ task_params server_task::params_from_json_cmpl(
|
||||||
|
const llama_vocab * vocab,
|
||||||
|
const common_params & params_base,
|
||||||
|
const int n_ctx_slot,
|
||||||
|
+ const std::vector<llama_logit_bias> & logit_bias_eog,
|
||||||
|
const json & data) {
|
||||||
|
task_params params;
|
||||||
|
|
||||||
|
@@ -562,7 +563,7 @@ task_params server_task::params_from_json_cmpl(
|
||||||
|
if (params.sampling.ignore_eos) {
|
||||||
|
params.sampling.logit_bias.insert(
|
||||||
|
params.sampling.logit_bias.end(),
|
||||||
|
- defaults.sampling.logit_bias_eog.begin(), defaults.sampling.logit_bias_eog.end());
|
||||||
|
+ logit_bias_eog.begin(), logit_bias_eog.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
|
||||||
|
index d855bf08..243e47a8 100644
|
||||||
|
--- a/tools/server/server-task.h
|
||||||
|
+++ b/tools/server/server-task.h
|
||||||
|
@@ -209,6 +209,7 @@ struct server_task {
|
||||||
|
const llama_vocab * vocab,
|
||||||
|
const common_params & params_base,
|
||||||
|
const int n_ctx_slot,
|
||||||
|
+ const std::vector<llama_logit_bias> & logit_bias_eog,
|
||||||
|
const json & data);
|
||||||
|
|
||||||
|
// utility function
|
||||||
|
--
|
||||||
|
2.43.0
|
||||||
|
|
||||||
@@ -1008,20 +1008,6 @@
|
|||||||
nvidia-cuda-12: "cuda12-turboquant-development"
|
nvidia-cuda-12: "cuda12-turboquant-development"
|
||||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-turboquant-development"
|
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-turboquant-development"
|
||||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-turboquant-development"
|
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-turboquant-development"
|
||||||
- !!merge <<: *stablediffusionggml
|
|
||||||
name: "stablediffusion-ggml-development"
|
|
||||||
capabilities:
|
|
||||||
default: "cpu-stablediffusion-ggml-development"
|
|
||||||
nvidia: "cuda12-stablediffusion-ggml-development"
|
|
||||||
intel: "intel-sycl-f16-stablediffusion-ggml-development"
|
|
||||||
# amd: "rocm-stablediffusion-ggml-development"
|
|
||||||
vulkan: "vulkan-stablediffusion-ggml-development"
|
|
||||||
nvidia-l4t: "nvidia-l4t-arm64-stablediffusion-ggml-development"
|
|
||||||
metal: "metal-stablediffusion-ggml-development"
|
|
||||||
nvidia-cuda-13: "cuda13-stablediffusion-ggml-development"
|
|
||||||
nvidia-cuda-12: "cuda12-stablediffusion-ggml-development"
|
|
||||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-stablediffusion-ggml-development"
|
|
||||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-stablediffusion-ggml-development"
|
|
||||||
- !!merge <<: *neutts
|
- !!merge <<: *neutts
|
||||||
name: "cpu-neutts"
|
name: "cpu-neutts"
|
||||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-neutts"
|
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-neutts"
|
||||||
|
|||||||
@@ -341,16 +341,6 @@ impl Backend for KokorosService {
|
|||||||
Err(Status::unimplemented("Not supported"))
|
Err(Status::unimplemented("Not supported"))
|
||||||
}
|
}
|
||||||
|
|
||||||
type AudioTranscriptionStreamStream =
|
|
||||||
ReceiverStream<Result<backend::TranscriptStreamResponse, Status>>;
|
|
||||||
|
|
||||||
async fn audio_transcription_stream(
|
|
||||||
&self,
|
|
||||||
_: Request<backend::TranscriptRequest>,
|
|
||||||
) -> Result<Response<Self::AudioTranscriptionStreamStream>, Status> {
|
|
||||||
Err(Status::unimplemented("Not supported"))
|
|
||||||
}
|
|
||||||
|
|
||||||
async fn sound_generation(
|
async fn sound_generation(
|
||||||
&self,
|
&self,
|
||||||
_: Request<backend::SoundGenerationRequest>,
|
_: Request<backend::SoundGenerationRequest>,
|
||||||
|
|||||||
@@ -106,6 +106,13 @@ func (d *DistributedBackendManager) enqueueAndDrainBackendOp(ctx context.Context
|
|||||||
if node.Status == StatusPending {
|
if node.Status == StatusPending {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
// Backend lifecycle ops only make sense on backend-type workers.
|
||||||
|
// Agent workers don't subscribe to backend.install/delete/list, so
|
||||||
|
// enqueueing for them guarantees a forever-retrying row that the
|
||||||
|
// reconciler can never drain. Silently skip — they aren't consumers.
|
||||||
|
if node.NodeType != "" && node.NodeType != NodeTypeBackend {
|
||||||
|
continue
|
||||||
|
}
|
||||||
if err := d.registry.UpsertPendingBackendOp(ctx, node.ID, backend, op, galleriesJSON); err != nil {
|
if err := d.registry.UpsertPendingBackendOp(ctx, node.ID, backend, op, galleriesJSON); err != nil {
|
||||||
xlog.Warn("Failed to enqueue backend op", "op", op, "node", node.Name, "backend", backend, "error", err)
|
xlog.Warn("Failed to enqueue backend op", "op", op, "node", node.Name, "backend", backend, "error", err)
|
||||||
result.Nodes = append(result.Nodes, NodeOpStatus{
|
result.Nodes = append(result.Nodes, NodeOpStatus{
|
||||||
|
|||||||
@@ -3,12 +3,14 @@ package nodes
|
|||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/mudler/LocalAI/core/services/advisorylock"
|
"github.com/mudler/LocalAI/core/services/advisorylock"
|
||||||
grpcclient "github.com/mudler/LocalAI/pkg/grpc"
|
grpcclient "github.com/mudler/LocalAI/pkg/grpc"
|
||||||
"github.com/mudler/xlog"
|
"github.com/mudler/xlog"
|
||||||
|
"github.com/nats-io/nats.go"
|
||||||
"gorm.io/gorm"
|
"gorm.io/gorm"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -206,12 +208,47 @@ func (rc *ReplicaReconciler) drainPendingBackendOps(ctx context.Context) {
|
|||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ErrNoResponders means the node has no active NATS subscription for
|
||||||
|
// this subject. Either its connection dropped, or it's the wrong
|
||||||
|
// node type entirely. Mark unhealthy so the health monitor's
|
||||||
|
// heartbeat-only pass doesn't immediately flip it back — and so
|
||||||
|
// ListDuePendingBackendOps (which filters by status=healthy) stops
|
||||||
|
// picking the row until the node genuinely recovers.
|
||||||
|
if errors.Is(applyErr, nats.ErrNoResponders) {
|
||||||
|
xlog.Warn("Reconciler: no NATS responders — marking node unhealthy",
|
||||||
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID)
|
||||||
|
_ = rc.registry.MarkUnhealthy(ctx, op.NodeID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dead-letter cap: after maxAttempts the row is the reconciler
|
||||||
|
// equivalent of a poison message. Delete it loudly so the queue
|
||||||
|
// doesn't churn NATS every tick forever — operators can re-issue
|
||||||
|
// the op from the UI if they still want it applied.
|
||||||
|
if op.Attempts+1 >= maxPendingBackendOpAttempts {
|
||||||
|
xlog.Error("Reconciler: abandoning pending backend op after max attempts",
|
||||||
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID,
|
||||||
|
"attempts", op.Attempts+1, "last_error", applyErr)
|
||||||
|
if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
|
||||||
|
xlog.Warn("Reconciler: failed to delete abandoned op row", "id", op.ID, "error", err)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
_ = rc.registry.RecordPendingBackendOpFailure(ctx, op.ID, applyErr.Error())
|
_ = rc.registry.RecordPendingBackendOpFailure(ctx, op.ID, applyErr.Error())
|
||||||
xlog.Warn("Reconciler: pending backend op retry failed",
|
xlog.Warn("Reconciler: pending backend op retry failed",
|
||||||
"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1, "error", applyErr)
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1, "error", applyErr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// maxPendingBackendOpAttempts caps how many times the reconciler retries a
|
||||||
|
// failing row before dead-lettering it. Ten attempts at exponential backoff
|
||||||
|
// (30s → 15m cap) is >1h of wall-clock patience — well past any transient
|
||||||
|
// worker restart or network blip. Poisoned rows beyond that are almost
|
||||||
|
// certainly structural (wrong node type, non-existent gallery entry) and no
|
||||||
|
// amount of further retrying will help.
|
||||||
|
const maxPendingBackendOpAttempts = 10
|
||||||
|
|
||||||
// probeLoadedModels gRPC-health-checks model addresses that the DB says are
|
// probeLoadedModels gRPC-health-checks model addresses that the DB says are
|
||||||
// loaded. If a model's backend process is gone (OOM, crash, manual restart)
|
// loaded. If a model's backend process is gone (OOM, crash, manual restart)
|
||||||
// we remove the row so ghosts don't linger. Only probes rows older than
|
// we remove the row so ghosts don't linger. Only probes rows older than
|
||||||
|
|||||||
@@ -373,4 +373,30 @@ var _ = Describe("ReplicaReconciler — state reconciliation", func() {
|
|||||||
Expect(row.NextRetryAt).To(BeTemporally(">", before))
|
Expect(row.NextRetryAt).To(BeTemporally(">", before))
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
Describe("NewNodeRegistry malformed-row pruning", func() {
|
||||||
|
It("drops queue rows for agent nodes and non-existent nodes on startup", func() {
|
||||||
|
agent := &BackendNode{Name: "agent-1", NodeType: NodeTypeAgent, Address: "x"}
|
||||||
|
Expect(registry.Register(context.Background(), agent, true)).To(Succeed())
|
||||||
|
backend := &BackendNode{Name: "backend-1", NodeType: NodeTypeBackend, Address: "y"}
|
||||||
|
Expect(registry.Register(context.Background(), backend, true)).To(Succeed())
|
||||||
|
|
||||||
|
// Three rows: one for a valid backend node (should survive),
|
||||||
|
// one for an agent node (pruned), one for an empty backend name
|
||||||
|
// on the valid node (pruned).
|
||||||
|
Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "foo", OpBackendInstall, nil)).To(Succeed())
|
||||||
|
Expect(registry.UpsertPendingBackendOp(context.Background(), agent.ID, "foo", OpBackendInstall, nil)).To(Succeed())
|
||||||
|
Expect(registry.UpsertPendingBackendOp(context.Background(), backend.ID, "", OpBackendInstall, nil)).To(Succeed())
|
||||||
|
|
||||||
|
// Re-instantiating the registry runs the cleanup migration.
|
||||||
|
_, err := NewNodeRegistry(db)
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
|
||||||
|
var rows []PendingBackendOp
|
||||||
|
Expect(db.Find(&rows).Error).To(Succeed())
|
||||||
|
Expect(rows).To(HaveLen(1))
|
||||||
|
Expect(rows[0].NodeID).To(Equal(backend.ID))
|
||||||
|
Expect(rows[0].Backend).To(Equal("foo"))
|
||||||
|
})
|
||||||
|
})
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -148,6 +148,30 @@ func NewNodeRegistry(db *gorm.DB) (*NodeRegistry, error) {
|
|||||||
}); err != nil {
|
}); err != nil {
|
||||||
return nil, fmt.Errorf("migrating node tables: %w", err)
|
return nil, fmt.Errorf("migrating node tables: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// One-shot cleanup of queue rows that can never drain: ops targeted at
|
||||||
|
// agent workers (wrong subscription set), at non-existent nodes, or with
|
||||||
|
// an empty backend name. The guard in enqueueAndDrainBackendOp prevents
|
||||||
|
// new ones from being written, but rows persisted by earlier versions
|
||||||
|
// keep the reconciler busy retrying a permanently-failing NATS request
|
||||||
|
// every 30s. Guarded by the same migration advisory lock so only one
|
||||||
|
// frontend runs it.
|
||||||
|
_ = advisorylock.WithLockCtx(context.Background(), db, advisorylock.KeySchemaMigrate, func() error {
|
||||||
|
res := db.Exec(`
|
||||||
|
DELETE FROM pending_backend_ops
|
||||||
|
WHERE backend = ''
|
||||||
|
OR node_id NOT IN (SELECT id FROM backend_nodes WHERE node_type = ? OR node_type = '')
|
||||||
|
`, NodeTypeBackend)
|
||||||
|
if res.Error != nil {
|
||||||
|
xlog.Warn("Failed to prune malformed pending_backend_ops rows", "error", res.Error)
|
||||||
|
return res.Error
|
||||||
|
}
|
||||||
|
if res.RowsAffected > 0 {
|
||||||
|
xlog.Info("Pruned pending_backend_ops rows (wrong node type or empty backend)", "count", res.RowsAffected)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
|
||||||
return &NodeRegistry{db: db}, nil
|
return &NodeRegistry{db: db}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -15186,10 +15186,10 @@
|
|||||||
- gpu
|
- gpu
|
||||||
overrides:
|
overrides:
|
||||||
parameters:
|
parameters:
|
||||||
model: wan2.1_t2v_1.3b-q8_0.gguf
|
model: wan2.1-t2v-1.3B-Q8_0.gguf
|
||||||
files:
|
files:
|
||||||
- filename: "wan2.1_t2v_1.3b-q8_0.gguf"
|
- filename: "wan2.1-t2v-1.3B-Q8_0.gguf"
|
||||||
uri: "huggingface://calcuis/wan-gguf/wan2.1_t2v_1.3b-q8_0.gguf"
|
uri: "huggingface://calcuis/wan-gguf/wan2.1-t2v-1.3B-Q8_0.gguf"
|
||||||
- filename: "wan_2.1_vae.safetensors"
|
- filename: "wan_2.1_vae.safetensors"
|
||||||
uri: "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors"
|
uri: "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors"
|
||||||
- filename: "umt5-xxl-encoder-Q8_0.gguf"
|
- filename: "umt5-xxl-encoder-Q8_0.gguf"
|
||||||
|
|||||||
@@ -9,6 +9,11 @@ config_file: |
|
|||||||
- "diffusion_model"
|
- "diffusion_model"
|
||||||
- "vae_decode_only:false"
|
- "vae_decode_only:false"
|
||||||
- "sampler:euler"
|
- "sampler:euler"
|
||||||
|
- "scheduler:discrete"
|
||||||
- "flow_shift:3.0"
|
- "flow_shift:3.0"
|
||||||
|
- "diffusion_flash_attn:true"
|
||||||
|
- "offload_params_to_cpu:true"
|
||||||
|
- "keep_vae_on_cpu:true"
|
||||||
|
- "keep_clip_on_cpu:true"
|
||||||
- "t5xxl_path:umt5-xxl-encoder-Q8_0.gguf"
|
- "t5xxl_path:umt5-xxl-encoder-Q8_0.gguf"
|
||||||
- "vae_path:wan_2.1_vae.safetensors"
|
- "vae_path:wan_2.1_vae.safetensors"
|
||||||
|
|||||||
Reference in New Issue
Block a user