mirror of
https://github.com/mudler/LocalAI.git
synced 2026-07-03 04:46:54 -04:00
* feat(messaging): add backend.upgrade NATS subject + payload types
Splits the slow force-reinstall path off backend.install so it can run on
its own subscription goroutine, eliminating head-of-line blocking between
routine model loads and full gallery upgrades.
Wire-level Force flag on BackendInstallRequest is kept for one release as
the rolling-update fallback target; doc note marks it deprecated.
Assisted-by: Claude:claude-sonnet-4-6
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* feat(distributed/worker): add per-backend mutex helper to backendSupervisor
Different backend names lock independently; same backend serializes. This
is the synchronization primitive used by the upcoming concurrent install
handler — without it, wrapping the NATS callback in a goroutine would
race the gallery directory when two requests target the same backend.
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(distributed/worker): run backend.install handler in a goroutine
NATS subscriptions deliver messages serially on a single per-subscription
goroutine. With a synchronous install handler, a multi-minute gallery
download would head-of-line-block every other install request to the
same worker — manifesting upstream as a 5-minute "nats: timeout" on
unrelated routine model loads.
The body now runs in its own goroutine, with a per-backend mutex
(lockBackend) protecting the gallery directory from concurrent operations
on the same backend. Different backend names install in parallel.
Backward-compat: req.Force=true is still honored here, so an older master
that hasn't been updated to send on backend.upgrade keeps working.
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* feat(distributed/worker): subscribe to backend.upgrade as a separate path
Slow force-reinstall now lives on its own NATS subscription, so a
multi-minute gallery pull cannot head-of-line-block the routine
backend.install handler on the same worker. Same per-backend mutex
guards both — concurrent install + upgrade for the same backend
serialize at the gallery directory; different backends are independent.
upgradeBackend stops every live process for the backend, force-installs
from gallery, and re-registers. It does not start a new process — the
next backend.install will spawn one with the freshly-pulled binary.
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* feat(distributed): add UpgradeBackend on NodeCommandSender; drop Force from InstallBackend
Master now sends to backend.upgrade for force-reinstall, with a
nats.ErrNoResponders fallback to the legacy backend.install Force=true
path so a rolling update with a new master + an old worker still
converges. The Force parameter leaves the public Go API surface
entirely — only the internal fallback sets it on the wire.
InstallBackend timeout drops 5min -> 3min (most replies are sub-second
since the worker short-circuits on already-running or already-installed).
UpgradeBackend timeout is 15min, sized for real-world Jetson-on-WiFi
gallery pulls.
Updates the admin install HTTP endpoint
(core/http/endpoints/localai/nodes.go) to the new signature too.
router_test.go's fakeUnloader does not yet implement the new interface
shape; Task 3.2 will catch it up before the next package-level test run.
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* test(distributed): update fakeUnloader for new NodeCommandSender shape
InstallBackend lost its force bool param (Force is not part of the public
Go API anymore — only the internal upgrade-fallback path sets it on the
wire). UpgradeBackend gained a method. Fake records both call slices and
provides an installHook concurrency seam for upcoming singleflight tests.
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* test(distributed): cover UpgradeBackend's new subject + rolling-update fallback
Task 3.1 changed the master to publish UpgradeBackend on the new
backend.upgrade subject; the existing UpgradeBackend tests scripted the
old install subject and so all 3 began failing as expected. Updates them
to script SubjectNodeBackendUpgrade with BackendUpgradeReply.
Adds two new specs for the rolling-update fallback:
- ErrNoResponders on backend.upgrade triggers a backend.install
Force=true retry on the same node.
- Non-NoResponders errors propagate to the caller unchanged.
scriptedMessagingClient gains scriptNoResponders (real nats sentinel) and
scriptReplyMatching (predicate-matched canned reply, used to assert that
the fallback path actually sets Force=true on the install retry).
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* fix(distributed): coalesce concurrent identical backend.install via singleflight
Six simultaneous chat completions for the same not-yet-loaded model were
observed firing six independent NATS install requests, each serializing
through the worker's per-subscription goroutine and amplifying queue
depth. SmartRouter now wraps the NATS round-trip in a singleflight.Group
keyed by (nodeID, backend, modelID, replica): N concurrent identical
loads share one round-trip and one reply.
Distinct (modelID, replica) keys still fire independent calls, so
multi-replica scaling and multi-model fan-out are unaffected.
fakeUnloader gains a sync.Mutex around its recording slices to keep
concurrent test goroutines race-clean.
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* test(e2e/distributed): drop force arg from InstallBackend test calls
Two e2e test call sites still passed the trailing force bool that was
removed from RemoteUnloaderAdapter.InstallBackend in 9bde76d7. Caught
by golangci-lint typecheck on the upgrade-split branch (master CI was
already green because these tests don't run in the standard test path).
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* refactor(distributed): extract worker business logic to core/services/worker
core/cli/worker.go grew to 1212 lines after the backend.upgrade split.
The CLI package was carrying backendSupervisor, NATS lifecycle handlers,
gallery install/upgrade orchestration, S3 file staging, and registration
helpers — all distributed-worker business logic that doesn't belong in
the cobra surface.
Move it to a new core/services/worker package, mirroring the existing
core/services/{nodes,messaging,galleryop} pattern. core/cli/worker.go
shrinks to ~19 lines: a kong-tagged shim that embeds worker.Config and
delegates Run.
No behavior change. All symbols stay unexported except Config and Run.
The three worker-specific tests (addr/replica/concurrency) move with
the code via git mv so history follows them.
Files split as:
worker.go - Run entry point
config.go - Config struct (kong tags retained, kong not imported)
supervisor.go - backendProcess, backendSupervisor, process lifecycle
install.go - installBackend, upgradeBackend, findBackend, lockBackend
lifecycle.go - subscribeLifecycleEvents (verbatim, decomposition is
a follow-up commit)
file_staging.go - subscribeFileStaging, isPathAllowed
registration.go - advertiseAddr, registrationBody, heartbeatBody, etc.
reply.go - replyJSON
process_helpers.go - readLastLinesFromFile
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
* refactor(distributed/worker): decompose subscribeLifecycleEvents into per-event handlers
The 226-line subscribeLifecycleEvents method packed eight NATS subscriptions
inline. Each grew context-shaped doc comments mixed with subscription
plumbing, making it hard to read any one handler without scrolling past the
others. Extract each handler into its own method on *backendSupervisor; the
subscriber becomes a thin 8-line dispatcher.
No behavior change: each method body is byte-equivalent to its corresponding
inline goroutine + handler. Doc comments that were attached to the inline
SubscribeReply calls migrate to the new method godocs.
Adding the next NATS subject is now a 2-line patch to the dispatcher plus
one new method, instead of grafting onto a monolith.
Assisted-by: Claude:claude-opus-4-7
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---------
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
559 lines
22 KiB
Go
559 lines
22 KiB
Go
package nodes
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/mudler/LocalAI/core/services/advisorylock"
|
|
grpcclient "github.com/mudler/LocalAI/pkg/grpc"
|
|
"github.com/mudler/xlog"
|
|
"github.com/nats-io/nats.go"
|
|
"gorm.io/gorm"
|
|
)
|
|
|
|
// ModelProber reports whether the backend process serving a model can still
// be reached. The production implementation is a gRPC health probe; the
// interface exists so tests can swap in a fake instead of running a server.
//
// The result carries no error: a probe that reaches an unhealthy process and
// a probe that times out both come back as false.
type ModelProber interface {
	// IsAlive reports whether the backend at address answered the probe.
	IsAlive(ctx context.Context, address string) bool
}
|
|
|
|
// grpcModelProber is the default ModelProber. It health-checks a model's
// stored gRPC address, authenticating with token.
type grpcModelProber struct {
	token string
}
|
|
|
|
func (g grpcModelProber) IsAlive(ctx context.Context, address string) bool {
|
|
client := grpcclient.NewClientWithToken(address, false, nil, false, g.token)
|
|
probeCtx, cancel := context.WithTimeout(ctx, 1*time.Second)
|
|
defer cancel()
|
|
ok, _ := client.HealthCheck(probeCtx)
|
|
return ok
|
|
}
|
|
|
|
// ReplicaReconciler periodically ensures model replica counts match their
|
|
// scheduling configs. It scales up replicas when below MinReplicas or when
|
|
// all replicas are busy (up to MaxReplicas), and scales down idle replicas
|
|
// above MinReplicas.
|
|
//
|
|
// Alongside replica scaling it runs two state-reconciliation passes — draining
|
|
// the pending_backend_ops queue and probing loaded models' gRPC addresses to
|
|
// orphan ghosts. Both passes are wrapped in the KeyStateReconciler advisory
|
|
// lock so N frontends don't stampede.
|
|
//
|
|
// Only processes models with auto-scaling enabled (MinReplicas > 0 or MaxReplicas > 0).
|
|
type ReplicaReconciler struct {
|
|
registry *NodeRegistry
|
|
scheduler ModelScheduler // interface for scheduling new models
|
|
unloader NodeCommandSender
|
|
adapter *RemoteUnloaderAdapter // NATS sender for pending-op drain
|
|
prober ModelProber // health probe for model gRPC addrs
|
|
db *gorm.DB
|
|
interval time.Duration
|
|
scaleDownDelay time.Duration
|
|
// probeStaleAfter: only probe node_models rows older than this so we
|
|
// don't hammer every worker every tick for models we just heard from.
|
|
probeStaleAfter time.Duration
|
|
}
|
|
|
|
// ModelScheduler abstracts the scheduling logic needed by the reconciler.
|
|
// SmartRouter implements this interface.
|
|
type ModelScheduler interface {
|
|
// ScheduleAndLoadModel picks a node (optionally from candidateNodeIDs),
|
|
// installs the backend, and loads the model. Returns the node used.
|
|
ScheduleAndLoadModel(ctx context.Context, modelName string, candidateNodeIDs []string) (*BackendNode, error)
|
|
}
|
|
|
|
// ReplicaReconcilerOptions holds configuration for creating a ReplicaReconciler.
|
|
type ReplicaReconcilerOptions struct {
|
|
Registry *NodeRegistry
|
|
Scheduler ModelScheduler
|
|
Unloader NodeCommandSender
|
|
// Adapter is the NATS sender used to retry pending backend ops. When nil,
|
|
// the state-reconciler pending-drain pass is a no-op (single-node mode).
|
|
Adapter *RemoteUnloaderAdapter
|
|
// RegistrationToken is used by the default gRPC prober when probing model
|
|
// addresses. Matches the worker's token so HealthCheck auth succeeds.
|
|
RegistrationToken string
|
|
// Prober overrides the default gRPC health probe (used by tests).
|
|
Prober ModelProber
|
|
DB *gorm.DB
|
|
Interval time.Duration // default 30s
|
|
ScaleDownDelay time.Duration // default 5m
|
|
ProbeStaleAfter time.Duration // default 2m
|
|
}
|
|
|
|
// NewReplicaReconciler creates a new ReplicaReconciler.
|
|
func NewReplicaReconciler(opts ReplicaReconcilerOptions) *ReplicaReconciler {
|
|
interval := opts.Interval
|
|
if interval == 0 {
|
|
interval = 30 * time.Second
|
|
}
|
|
scaleDownDelay := opts.ScaleDownDelay
|
|
if scaleDownDelay == 0 {
|
|
scaleDownDelay = 5 * time.Minute
|
|
}
|
|
probeStaleAfter := opts.ProbeStaleAfter
|
|
if probeStaleAfter == 0 {
|
|
probeStaleAfter = 2 * time.Minute
|
|
}
|
|
prober := opts.Prober
|
|
if prober == nil {
|
|
prober = grpcModelProber{token: opts.RegistrationToken}
|
|
}
|
|
return &ReplicaReconciler{
|
|
registry: opts.Registry,
|
|
scheduler: opts.Scheduler,
|
|
unloader: opts.Unloader,
|
|
adapter: opts.Adapter,
|
|
prober: prober,
|
|
db: opts.DB,
|
|
interval: interval,
|
|
scaleDownDelay: scaleDownDelay,
|
|
probeStaleAfter: probeStaleAfter,
|
|
}
|
|
}
|
|
|
|
// Run starts the reconciliation loop. It blocks until ctx is cancelled.
|
|
func (rc *ReplicaReconciler) Run(ctx context.Context) {
|
|
ticker := time.NewTicker(rc.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
rc.reconcileOnce(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
// reconcileOnce performs a single reconciliation pass. Replica work and
|
|
// state-reconciliation work run under *different* advisory locks so multiple
|
|
// frontends can share load across passes, and one long-running pass doesn't
|
|
// block the other forever if a frontend wedges.
|
|
func (rc *ReplicaReconciler) reconcileOnce(ctx context.Context) {
|
|
if rc.db != nil {
|
|
replicaKey := advisorylock.KeyFromString("replica-reconciler")
|
|
_ = advisorylock.WithLockCtx(ctx, rc.db, replicaKey, func() error {
|
|
rc.reconcile(ctx)
|
|
return nil
|
|
})
|
|
// Try, don't block: if another frontend is already running the state
|
|
// pass, this tick is a no-op. Matches the health monitor pattern.
|
|
_, _ = advisorylock.TryWithLockCtx(ctx, rc.db, advisorylock.KeyStateReconciler, func() error {
|
|
rc.reconcileState(ctx)
|
|
return nil
|
|
})
|
|
} else {
|
|
rc.reconcile(ctx)
|
|
rc.reconcileState(ctx)
|
|
}
|
|
}
|
|
|
|
// reconcileState runs the state-reconciliation passes: drain pending backend
|
|
// ops for freshly-healthy nodes, then probe model gRPC addresses to orphan
|
|
// ghosts. Both passes are best-effort: a failure on one node doesn't stop
|
|
// the rest.
|
|
func (rc *ReplicaReconciler) reconcileState(ctx context.Context) {
|
|
if rc.adapter != nil {
|
|
rc.drainPendingBackendOps(ctx)
|
|
}
|
|
rc.probeLoadedModels(ctx)
|
|
}
|
|
|
|
// drainPendingBackendOps retries queued backend ops whose next_retry_at has
|
|
// passed on nodes that are currently healthy. On success the row is deleted;
|
|
// on failure attempts++ and next_retry_at moves out via exponential backoff.
|
|
func (rc *ReplicaReconciler) drainPendingBackendOps(ctx context.Context) {
|
|
ops, err := rc.registry.ListDuePendingBackendOps(ctx)
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to list pending backend ops", "error", err)
|
|
return
|
|
}
|
|
if len(ops) == 0 {
|
|
return
|
|
}
|
|
xlog.Debug("Reconciler: draining pending backend ops", "count", len(ops))
|
|
|
|
for _, op := range ops {
|
|
if err := ctx.Err(); err != nil {
|
|
return
|
|
}
|
|
var applyErr error
|
|
switch op.Op {
|
|
case OpBackendDelete:
|
|
_, applyErr = rc.adapter.DeleteBackend(op.NodeID, op.Backend)
|
|
case OpBackendInstall:
|
|
// Pending-op drain for admin install — not a per-replica load.
|
|
// Replica 0 is the conventional admin slot. Install is idempotent:
|
|
// the worker short-circuits if the backend is already running.
|
|
reply, err := rc.adapter.InstallBackend(op.NodeID, op.Backend, "", string(op.Galleries), "", "", "", 0)
|
|
if err != nil {
|
|
applyErr = err
|
|
} else if !reply.Success {
|
|
applyErr = fmt.Errorf("install failed: %s", reply.Error)
|
|
}
|
|
case OpBackendUpgrade:
|
|
// Pending-op drain for admin upgrade — fires backend.upgrade so
|
|
// the slow re-pull doesn't head-of-line-block install traffic on
|
|
// the same worker. Falls back to the legacy backend.install
|
|
// Force=true path on nats.ErrNoResponders for old workers that
|
|
// don't subscribe to backend.upgrade yet (rolling-update window).
|
|
reply, err := rc.adapter.UpgradeBackend(op.NodeID, op.Backend, string(op.Galleries), "", "", "", 0)
|
|
if err != nil {
|
|
if errors.Is(err, nats.ErrNoResponders) {
|
|
instReply, instErr := rc.adapter.installWithForceFallback(op.NodeID, op.Backend, string(op.Galleries), "", "", "", 0)
|
|
if instErr != nil {
|
|
applyErr = instErr
|
|
} else if !instReply.Success {
|
|
applyErr = fmt.Errorf("upgrade (legacy fallback) failed: %s", instReply.Error)
|
|
}
|
|
} else {
|
|
applyErr = err
|
|
}
|
|
} else if !reply.Success {
|
|
applyErr = fmt.Errorf("upgrade failed: %s", reply.Error)
|
|
}
|
|
default:
|
|
xlog.Warn("Reconciler: unknown pending op", "op", op.Op, "id", op.ID)
|
|
continue
|
|
}
|
|
|
|
if applyErr == nil {
|
|
if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
|
|
xlog.Warn("Reconciler: failed to delete drained op row", "id", op.ID, "error", err)
|
|
} else {
|
|
xlog.Info("Reconciler: pending backend op applied",
|
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1)
|
|
}
|
|
continue
|
|
}
|
|
|
|
// ErrNoResponders means the node has no active NATS subscription for
|
|
// this subject. Either its connection dropped, or it's the wrong
|
|
// node type entirely. Mark unhealthy so the health monitor's
|
|
// heartbeat-only pass doesn't immediately flip it back — and so
|
|
// ListDuePendingBackendOps (which filters by status=healthy) stops
|
|
// picking the row until the node genuinely recovers.
|
|
if errors.Is(applyErr, nats.ErrNoResponders) {
|
|
xlog.Warn("Reconciler: no NATS responders — marking node unhealthy",
|
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID)
|
|
_ = rc.registry.MarkUnhealthy(ctx, op.NodeID)
|
|
}
|
|
|
|
// Dead-letter cap: after maxAttempts the row is the reconciler
|
|
// equivalent of a poison message. Delete it loudly so the queue
|
|
// doesn't churn NATS every tick forever — operators can re-issue
|
|
// the op from the UI if they still want it applied.
|
|
if op.Attempts+1 >= maxPendingBackendOpAttempts {
|
|
xlog.Error("Reconciler: abandoning pending backend op after max attempts",
|
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID,
|
|
"attempts", op.Attempts+1, "last_error", applyErr)
|
|
if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
|
|
xlog.Warn("Reconciler: failed to delete abandoned op row", "id", op.ID, "error", err)
|
|
}
|
|
continue
|
|
}
|
|
|
|
_ = rc.registry.RecordPendingBackendOpFailure(ctx, op.ID, applyErr.Error())
|
|
xlog.Warn("Reconciler: pending backend op retry failed",
|
|
"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1, "error", applyErr)
|
|
}
|
|
}
|
|
|
|
// maxPendingBackendOpAttempts is the dead-letter threshold for the pending
// backend-op queue: once a row's attempt count (including the current try)
// reaches this value, drainPendingBackendOps deletes it instead of scheduling
// another retry.
//
// The intent is to outlast transient trouble such as a worker restart or a
// network blip, while giving up on rows that fail for structural reasons
// (wrong node type, gallery entry that does not exist). The total wall-clock
// patience depends on the registry's backoff schedule, which is not defined
// in this file.
const maxPendingBackendOpAttempts = 10
|
|
|
|
// probeLoadedModels gRPC-health-checks model addresses that the DB says are
|
|
// loaded. If a model's backend process is gone (OOM, crash, manual restart)
|
|
// we remove the row so ghosts don't linger. Only probes rows older than
|
|
// probeStaleAfter so we don't hammer every worker every tick for models we
|
|
// just heard from.
|
|
func (rc *ReplicaReconciler) probeLoadedModels(ctx context.Context) {
|
|
var stale []NodeModel
|
|
cutoff := time.Now().Add(-rc.probeStaleAfter)
|
|
err := rc.registry.db.WithContext(ctx).
|
|
Joins("JOIN backend_nodes ON backend_nodes.id = node_models.node_id").
|
|
Where("node_models.state = ? AND backend_nodes.status = ? AND node_models.updated_at < ? AND node_models.address != ''",
|
|
"loaded", StatusHealthy, cutoff).
|
|
Find(&stale).Error
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to list loaded models for probe", "error", err)
|
|
return
|
|
}
|
|
for _, m := range stale {
|
|
if err := ctx.Err(); err != nil {
|
|
return
|
|
}
|
|
if rc.prober.IsAlive(ctx, m.Address) {
|
|
// Bump updated_at so we don't probe this row again immediately.
|
|
_ = rc.registry.db.WithContext(ctx).Model(&NodeModel{}).
|
|
Where("id = ?", m.ID).Update("updated_at", time.Now()).Error
|
|
continue
|
|
}
|
|
if err := rc.registry.RemoveNodeModel(ctx, m.NodeID, m.ModelName, m.ReplicaIndex); err != nil {
|
|
xlog.Warn("Reconciler: failed to remove unreachable model", "node", m.NodeID, "model", m.ModelName, "replica", m.ReplicaIndex, "error", err)
|
|
continue
|
|
}
|
|
xlog.Warn("Reconciler: model unreachable, removed from registry",
|
|
"node", m.NodeID, "model", m.ModelName, "replica", m.ReplicaIndex, "address", m.Address)
|
|
}
|
|
}
|
|
|
|
func (rc *ReplicaReconciler) reconcile(ctx context.Context) {
|
|
configs, err := rc.registry.ListAutoScalingConfigs(ctx)
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to list auto-scaling configs", "error", err)
|
|
return
|
|
}
|
|
|
|
for _, cfg := range configs {
|
|
if err := ctx.Err(); err != nil {
|
|
return // context cancelled
|
|
}
|
|
rc.reconcileModel(ctx, cfg)
|
|
}
|
|
}
|
|
|
|
// unsatisfiableTickThreshold is the hysteresis count that trips the
// "unsatisfiable" cooldown. markCapacityProblem bumps a per-model counter
// each time a needed scale-up finds no capacity; when the counter reaches
// this value the model is put on cooldown.
//
// At the default 30s interval, three ticks is roughly 90s of grace: long
// enough to absorb a race between a node registering and the next tick, and
// short enough that a MinReplicas set above cluster capacity stops churning
// quickly.
const unsatisfiableTickThreshold = 3
|
|
|
|
// unsatisfiableCooldown is how far into the future markCapacityProblem sets
// a model's retry-after timestamp once the tick threshold trips. While that
// timestamp is in the future, reconcileModel skips the model. This is the
// worst case: the registry's ClearAllUnsatisfiable (defined elsewhere;
// presumably fired on cluster changes) can end the cooldown sooner.
const unsatisfiableCooldown = 5 * time.Minute
|
|
|
|
// candidateNodeIDsForSelector resolves the model's NodeSelector to a slice
|
|
// of node IDs, or returns nil if no selector is configured (meaning "any
|
|
// healthy node" — registry helpers interpret nil as no candidate filter).
|
|
// Returns ok=false if a non-empty selector matched zero nodes, in which case
|
|
// the caller should skip — there's nothing to schedule on.
|
|
func (rc *ReplicaReconciler) candidateNodeIDsForSelector(ctx context.Context, cfg ModelSchedulingConfig) (ids []string, ok bool) {
|
|
if cfg.NodeSelector == "" {
|
|
return nil, true
|
|
}
|
|
sel := parseSelector(cfg.NodeSelector)
|
|
if len(sel) == 0 {
|
|
return nil, true
|
|
}
|
|
nodes, err := rc.registry.FindNodesBySelector(ctx, sel)
|
|
if err != nil || len(nodes) == 0 {
|
|
return nil, false
|
|
}
|
|
ids = make([]string, len(nodes))
|
|
for i, n := range nodes {
|
|
ids[i] = n.ID
|
|
}
|
|
return ids, true
|
|
}
|
|
|
|
func (rc *ReplicaReconciler) reconcileModel(ctx context.Context, cfg ModelSchedulingConfig) {
|
|
// Cooldown gate: if we previously decided this config is unsatisfiable,
|
|
// don't even bother checking until the cooldown expires. ClearAllUnsatisfiable
|
|
// (fired by node lifecycle events) bypasses this by zeroing the column.
|
|
if cfg.UnsatisfiableUntil != nil && cfg.UnsatisfiableUntil.After(time.Now()) {
|
|
return
|
|
}
|
|
|
|
current, err := rc.registry.CountLoadedReplicas(ctx, cfg.ModelName)
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to count replicas", "model", cfg.ModelName, "error", err)
|
|
return
|
|
}
|
|
|
|
// 1. Ensure minimum replicas, but only up to what the cluster can host.
|
|
// Without this cap, a MinReplicas above cluster capacity would loop
|
|
// forever (the original flap: every tick "scaling up", but the registry
|
|
// never grows because all nodes are full).
|
|
if cfg.MinReplicas > 0 && int(current) < cfg.MinReplicas {
|
|
candidateNodeIDs, selectorMatched := rc.candidateNodeIDsForSelector(ctx, cfg)
|
|
if !selectorMatched {
|
|
xlog.Warn("Reconciler: no nodes match selector", "model", cfg.ModelName, "selector", cfg.NodeSelector)
|
|
rc.markCapacityProblem(ctx, cfg.ModelName, "no nodes match selector")
|
|
return
|
|
}
|
|
|
|
capacity, capErr := rc.registry.ClusterCapacityForModel(ctx, cfg.ModelName, candidateNodeIDs)
|
|
if capErr != nil {
|
|
xlog.Warn("Reconciler: failed to compute cluster capacity", "model", cfg.ModelName, "error", capErr)
|
|
return
|
|
}
|
|
|
|
needed := cfg.MinReplicas - int(current)
|
|
if capacity == 0 {
|
|
// No capacity right now. Bump hysteresis; trip cooldown if it
|
|
// crosses the threshold. ClearAllUnsatisfiable resets this on
|
|
// any plausible capacity-changing event.
|
|
rc.markCapacityProblem(ctx, cfg.ModelName, "cluster capacity exhausted")
|
|
return
|
|
}
|
|
// Cap to actual capacity so we don't try harder than possible.
|
|
if needed > capacity {
|
|
xlog.Info("Reconciler: capping scale-up at cluster capacity", "model", cfg.ModelName,
|
|
"need", needed, "capacity", capacity)
|
|
needed = capacity
|
|
}
|
|
xlog.Info("Reconciler: scaling up to meet minimum", "model", cfg.ModelName,
|
|
"current", current, "min", cfg.MinReplicas, "adding", needed)
|
|
rc.scaleUp(ctx, cfg, needed)
|
|
// Successful (or partial) scale-up clears the hysteresis so a future
|
|
// dip starts fresh.
|
|
_ = rc.registry.ClearUnsatisfiable(ctx, cfg.ModelName)
|
|
return
|
|
}
|
|
|
|
// 2. Auto-scale up if all replicas are busy
|
|
if current > 0 && (cfg.MaxReplicas == 0 || int(current) < cfg.MaxReplicas) {
|
|
if rc.allReplicasBusy(ctx, cfg.ModelName) {
|
|
candidateNodeIDs, selectorMatched := rc.candidateNodeIDsForSelector(ctx, cfg)
|
|
if !selectorMatched {
|
|
return
|
|
}
|
|
capacity, capErr := rc.registry.ClusterCapacityForModel(ctx, cfg.ModelName, candidateNodeIDs)
|
|
if capErr != nil || capacity == 0 {
|
|
// All busy AND no slot available — burst load above capacity.
|
|
// Don't enter cooldown for this case (it's transient demand,
|
|
// not a misconfig); the next tick will retry naturally.
|
|
return
|
|
}
|
|
xlog.Info("Reconciler: all replicas busy, scaling up", "model", cfg.ModelName,
|
|
"current", current)
|
|
rc.scaleUp(ctx, cfg, 1)
|
|
}
|
|
}
|
|
|
|
// 3. Scale down idle replicas above minimum
|
|
floor := cfg.MinReplicas
|
|
if floor < 1 {
|
|
floor = 1
|
|
}
|
|
if int(current) > floor {
|
|
rc.scaleDownIdle(ctx, cfg, int(current), floor)
|
|
}
|
|
}
|
|
|
|
// markCapacityProblem advances the hysteresis counter and sets the cooldown
|
|
// timestamp once it crosses the threshold. Centralized so the two scale-up
|
|
// paths (MinReplicas and busy-burst) report capacity exhaustion the same way.
|
|
func (rc *ReplicaReconciler) markCapacityProblem(ctx context.Context, modelName, reason string) {
|
|
ticks, err := rc.registry.BumpUnsatisfiableTicks(ctx, modelName)
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to bump unsatisfiable counter", "model", modelName, "error", err)
|
|
return
|
|
}
|
|
if ticks >= unsatisfiableTickThreshold {
|
|
until := time.Now().Add(unsatisfiableCooldown)
|
|
if err := rc.registry.MarkUnsatisfiable(ctx, modelName, until); err != nil {
|
|
xlog.Warn("Reconciler: failed to mark unsatisfiable", "model", modelName, "error", err)
|
|
return
|
|
}
|
|
xlog.Warn("Reconciler: scheduling unsatisfiable, entering cooldown",
|
|
"model", modelName, "reason", reason,
|
|
"cooldown", unsatisfiableCooldown, "retry_after", until.Format(time.RFC3339))
|
|
}
|
|
}
|
|
|
|
// scaleUp schedules additional replicas of the model. Callers in
|
|
// reconcileModel are expected to have already capped `count` against
|
|
// ClusterCapacityForModel so this function never tries to overshoot.
|
|
func (rc *ReplicaReconciler) scaleUp(ctx context.Context, cfg ModelSchedulingConfig, count int) {
|
|
if rc.scheduler == nil {
|
|
xlog.Warn("Reconciler: no scheduler available, cannot scale up")
|
|
return
|
|
}
|
|
|
|
// Resolve selector → candidate node IDs (nil when no selector → "any
|
|
// healthy node"). The selector mismatch case is handled upstream in
|
|
// reconcileModel, but defensively short-circuit here too.
|
|
candidateNodeIDs, ok := rc.candidateNodeIDsForSelector(ctx, cfg)
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
for i := 0; i < count; i++ {
|
|
node, err := rc.scheduler.ScheduleAndLoadModel(ctx, cfg.ModelName, candidateNodeIDs)
|
|
if err != nil {
|
|
xlog.Warn("Reconciler: failed to scale up replica", "model", cfg.ModelName,
|
|
"attempt", i+1, "error", err)
|
|
return // stop trying on first failure
|
|
}
|
|
xlog.Info("Reconciler: scaled up replica", "model", cfg.ModelName, "node", node.Name)
|
|
}
|
|
}
|
|
|
|
// scaleDownIdle removes idle replicas above the floor.
|
|
func (rc *ReplicaReconciler) scaleDownIdle(ctx context.Context, cfg ModelSchedulingConfig, current, floor int) {
|
|
if rc.unloader == nil {
|
|
return
|
|
}
|
|
|
|
// Find idle replicas that have been unused for longer than scaleDownDelay.
|
|
// Order by replica_index DESC first, then last_used ASC: trim the
|
|
// highest-indexed replicas first so subsequent scale-ups can reuse the
|
|
// low indexes via NextFreeReplicaIndex, keeping slot allocation compact
|
|
// and matching the worker supervisor's port-recycling behavior.
|
|
cutoff := time.Now().Add(-rc.scaleDownDelay)
|
|
var idleModels []NodeModel
|
|
rc.registry.db.WithContext(ctx).
|
|
Where("model_name = ? AND state = ? AND in_flight = 0 AND last_used < ?",
|
|
cfg.ModelName, "loaded", cutoff).
|
|
Order("replica_index DESC, last_used ASC").
|
|
Find(&idleModels)
|
|
|
|
toRemove := current - floor
|
|
removed := 0
|
|
for _, nm := range idleModels {
|
|
if removed >= toRemove {
|
|
break
|
|
}
|
|
// Remove this specific replica row from registry (sibling replicas of
|
|
// the same model on the same node, if any, are unaffected).
|
|
if err := rc.registry.RemoveNodeModel(ctx, nm.NodeID, nm.ModelName, nm.ReplicaIndex); err != nil {
|
|
xlog.Warn("Reconciler: failed to remove model record", "node", nm.NodeID, "model", nm.ModelName, "replica", nm.ReplicaIndex, "error", err)
|
|
continue
|
|
}
|
|
// Unload from worker
|
|
if err := rc.unloader.UnloadModelOnNode(nm.NodeID, nm.ModelName); err != nil {
|
|
xlog.Warn("Reconciler: unload failed (model already removed from registry)", "error", err)
|
|
}
|
|
xlog.Info("Reconciler: scaled down idle replica", "model", cfg.ModelName, "node", nm.NodeID, "replica", nm.ReplicaIndex)
|
|
removed++
|
|
}
|
|
}
|
|
|
|
// allReplicasBusy returns true if all loaded replicas of a model have in-flight requests.
|
|
func (rc *ReplicaReconciler) allReplicasBusy(ctx context.Context, modelName string) bool {
|
|
var idleCount int64
|
|
rc.registry.db.WithContext(ctx).Model(&NodeModel{}).
|
|
Where("model_name = ? AND state = ? AND in_flight = 0", modelName, "loaded").
|
|
Count(&idleCount)
|
|
return idleCount == 0
|
|
}
|
|
|
|
// parseSelector decodes a JSON node selector string into a map.
|
|
func parseSelector(selectorJSON string) map[string]string {
|
|
if selectorJSON == "" {
|
|
return nil
|
|
}
|
|
var selector map[string]string
|
|
if err := json.Unmarshal([]byte(selectorJSON), &selector); err != nil {
|
|
xlog.Warn("Failed to parse node selector", "selector", selectorJSON, "error", err)
|
|
return nil
|
|
}
|
|
return selector
|
|
}
|