Files
LocalAI/core/services/nodes/reconciler.go
LocalAI [bot] 7637f8cf1b feat(distributed): declarative per-model scheduling via env/args (#10308)
* feat(distributed): add SpreadAll column and authoritative scheduling seeding

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): parse declarative model scheduling config (env/file)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): reconcile spread_all to one replica per matching node

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): wire LOCALAI_MODEL_SCHEDULING env/args and startup seeding

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): expose spread_all on the scheduling API endpoint

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(distributed): add spread-to-all-nodes mode to the scheduling UI

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(distributed): document LOCALAI_MODEL_SCHEDULING env/args

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* docs(distributed): clarify replica modes and all-nodes spread in scheduling config

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-13 18:31:06 +02:00

686 lines
28 KiB
Go

package nodes
import (
"context"
"encoding/json"
"errors"
"fmt"
"time"
"github.com/mudler/LocalAI/core/services/advisorylock"
"github.com/mudler/LocalAI/core/services/nodes/prefixcache"
grpcclient "github.com/mudler/LocalAI/pkg/grpc"
"github.com/mudler/xlog"
"github.com/nats-io/nats.go"
"gorm.io/gorm"
)
// ModelProber checks whether a model's backend process is still reachable.
// The default implementation is a gRPC health probe; the interface exists so
// tests can substitute a fake and avoid standing up a real server.
type ModelProber interface {
	// IsAlive reports whether the backend at address answered the probe as
	// healthy. There is no error return: a process that is reachable but
	// unhealthy and one that cannot be reached at all (e.g. a timeout) both
	// yield false, and callers treat the two cases the same way.
	IsAlive(ctx context.Context, address string) bool
}
// grpcModelProber is the default ModelProber: it runs a HealthCheck against
// the model's stored gRPC address, bounded by a one-second timeout. token is
// forwarded to the gRPC client constructor.
type grpcModelProber struct {
	token string
}

// IsAlive builds a client for address and returns the HealthCheck verdict.
// The HealthCheck error is deliberately dropped: for the reconciler a failed
// or timed-out probe is equivalent to an unhealthy one.
func (g grpcModelProber) IsAlive(ctx context.Context, address string) bool {
	const probeTimeout = 1 * time.Second

	c := grpcclient.NewClientWithToken(address, false, nil, false, g.token)

	deadlineCtx, stop := context.WithTimeout(ctx, probeTimeout)
	defer stop()

	healthy, _ := c.HealthCheck(deadlineCtx)
	return healthy
}
// ReplicaReconciler periodically ensures model replica counts match their
// scheduling configs. It scales up replicas when below MinReplicas or when
// all replicas are busy (up to MaxReplicas), and scales down idle replicas
// above MinReplicas.
//
// Alongside replica scaling it runs two state-reconciliation passes — draining
// the pending_backend_ops queue and probing loaded models' gRPC addresses to
// orphan ghosts. Both passes are wrapped in the KeyStateReconciler advisory
// lock so N frontends don't stampede.
//
// Only processes models with auto-scaling enabled (MinReplicas > 0 or MaxReplicas > 0).
type ReplicaReconciler struct {
	// registry is the node/model registry every pass reads from and writes to.
	registry *NodeRegistry
	// scheduler places and loads new replicas; when nil, scaleUp is a logged no-op.
	scheduler ModelScheduler
	// unloader asks a worker to unload a model; when nil, idle scale-down is skipped.
	unloader NodeCommandSender
	// adapter is the NATS sender for the pending-op drain; when nil the drain
	// pass is skipped.
	adapter *RemoteUnloaderAdapter
	// prober is the health probe used against model gRPC addresses.
	prober ModelProber
	// db, when non-nil, makes reconcileOnce run its passes under advisory
	// locks; when nil the passes run unguarded.
	db *gorm.DB
	// interval is the period of the ticker driving Run.
	interval time.Duration
	// scaleDownDelay is how long a replica must have been unused (last_used
	// older than now minus this) before scaleDownIdle may remove it.
	scaleDownDelay time.Duration
	// probeStaleAfter: only probe node_models rows older than this so we
	// don't hammer every worker every tick for models we just heard from.
	probeStaleAfter time.Duration
	// pressure is the shared forced-disturb counter written by the router. When
	// a model's count within the Pressure's rolling window reaches pressureThreshold the
	// reconciler treats its cache-warm replica as saturated and scales up,
	// subject to the same MaxReplicas/capacity/UnsatisfiableUntil machinery as
	// the other scale-up paths. nil disables this signal (a true no-op).
	pressure *prefixcache.Pressure
	// pressureThreshold is the forced-disturb count at or above which the
	// pressure path attempts a scale-up.
	pressureThreshold int
}
// ModelScheduler abstracts the scheduling logic needed by the reconciler, so
// the reconciler depends only on the single operation it uses to add a
// replica. SmartRouter implements this interface.
type ModelScheduler interface {
	// ScheduleAndLoadModel picks a node (optionally from candidateNodeIDs),
	// installs the backend, and loads the model. Returns the node used.
	// The reconciler passes a nil candidateNodeIDs when the model has no
	// node selector, meaning no candidate filter.
	ScheduleAndLoadModel(ctx context.Context, modelName string, candidateNodeIDs []string) (*BackendNode, error)
}
// ReplicaReconcilerOptions holds configuration for creating a ReplicaReconciler.
// Zero-valued duration and threshold fields are replaced with defaults by
// NewReplicaReconciler.
type ReplicaReconcilerOptions struct {
	// Registry is the node/model registry the reconciler operates on.
	Registry *NodeRegistry
	// Scheduler places and loads new replicas during scale-up.
	Scheduler ModelScheduler
	// Unloader asks workers to unload models during idle scale-down.
	Unloader NodeCommandSender
	// Adapter is the NATS sender used to retry pending backend ops. When nil,
	// the state-reconciler pending-drain pass is a no-op (single-node mode).
	Adapter *RemoteUnloaderAdapter
	// RegistrationToken is used by the default gRPC prober when probing model
	// addresses. Matches the worker's token so HealthCheck auth succeeds.
	RegistrationToken string
	// Prober overrides the default gRPC health probe (used by tests).
	Prober ModelProber
	// DB, when set, is used for the advisory locks that guard each pass.
	DB *gorm.DB
	// Interval is the reconciliation period; default 30s.
	Interval time.Duration
	// ScaleDownDelay is the idle time before a replica may be scaled down; default 5m.
	ScaleDownDelay time.Duration
	// ProbeStaleAfter is the minimum row age before a loaded model is probed; default 2m.
	ProbeStaleAfter time.Duration
	// Pressure is the shared forced-disturb counter written by the router. nil
	// disables the cache-saturation autoscale signal (a true no-op).
	Pressure *prefixcache.Pressure
	// PressureThreshold is the forced-disturb count within PressureWindow that
	// triggers a scale-up. Default prefixcache.DefaultConfig().PressureScaleThreshold (1).
	PressureThreshold int
}
// NewReplicaReconciler creates a new ReplicaReconciler from opts, filling in
// defaults for unset fields:
//
//   - Interval: 30s when zero or negative. A non-positive interval would make
//     time.NewTicker panic inside Run, so negative values are treated as unset.
//   - ScaleDownDelay: 5m when zero.
//   - ProbeStaleAfter: 2m when zero.
//   - Prober: a gRPC health prober using opts.RegistrationToken when nil.
//   - PressureThreshold: prefixcache.DefaultConfig().PressureScaleThreshold
//     when zero.
//
// All other fields are copied through unchanged; nil Scheduler, Unloader,
// Adapter, DB and Pressure are accepted and disable the corresponding feature.
func NewReplicaReconciler(opts ReplicaReconcilerOptions) *ReplicaReconciler {
	interval := opts.Interval
	if interval <= 0 {
		interval = 30 * time.Second
	}
	scaleDownDelay := opts.ScaleDownDelay
	if scaleDownDelay == 0 {
		scaleDownDelay = 5 * time.Minute
	}
	probeStaleAfter := opts.ProbeStaleAfter
	if probeStaleAfter == 0 {
		probeStaleAfter = 2 * time.Minute
	}
	prober := opts.Prober
	if prober == nil {
		prober = grpcModelProber{token: opts.RegistrationToken}
	}
	pressureThreshold := opts.PressureThreshold
	if pressureThreshold == 0 {
		pressureThreshold = prefixcache.DefaultConfig().PressureScaleThreshold
	}
	return &ReplicaReconciler{
		registry:          opts.Registry,
		scheduler:         opts.Scheduler,
		unloader:          opts.Unloader,
		adapter:           opts.Adapter,
		prober:            prober,
		db:                opts.DB,
		interval:          interval,
		scaleDownDelay:    scaleDownDelay,
		probeStaleAfter:   probeStaleAfter,
		pressure:          opts.Pressure,
		pressureThreshold: pressureThreshold,
	}
}
// Run drives the reconciliation loop: one reconcileOnce per tick of a ticker
// with period rc.interval. It blocks until ctx is cancelled. The first pass
// happens after the first tick, not immediately on entry.
func (rc *ReplicaReconciler) Run(ctx context.Context) {
	tick := time.NewTicker(rc.interval)
	defer tick.Stop()

	done := ctx.Done()
	for {
		select {
		case <-tick.C:
			rc.reconcileOnce(ctx)
		case <-done:
			return
		}
	}
}
// reconcileOnce performs a single reconciliation pass. Replica work and
// state-reconciliation work run under *different* advisory locks so multiple
// frontends can share load across passes, and one long-running pass doesn't
// block the other forever if a frontend wedges.
//
// Without a database (rc.db == nil) both passes run directly, unguarded.
//
// Lock errors are logged rather than dropped: both callbacks always return
// nil, so any error here comes from the lock machinery itself, and a
// persistently failing lock would otherwise disable reconciliation silently.
// Errors seen after ctx is cancelled are expected on shutdown and are not
// logged.
func (rc *ReplicaReconciler) reconcileOnce(ctx context.Context) {
	if rc.db == nil {
		rc.reconcile(ctx)
		rc.reconcileState(ctx)
		return
	}

	replicaKey := advisorylock.KeyFromString("replica-reconciler")
	if err := advisorylock.WithLockCtx(ctx, rc.db, replicaKey, func() error {
		rc.reconcile(ctx)
		return nil
	}); err != nil && ctx.Err() == nil {
		xlog.Warn("Reconciler: replica pass advisory lock failed", "error", err)
	}

	// Try, don't block: if another frontend is already running the state
	// pass, this tick is a no-op. Matches the health monitor pattern.
	if _, err := advisorylock.TryWithLockCtx(ctx, rc.db, advisorylock.KeyStateReconciler, func() error {
		rc.reconcileState(ctx)
		return nil
	}); err != nil && ctx.Err() == nil {
		xlog.Warn("Reconciler: state pass advisory lock failed", "error", err)
	}
}
// reconcileState runs the two state-reconciliation passes in order: first the
// pending-backend-op drain (only when a NATS adapter is configured), then the
// probe of loaded models' gRPC addresses that orphans ghosts. Both passes are
// best-effort, so a failure on one node does not stop the rest.
func (rc *ReplicaReconciler) reconcileState(ctx context.Context) {
	if canDrain := rc.adapter != nil; canDrain {
		rc.drainPendingBackendOps(ctx)
	}

	rc.probeLoadedModels(ctx)
}
// drainPendingBackendOps retries queued backend ops whose next_retry_at has
// passed on nodes that are currently healthy. On success the row is deleted;
// on failure attempts++ and next_retry_at moves out via exponential backoff.
//
// Per op, the outcome is one of:
//   - applied: the row is deleted;
//   - failed with attempts+1 >= maxPendingBackendOpAttempts: the row is
//     dead-lettered (deleted, logged at error level);
//   - failed otherwise: the failure is recorded on the row for a later retry.
//
// A nats.ErrNoResponders failure additionally marks the node unhealthy.
// The loop stops early when ctx is cancelled. Callers must ensure rc.adapter
// is non-nil.
func (rc *ReplicaReconciler) drainPendingBackendOps(ctx context.Context) {
	// Garbage-collect ops behind nodes that went offline/draining. These are
	// invisible to ListDuePendingBackendOps (which filters status=healthy), so
	// without this sweep they leak forever and keep the UI operation spinning.
	if _, err := rc.registry.DeleteStalePendingBackendOps(ctx, stalePendingBackendOpGrace); err != nil {
		xlog.Warn("Reconciler: failed to clear stale pending backend ops", "error", err)
	}
	ops, err := rc.registry.ListDuePendingBackendOps(ctx)
	if err != nil {
		xlog.Warn("Reconciler: failed to list pending backend ops", "error", err)
		return
	}
	if len(ops) == 0 {
		return
	}
	xlog.Debug("Reconciler: draining pending backend ops", "count", len(ops))
	for _, op := range ops {
		if ctx.Err() != nil {
			return
		}
		var applyErr error
		switch op.Op {
		case OpBackendDelete:
			_, applyErr = rc.adapter.DeleteBackend(op.NodeID, op.Backend)
		case OpBackendInstall:
			// Pending-op drain for admin install — not a per-replica load.
			// Replica 0 is the conventional admin slot. Install is idempotent:
			// the worker short-circuits if the backend is already running.
			reply, err := rc.adapter.InstallBackend(op.NodeID, op.Backend, "", string(op.Galleries), "", "", "", 0, "", nil)
			if err != nil {
				applyErr = err
			} else if !reply.Success {
				applyErr = fmt.Errorf("install failed: %s", reply.Error)
			}
		case OpBackendUpgrade:
			// Pending-op drain for admin upgrade — fires backend.upgrade so
			// the slow re-pull doesn't head-of-line-block install traffic on
			// the same worker. Falls back to the legacy backend.install
			// Force=true path on nats.ErrNoResponders for old workers that
			// don't subscribe to backend.upgrade yet (rolling-update window).
			// Reconciler retries are background reconciliation with no live
			// admin watching a progress bar, so opID/onProgress are empty —
			// the adapter skips the progress subscription entirely.
			reply, err := rc.adapter.UpgradeBackend(op.NodeID, op.Backend, string(op.Galleries), "", "", "", 0, "", nil)
			if err != nil {
				if errors.Is(err, nats.ErrNoResponders) {
					instReply, instErr := rc.adapter.installWithForceFallback(op.NodeID, op.Backend, string(op.Galleries), "", "", "", 0, "", nil)
					if instErr != nil {
						applyErr = instErr
					} else if !instReply.Success {
						applyErr = fmt.Errorf("upgrade (legacy fallback) failed: %s", instReply.Error)
					}
				} else {
					applyErr = err
				}
			} else if !reply.Success {
				applyErr = fmt.Errorf("upgrade failed: %s", reply.Error)
			}
		default:
			xlog.Warn("Reconciler: unknown pending op", "op", op.Op, "id", op.ID)
			continue
		}
		if applyErr == nil {
			if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
				xlog.Warn("Reconciler: failed to delete drained op row", "id", op.ID, "error", err)
			} else {
				xlog.Info("Reconciler: pending backend op applied",
					"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1)
			}
			continue
		}
		// ErrNoResponders means the node has no active NATS subscription for
		// this subject. Either its connection dropped, or it's the wrong
		// node type entirely. Mark unhealthy so the health monitor's
		// heartbeat-only pass doesn't immediately flip it back — and so
		// ListDuePendingBackendOps (which filters by status=healthy) stops
		// picking the row until the node genuinely recovers.
		if errors.Is(applyErr, nats.ErrNoResponders) {
			xlog.Warn("Reconciler: no NATS responders — marking node unhealthy",
				"op", op.Op, "backend", op.Backend, "node", op.NodeID)
			// Surface a failed status update: if the node stays "healthy" in
			// the registry its ops keep being picked up every tick.
			if err := rc.registry.MarkUnhealthy(ctx, op.NodeID); err != nil {
				xlog.Warn("Reconciler: failed to mark node unhealthy", "node", op.NodeID, "error", err)
			}
		}
		// Dead-letter cap: after maxAttempts the row is the reconciler
		// equivalent of a poison message. Delete it loudly so the queue
		// doesn't churn NATS every tick forever — operators can re-issue
		// the op from the UI if they still want it applied.
		if op.Attempts+1 >= maxPendingBackendOpAttempts {
			xlog.Error("Reconciler: abandoning pending backend op after max attempts",
				"op", op.Op, "backend", op.Backend, "node", op.NodeID,
				"attempts", op.Attempts+1, "last_error", applyErr)
			if err := rc.registry.DeletePendingBackendOp(ctx, op.ID); err != nil {
				xlog.Warn("Reconciler: failed to delete abandoned op row", "id", op.ID, "error", err)
			}
			continue
		}
		// Surface a failed bookkeeping write: if the attempt is not recorded,
		// the backoff and the dead-letter cap above never advance for this row.
		if err := rc.registry.RecordPendingBackendOpFailure(ctx, op.ID, applyErr.Error()); err != nil {
			xlog.Warn("Reconciler: failed to record pending backend op failure", "id", op.ID, "error", err)
		}
		xlog.Warn("Reconciler: pending backend op retry failed",
			"op", op.Op, "backend", op.Backend, "node", op.NodeID, "attempts", op.Attempts+1, "error", applyErr)
	}
}
// Tunables for the pending-backend-op drain.
const (
	// maxPendingBackendOpAttempts is the retry ceiling for a failing row;
	// once reached, the reconciler dead-letters the row instead of retrying.
	// Ten attempts at exponential backoff (30s → 15m cap) is >1h of
	// wall-clock patience — well past any transient worker restart or
	// network blip. Rows still failing after that are almost certainly
	// structural (wrong node type, non-existent gallery entry), so further
	// retries will not help.
	maxPendingBackendOpAttempts = 10

	// stalePendingBackendOpGrace is how long a node may stay offline before
	// its pending backend ops are garbage-collected. Draining nodes are
	// cleared immediately regardless of this window (see
	// DeleteStalePendingBackendOps). ListDuePendingBackendOps never surfaces
	// ops behind non-healthy nodes, so without this sweep they would leak
	// forever and keep the UI op spinning.
	stalePendingBackendOpGrace = 15 * time.Minute
)
// probeLoadedModels gRPC-health-checks model addresses that the DB says are
// loaded. If a model's backend process is gone (OOM, crash, manual restart)
// we remove the row so ghosts don't linger. Only probes rows older than
// probeStaleAfter so we don't hammer every worker every tick for models we
// just heard from.
//
// Only rows in state "loaded", with a non-empty address, on nodes whose
// status is healthy are considered. A successful probe bumps the row's
// updated_at; a failed probe removes the row from the registry. The loop
// stops as soon as ctx is cancelled, including when cancellation happens
// during a probe.
func (rc *ReplicaReconciler) probeLoadedModels(ctx context.Context) {
	var stale []NodeModel
	cutoff := time.Now().Add(-rc.probeStaleAfter)
	err := rc.registry.db.WithContext(ctx).
		Joins("JOIN backend_nodes ON backend_nodes.id = node_models.node_id").
		Where("node_models.state = ? AND backend_nodes.status = ? AND node_models.updated_at < ? AND node_models.address != ''",
			"loaded", StatusHealthy, cutoff).
		Find(&stale).Error
	if err != nil {
		xlog.Warn("Reconciler: failed to list loaded models for probe", "error", err)
		return
	}
	for _, m := range stale {
		if ctx.Err() != nil {
			return
		}
		if rc.prober.IsAlive(ctx, m.Address) {
			// Bump updated_at so we don't probe this row again immediately.
			// Best-effort: if the write fails the only cost is an extra
			// probe of this row on the next tick.
			_ = rc.registry.db.WithContext(ctx).Model(&NodeModel{}).
				Where("id = ?", m.ID).Update("updated_at", time.Now()).Error
			continue
		}
		// A probe that failed because our own context was cancelled (e.g.
		// shutdown mid-probe) says nothing about the backend. Bail out
		// instead of treating a possibly healthy model as a ghost.
		if ctx.Err() != nil {
			return
		}
		if err := rc.registry.RemoveNodeModel(ctx, m.NodeID, m.ModelName, m.ReplicaIndex); err != nil {
			xlog.Warn("Reconciler: failed to remove unreachable model", "node", m.NodeID, "model", m.ModelName, "replica", m.ReplicaIndex, "error", err)
			continue
		}
		xlog.Warn("Reconciler: model unreachable, removed from registry",
			"node", m.NodeID, "model", m.ModelName, "replica", m.ReplicaIndex, "address", m.Address)
	}
}
// reconcile is the replica-scaling pass: it lists the auto-scaling configs
// from the registry and reconciles each model in turn. A listing failure is
// logged and ends the pass; context cancellation stops it between models.
func (rc *ReplicaReconciler) reconcile(ctx context.Context) {
	configs, listErr := rc.registry.ListAutoScalingConfigs(ctx)
	if listErr != nil {
		xlog.Warn("Reconciler: failed to list auto-scaling configs", "error", listErr)
		return
	}

	for i := range configs {
		if ctx.Err() != nil {
			// Context cancelled: abandon the remaining models.
			return
		}
		rc.reconcileModel(ctx, configs[i])
	}
}
// Tunables for the "unsatisfiable scheduling" hysteresis and cooldown.
const (
	// unsatisfiableTickThreshold is the number of consecutive ticks of
	// "capacity == 0 && need > 0" that must elapse before the reconciler
	// gives up for a while. Three ticks at the default 30s interval gives
	// ~90s of grace before logging a warning and entering cooldown — enough
	// to ride out a transient race between Register and the next tick, yet
	// short enough that a misconfig (MinReplicas above cluster capacity)
	// doesn't churn the worker forever like it did pre-PR4.
	unsatisfiableTickThreshold = 3

	// unsatisfiableCooldown is how long the reconciler waits before retrying
	// once the threshold trips. ClearAllUnsatisfiable on cluster events
	// shortens this in practice — the cooldown is the worst case, not the
	// steady state.
	unsatisfiableCooldown = 5 * time.Minute
)
// candidateNodeIDsForSelector resolves the model's NodeSelector to a slice
// of node IDs.
//
// Returns:
//   - (nil, true) when no selector is configured, or when the selector
//     decodes to an empty map. nil means "any healthy node" — registry
//     helpers interpret nil as no candidate filter. A selector that fails to
//     parse also lands here, because parseSelector returns nil for it.
//   - (ids, true) with one ID per matching node otherwise.
//   - (nil, false) when a non-empty selector matched zero nodes or the lookup
//     failed; the caller should skip, since there is nothing to schedule on.
//
// A lookup error is logged here because the boolean result cannot tell the
// caller a failed query apart from a genuine "no nodes match".
func (rc *ReplicaReconciler) candidateNodeIDsForSelector(ctx context.Context, cfg ModelSchedulingConfig) (ids []string, ok bool) {
	if cfg.NodeSelector == "" {
		return nil, true
	}
	sel := parseSelector(cfg.NodeSelector)
	if len(sel) == 0 {
		return nil, true
	}
	nodes, err := rc.registry.FindNodesBySelector(ctx, sel)
	if err != nil {
		xlog.Warn("Reconciler: failed to resolve node selector",
			"model", cfg.ModelName, "selector", cfg.NodeSelector, "error", err)
		return nil, false
	}
	if len(nodes) == 0 {
		return nil, false
	}
	ids = make([]string, 0, len(nodes))
	for _, n := range nodes {
		ids = append(ids, n.ID)
	}
	return ids, true
}
// reconcileModel reconciles the replica count of one model against its
// scheduling config. In order, it:
//
//  0. for spread_all configs, rewrites Min/MaxReplicas to the number of nodes
//     matching the selector;
//  1. returns early while the config's UnsatisfiableUntil cooldown is active;
//  2. scales up toward MinReplicas, capped at cluster capacity, and returns;
//  3. otherwise adds one replica if every loaded replica is busy;
//  4. otherwise adds one replica if prefix-cache pressure has reached the
//     threshold;
//  5. if nothing was added this tick, trims idle replicas above the floor.
//
// cfg is received by value, so the spread_all rewrite is local to this call.
func (rc *ReplicaReconciler) reconcileModel(ctx context.Context, cfg ModelSchedulingConfig) {
	// spread_all: derive a dynamic replica target equal to the number of nodes
	// currently matching the selector (all healthy backend nodes when the
	// selector is empty). Feeding it through Min==Max==target reuses every
	// existing path: the floor scales up toward target (capped at capacity),
	// Max==target stops busy-burst/pressure overshooting, and idle scale-down
	// trims above target. The target re-tracks node join/leave each tick. cfg is
	// a by-value copy, so mutating it here is local to this tick.
	if cfg.SpreadAll {
		matched, err := rc.registry.FindNodesBySelector(ctx, parseSelector(cfg.NodeSelector))
		if err != nil {
			xlog.Warn("Reconciler: spread_all failed to resolve matching nodes", "model", cfg.ModelName, "error", err)
			return
		}
		if len(matched) == 0 {
			xlog.Info("Reconciler: spread_all has no matching nodes; nothing to schedule",
				"model", cfg.ModelName, "selector", cfg.NodeSelector)
			return
		}
		cfg.MinReplicas = len(matched)
		cfg.MaxReplicas = len(matched)
	}
	// Cooldown gate: if we previously decided this config is unsatisfiable,
	// don't even bother checking until the cooldown expires. ClearAllUnsatisfiable
	// (fired by node lifecycle events) bypasses this by zeroing the column.
	// NOTE(review): this gate sits after the spread_all lookup above, so a
	// spread_all config still issues that query while in cooldown.
	if cfg.UnsatisfiableUntil != nil && cfg.UnsatisfiableUntil.After(time.Now()) {
		return
	}
	// current is read once and shared by every step below; none of them
	// re-reads it after a scale-up.
	current, err := rc.registry.CountLoadedReplicas(ctx, cfg.ModelName)
	if err != nil {
		xlog.Warn("Reconciler: failed to count replicas", "model", cfg.ModelName, "error", err)
		return
	}
	// 1. Ensure minimum replicas, but only up to what the cluster can host.
	// Without this cap, a MinReplicas above cluster capacity would loop
	// forever (the original flap: every tick "scaling up", but the registry
	// never grows because all nodes are full).
	if cfg.MinReplicas > 0 && int(current) < cfg.MinReplicas {
		candidateNodeIDs, selectorMatched := rc.candidateNodeIDsForSelector(ctx, cfg)
		if !selectorMatched {
			xlog.Warn("Reconciler: no nodes match selector", "model", cfg.ModelName, "selector", cfg.NodeSelector)
			rc.markCapacityProblem(ctx, cfg.ModelName, "no nodes match selector")
			return
		}
		capacity, capErr := rc.registry.ClusterCapacityForModel(ctx, cfg.ModelName, candidateNodeIDs)
		if capErr != nil {
			xlog.Warn("Reconciler: failed to compute cluster capacity", "model", cfg.ModelName, "error", capErr)
			return
		}
		needed := cfg.MinReplicas - int(current)
		if capacity == 0 {
			// No capacity right now. Bump hysteresis; trip cooldown if it
			// crosses the threshold. ClearAllUnsatisfiable resets this on
			// any plausible capacity-changing event.
			rc.markCapacityProblem(ctx, cfg.ModelName, "cluster capacity exhausted")
			return
		}
		// Cap to actual capacity so we don't try harder than possible.
		if needed > capacity {
			xlog.Info("Reconciler: capping scale-up at cluster capacity", "model", cfg.ModelName,
				"need", needed, "capacity", capacity)
			needed = capacity
		}
		xlog.Info("Reconciler: scaling up to meet minimum", "model", cfg.ModelName,
			"current", current, "min", cfg.MinReplicas, "adding", needed)
		if rc.scaleUp(ctx, cfg, needed) {
			// A real (or partial) scale-up clears the hysteresis so a future
			// dip starts fresh. If scaleUp added nothing (scheduler errored or
			// no node could be loaded) we leave the hysteresis intact so the
			// next tick retries from where it left off rather than resetting
			// the unsatisfiable counter on a failed attempt.
			_ = rc.registry.ClearUnsatisfiable(ctx, cfg.ModelName)
		}
		// Below the floor: the busy-burst, pressure and scale-down steps are
		// all skipped for this tick.
		return
	}
	// scaledUp tracks whether a scale-up already fired in this tick. The two
	// scale-up paths below (busy-burst and pressure) share the single `current`
	// value read once above; scaleUp does not re-check it. So at most one of
	// them may fire per tick, otherwise a model that is both busy AND over the
	// pressure threshold would scale +2 and could overshoot MaxReplicas by one.
	// Scale-down is also skipped in a tick that scaled up.
	scaledUp := false
	// 2. Auto-scale up if all replicas are busy
	// (MaxReplicas == 0 is treated as "no upper bound" by this condition.)
	if current > 0 && (cfg.MaxReplicas == 0 || int(current) < cfg.MaxReplicas) {
		if rc.allReplicasBusy(ctx, cfg.ModelName) {
			candidateNodeIDs, selectorMatched := rc.candidateNodeIDsForSelector(ctx, cfg)
			if !selectorMatched {
				// Returns from the whole function, so the pressure and
				// scale-down steps are skipped as well.
				return
			}
			capacity, capErr := rc.registry.ClusterCapacityForModel(ctx, cfg.ModelName, candidateNodeIDs)
			if capErr != nil || capacity == 0 {
				// All busy AND no slot available — burst load above capacity.
				// Don't enter cooldown for this case (it's transient demand,
				// not a misconfig); the next tick will retry naturally.
				// Note that a capacity lookup error takes this same silent
				// return and is not logged.
				return
			}
			xlog.Info("Reconciler: all replicas busy, scaling up", "model", cfg.ModelName,
				"current", current)
			// Only mark the tick as having scaled up if a replica was actually
			// added. On a failed scaleUp, leave scaledUp false so the pressure
			// path below and the scale-down logic still apply as they would
			// have if the busy-burst path had not run.
			scaledUp = rc.scaleUp(ctx, cfg, 1)
		}
	}
	// 2b. Auto-scale up on prefix-cache forced-disturb pressure. A forced-disturb
	// is recorded by the router when a request had a usable hot prefix match
	// but the load guard forced it off the warm node: the cache-warm replica
	// is saturated. We reuse the same MaxReplicas + capacity guards as the
	// busy-burst path, and the same UnsatisfiableUntil cooldown gates this
	// block at the top of reconcileModel, so a no-capacity model will not
	// spin. Pressure never overrides MaxReplicas or force-evicts.
	//
	// Skipped when the busy-burst path already scaled up this tick: at most
	// one scaleUp(+1) per tick (see scaledUp above).
	if !scaledUp && rc.pressure != nil && current > 0 && (cfg.MaxReplicas == 0 || int(current) < cfg.MaxReplicas) {
		if pressureCount := rc.pressure.Count(cfg.ModelName, time.Now()); pressureCount >= rc.pressureThreshold {
			candidateNodeIDs, selectorMatched := rc.candidateNodeIDsForSelector(ctx, cfg)
			if selectorMatched {
				capacity, capErr := rc.registry.ClusterCapacityForModel(ctx, cfg.ModelName, candidateNodeIDs)
				if capErr == nil && capacity > 0 {
					xlog.Info("Reconciler: prefix-cache forced-disturb pressure, scaling up",
						"model", cfg.ModelName, "current", current,
						"pressure", pressureCount,
						"threshold", rc.pressureThreshold)
					if rc.scaleUp(ctx, cfg, 1) {
						scaledUp = true
						// Consume the signal only on a real scale-up:
						// Pressure.Count is non-draining (it prunes only by
						// age), so a single burst stays in-window for the whole
						// window and would re-fire scaleUp on every tick. Reset
						// clears the model's events so a fresh scale-up needs
						// fresh forced-disturbs to accumulate. If scaleUp added
						// nothing (scheduler errored or no node could be loaded)
						// we preserve the signal so the next tick retries off
						// the same accumulated pressure instead of having to
						// re-accumulate a full window from scratch.
						rc.pressure.Reset(cfg.ModelName)
					}
				}
				// No capacity: transient demand, not a misconfig - let the next
				// tick retry naturally (mirrors the busy-burst path's choice not
				// to enter cooldown for burst load).
			}
		}
	}
	// 3. Scale down idle replicas above minimum. Skipped in a tick that already
	// scaled up so we never scale up and down in the same pass.
	if !scaledUp {
		// The floor is never below 1: this step does not remove the last
		// replica even when MinReplicas is 0.
		floor := max(cfg.MinReplicas, 1)
		if int(current) > floor {
			rc.scaleDownIdle(ctx, cfg, int(current), floor)
		}
	}
}
// markCapacityProblem bumps the model's unsatisfiable-tick counter and, once
// the counter reaches unsatisfiableTickThreshold, stamps a cooldown deadline
// unsatisfiableCooldown into the future and logs the reason. reconcileModel
// calls it from the MinReplicas path, both when the selector matches no nodes
// and when cluster capacity is zero, so the two cases are reported the same
// way. Registry failures are logged and end the call without further effect.
func (rc *ReplicaReconciler) markCapacityProblem(ctx context.Context, modelName, reason string) {
	ticks, bumpErr := rc.registry.BumpUnsatisfiableTicks(ctx, modelName)
	if bumpErr != nil {
		xlog.Warn("Reconciler: failed to bump unsatisfiable counter", "model", modelName, "error", bumpErr)
		return
	}
	if ticks < unsatisfiableTickThreshold {
		// Still within the grace period; try again next tick.
		return
	}

	retryAfter := time.Now().Add(unsatisfiableCooldown)
	if markErr := rc.registry.MarkUnsatisfiable(ctx, modelName, retryAfter); markErr != nil {
		xlog.Warn("Reconciler: failed to mark unsatisfiable", "model", modelName, "error", markErr)
		return
	}
	xlog.Warn("Reconciler: scheduling unsatisfiable, entering cooldown",
		"model", modelName, "reason", reason,
		"cooldown", unsatisfiableCooldown, "retry_after", retryAfter.Format(time.RFC3339))
}
// scaleUp asks the scheduler for up to count additional replicas of the
// model, stopping at the first scheduling failure. Callers in reconcileModel
// are expected to have capped count against ClusterCapacityForModel already,
// so this function does not re-check capacity.
//
// It returns true when at least one replica was scheduled. Callers rely on
// that to gate signal-consuming side effects (Pressure.Reset,
// ClearUnsatisfiable): a failed or no-op scale-up must leave the accumulated
// forced-disturb pressure and the unsatisfiable hysteresis untouched so the
// next tick can simply retry.
func (rc *ReplicaReconciler) scaleUp(ctx context.Context, cfg ModelSchedulingConfig, count int) bool {
	if rc.scheduler == nil {
		xlog.Warn("Reconciler: no scheduler available, cannot scale up")
		return false
	}

	// Selector → candidate node IDs (nil without a selector, meaning "any
	// healthy node"). reconcileModel handles the mismatch case upstream; the
	// check here is purely defensive.
	candidates, matched := rc.candidateNodeIDsForSelector(ctx, cfg)
	if !matched {
		return false
	}

	added := false
	for attempt := 1; attempt <= count; attempt++ {
		node, err := rc.scheduler.ScheduleAndLoadModel(ctx, cfg.ModelName, candidates)
		if err != nil {
			xlog.Warn("Reconciler: failed to scale up replica", "model", cfg.ModelName,
				"attempt", attempt, "error", err)
			// Give up on the first failure; report what was added so far.
			return added
		}
		added = true
		xlog.Info("Reconciler: scaled up replica", "model", cfg.ModelName, "node", node.Name)
	}
	return added
}
// scaleDownIdle removes idle replicas above the floor.
//
// current is the loaded replica count and floor the count to keep; at most
// current-floor replicas are removed. A replica is eligible when it is in
// state "loaded", has no in-flight requests and its last_used is older than
// scaleDownDelay. Each removal deletes the registry row first and then asks
// the worker to unload; an unload failure is logged but the replica still
// counts as removed. The call is a no-op without an unloader.
func (rc *ReplicaReconciler) scaleDownIdle(ctx context.Context, cfg ModelSchedulingConfig, current, floor int) {
	if rc.unloader == nil {
		return
	}
	toRemove := current - floor
	if toRemove <= 0 {
		// Nothing above the floor: skip the query entirely.
		return
	}
	// Find idle replicas that have been unused for longer than scaleDownDelay.
	// Order by replica_index DESC first, then last_used ASC: trim the
	// highest-indexed replicas first so subsequent scale-ups can reuse the
	// low indexes via NextFreeReplicaIndex, keeping slot allocation compact
	// and matching the worker supervisor's port-recycling behavior.
	cutoff := time.Now().Add(-rc.scaleDownDelay)
	var idleModels []NodeModel
	err := rc.registry.db.WithContext(ctx).
		Where("model_name = ? AND state = ? AND in_flight = 0 AND last_used < ?",
			cfg.ModelName, "loaded", cutoff).
		Order("replica_index DESC, last_used ASC").
		Find(&idleModels).Error
	if err != nil {
		// A failed query must not pass for "no idle replicas": surface it.
		xlog.Warn("Reconciler: failed to list idle replicas", "model", cfg.ModelName, "error", err)
		return
	}
	removed := 0
	for _, nm := range idleModels {
		if removed >= toRemove {
			break
		}
		// Remove this specific replica row from registry (sibling replicas of
		// the same model on the same node, if any, are unaffected).
		if err := rc.registry.RemoveNodeModel(ctx, nm.NodeID, nm.ModelName, nm.ReplicaIndex); err != nil {
			xlog.Warn("Reconciler: failed to remove model record", "node", nm.NodeID, "model", nm.ModelName, "replica", nm.ReplicaIndex, "error", err)
			continue
		}
		// Unload from worker
		if err := rc.unloader.UnloadModelOnNode(nm.NodeID, nm.ModelName); err != nil {
			xlog.Warn("Reconciler: unload failed (model already removed from registry)",
				"node", nm.NodeID, "model", nm.ModelName, "error", err)
		}
		xlog.Info("Reconciler: scaled down idle replica", "model", cfg.ModelName, "node", nm.NodeID, "replica", nm.ReplicaIndex)
		removed++
	}
}
// allReplicasBusy returns true if all loaded replicas of a model have
// in-flight requests, i.e. no row in state "loaded" has in_flight = 0.
//
// A model with no loaded replicas also yields true; the caller guards
// against that by checking the replica count first. If the count query
// fails the function logs the error and returns false, so a database
// failure cannot be mistaken for saturation and trigger a spurious scale-up.
func (rc *ReplicaReconciler) allReplicasBusy(ctx context.Context, modelName string) bool {
	var idleCount int64
	err := rc.registry.db.WithContext(ctx).Model(&NodeModel{}).
		Where("model_name = ? AND state = ? AND in_flight = 0", modelName, "loaded").
		Count(&idleCount).Error
	if err != nil {
		xlog.Warn("Reconciler: failed to count idle replicas", "model", modelName, "error", err)
		return false
	}
	return idleCount == 0
}
// parseSelector turns a JSON-encoded node selector into a label map. An empty
// string yields nil without touching the decoder; input that fails to decode
// is logged and also yields nil, so callers cannot distinguish "no selector"
// from "bad selector" by the return value alone.
func parseSelector(selectorJSON string) map[string]string {
	if len(selectorJSON) == 0 {
		return nil
	}

	var labels map[string]string
	err := json.Unmarshal([]byte(selectorJSON), &labels)
	if err == nil {
		return labels
	}

	xlog.Warn("Failed to parse node selector", "selector", selectorJSON, "error", err)
	return nil
}