mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-14 19:58:44 -04:00
* feat(distributed): add SpreadAll column and authoritative scheduling seeding Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(distributed): parse declarative model scheduling config (env/file) Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(distributed): reconcile spread_all to one replica per matching node Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(distributed): wire LOCALAI_MODEL_SCHEDULING env/args and startup seeding Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(distributed): expose spread_all on the scheduling API endpoint Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(distributed): add spread-to-all-nodes mode to the scheduling UI Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * docs(distributed): document LOCALAI_MODEL_SCHEDULING env/args Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * docs(distributed): clarify replica modes and all-nodes spread in scheduling config Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
1907 lines
82 KiB
Go
1907 lines
82 KiB
Go
package nodes
|
||
|
||
import (
|
||
"context"
|
||
"errors"
|
||
"fmt"
|
||
"sync/atomic"
|
||
"time"
|
||
|
||
"github.com/google/uuid"
|
||
"github.com/mudler/LocalAI/core/services/advisorylock"
|
||
"github.com/mudler/xlog"
|
||
"gorm.io/gorm"
|
||
"gorm.io/gorm/clause"
|
||
)
|
||
|
||
// BackendNode represents a remote worker node.
|
||
// Workers are generic — they don't have a fixed backend type.
|
||
// The SmartRouter dynamically installs backends via NATS backend.install events.
|
||
type BackendNode struct {
|
||
ID string `gorm:"primaryKey;size:36" json:"id"`
|
||
Name string `gorm:"uniqueIndex;size:255" json:"name"`
|
||
NodeType string `gorm:"size:32;default:backend" json:"node_type"` // backend, agent
|
||
Address string `gorm:"size:255" json:"address"` // host:port for gRPC
|
||
HTTPAddress string `gorm:"size:255" json:"http_address"` // host:port for HTTP file transfer
|
||
Status string `gorm:"size:32;default:registering" json:"status"` // registering, healthy, unhealthy, draining, pending
|
||
TokenHash string `gorm:"size:64" json:"-"` // SHA-256 of registration token
|
||
TotalVRAM uint64 `gorm:"column:total_vram" json:"total_vram"` // Total GPU VRAM in bytes
|
||
AvailableVRAM uint64 `gorm:"column:available_vram" json:"available_vram"` // Available GPU VRAM in bytes
|
||
// ReservedVRAM is a soft, in-tick reservation deducted by the scheduler when
|
||
// it picks this node to load a model. Workers reset it back to 0 on each
|
||
// heartbeat (the worker is the source of truth for actual free VRAM); the
|
||
// reservation is only here to keep two scheduling decisions within the
|
||
// same heartbeat window from over-committing the same node.
|
||
ReservedVRAM uint64 `gorm:"column:reserved_vram;default:0" json:"reserved_vram"`
|
||
TotalRAM uint64 `gorm:"column:total_ram" json:"total_ram"` // Total system RAM in bytes (fallback when no GPU)
|
||
AvailableRAM uint64 `gorm:"column:available_ram" json:"available_ram"` // Available system RAM in bytes
|
||
GPUVendor string `gorm:"column:gpu_vendor;size:32" json:"gpu_vendor"` // nvidia, amd, intel, vulkan, unknown
|
||
// MaxReplicasPerModel caps how many replicas of any one model can run on
|
||
// this node concurrently. Default 1 preserves the historical "one
|
||
// (node, model)" assumption; set higher (via worker --max-replicas-per-model)
|
||
// to allow stacking replicas on a fat node.
|
||
MaxReplicasPerModel int `gorm:"column:max_replicas_per_model;default:1" json:"max_replicas_per_model"`
|
||
// MaxReplicasPerModelManuallySet flags the value above as a UI-set
|
||
// admin override. When true, the worker's CLI value is ignored on
|
||
// re-registration so the override survives worker restarts. Cleared
|
||
// by an explicit "reset to worker default" action.
|
||
MaxReplicasPerModelManuallySet bool `gorm:"column:max_replicas_per_model_manually_set;default:false" json:"max_replicas_per_model_manually_set"`
|
||
APIKeyID string `gorm:"size:36" json:"-"` // auto-provisioned API key ID (for cleanup)
|
||
AuthUserID string `gorm:"size:36" json:"-"` // auto-provisioned user ID (for cleanup)
|
||
LastHeartbeat time.Time `gorm:"column:last_heartbeat" json:"last_heartbeat"`
|
||
CreatedAt time.Time `json:"created_at"`
|
||
UpdatedAt time.Time `json:"updated_at"`
|
||
}
|
||
|
||
const (
|
||
NodeTypeBackend = "backend"
|
||
NodeTypeAgent = "agent"
|
||
|
||
StatusHealthy = "healthy"
|
||
StatusPending = "pending"
|
||
StatusOffline = "offline"
|
||
StatusDraining = "draining"
|
||
StatusUnhealthy = "unhealthy"
|
||
|
||
// Column names (must match gorm:"column:" tags on BackendNode)
|
||
ColAvailableVRAM = "available_vram"
|
||
ColTotalVRAM = "total_vram"
|
||
ColReservedVRAM = "reserved_vram"
|
||
ColAvailableRAM = "available_ram"
|
||
ColGPUVendor = "gpu_vendor"
|
||
ColLastHeartbeat = "last_heartbeat"
|
||
ColMaxReplicasPerModel = "max_replicas_per_model"
|
||
)
|
||
|
||
// NodeModel tracks which models are loaded on which nodes.
|
||
//
|
||
// Multiple replicas of the same model on the same node are allowed; each
|
||
// replica has its own ReplicaIndex (0..MaxReplicasPerModel-1), its own
|
||
// gRPC Address (each replica is a separate worker process on its own port),
|
||
// and its own InFlight counter.
|
||
type NodeModel struct {
|
||
ID string `gorm:"primaryKey;size:36" json:"id"`
|
||
NodeID string `gorm:"index;size:36" json:"node_id"`
|
||
ModelName string `gorm:"index;size:255" json:"model_name"`
|
||
ReplicaIndex int `gorm:"column:replica_index;default:0;index" json:"replica_index"`
|
||
Address string `gorm:"size:255" json:"address"` // gRPC address for this replica's backend process
|
||
State string `gorm:"size:32;default:idle" json:"state"` // loading, loaded, unloading, idle
|
||
InFlight int `json:"in_flight"` // number of active requests on this replica
|
||
LastUsed time.Time `json:"last_used"`
|
||
LoadingBy string `gorm:"size:36" json:"loading_by,omitempty"` // frontend ID that triggered loading
|
||
BackendType string `gorm:"size:128" json:"backend_type,omitempty"` // e.g. "llama-cpp"; used by reconciler to replicate loads
|
||
ModelOptsBlob []byte `gorm:"type:bytea" json:"-"` // serialized pb.ModelOptions for replica scale-ups
|
||
CreatedAt time.Time `json:"created_at"`
|
||
UpdatedAt time.Time `json:"updated_at"`
|
||
}
|
||
|
||
// ModelLoadInfo is per-model load metadata kept independently of NodeModel rows
|
||
// so the Replica Reconciler can re-load a model after every replica row has
|
||
// been removed (worker death, eviction, MarkOffline reaping, frontend restart
|
||
// with stale heartbeats).
|
||
//
|
||
// Why a separate table when the same blob is also stamped on each NodeModel
|
||
// row? NodeModel rows are tied to a live (node, replica) slot and get deleted
|
||
// when a backend stops being healthy. Tying the only copy of load info to
|
||
// that lifecycle is exactly what caused Bug-1: a frontend restart followed by
|
||
// transient worker-row removal left no copy of ModelOptions, so the reconciler
|
||
// could not bring `min_replicas` back without a fresh inference request.
|
||
//
|
||
// Keyed by ModelName (the tracking key used by the router); last-write-wins
|
||
// on the opts blob because two concurrent frontends dispatching the same
|
||
// model with slightly different opts converge on whichever finished last.
|
||
// That is identical to the per-NodeModel-row semantics today; if a stronger
|
||
// guarantee is needed in the future, the row carries UpdatedAt for ordering.
|
||
type ModelLoadInfo struct {
|
||
ModelName string `gorm:"primaryKey;size:255" json:"model_name"`
|
||
BackendType string `gorm:"size:128" json:"backend_type"`
|
||
ModelOptsBlob []byte `gorm:"type:bytea" json:"-"`
|
||
CreatedAt time.Time `json:"created_at"`
|
||
UpdatedAt time.Time `json:"updated_at"`
|
||
}
|
||
|
||
// NodeLabel is a key-value label on a node (like K8s labels).
|
||
type NodeLabel struct {
|
||
ID string `gorm:"primaryKey;size:36" json:"id"`
|
||
NodeID string `gorm:"uniqueIndex:idx_node_label;size:36" json:"node_id"`
|
||
Key string `gorm:"uniqueIndex:idx_node_label;size:128" json:"key"`
|
||
Value string `gorm:"size:255" json:"value"`
|
||
}
|
||
|
||
// ModelSchedulingConfig defines how a model should be scheduled across the cluster.
|
||
// All fields are optional:
|
||
// - NodeSelector only → constrain nodes, single replica
|
||
// - MinReplicas/MaxReplicas only → auto-scale on any node
|
||
// - Both → auto-scale on matching nodes
|
||
// - Neither → no-op (default behavior)
|
||
//
|
||
// Auto-scaling is enabled when MinReplicas > 0, MaxReplicas > 0, or SpreadAll is set.
|
||
type ModelSchedulingConfig struct {
|
||
ID string `gorm:"primaryKey;size:36" json:"id"`
|
||
ModelName string `gorm:"uniqueIndex;size:255" json:"model_name"`
|
||
NodeSelector string `gorm:"type:text" json:"node_selector,omitempty"` // JSON {"key":"value",...}
|
||
MinReplicas int `gorm:"default:0" json:"min_replicas"`
|
||
MaxReplicas int `gorm:"default:0" json:"max_replicas"`
|
||
// SpreadAll requests one replica on every node matching NodeSelector
|
||
// (every healthy backend node when the selector is empty), tracked as
|
||
// nodes join and leave. Mutually exclusive with MinReplicas/MaxReplicas.
|
||
// The reconciler turns this into a dynamic Min==Max target each tick.
|
||
SpreadAll bool `gorm:"column:spread_all;default:false" json:"spread_all,omitempty"`
|
||
// Prefix-cache-aware routing (epic #10063). RoutePolicy "" means inherit
|
||
// the cluster-wide default. Thresholds are per-model overrides; 0 means
|
||
// inherit the global default.
|
||
RoutePolicy string `gorm:"column:route_policy;size:32" json:"route_policy,omitempty"`
|
||
BalanceAbsThreshold int `gorm:"column:balance_abs_threshold;default:0" json:"balance_abs_threshold,omitempty"`
|
||
BalanceRelThreshold float64 `gorm:"column:balance_rel_threshold;default:0" json:"balance_rel_threshold,omitempty"`
|
||
MinPrefixMatch float64 `gorm:"column:min_prefix_match;default:0" json:"min_prefix_match,omitempty"`
|
||
// UnsatisfiableUntil is set by the reconciler when no candidate node has
|
||
// free capacity for this model; while in the future, the reconciler skips
|
||
// scale-up attempts for this model. Cleared on cluster events that could
|
||
// change capacity (new node registers, node approved, labels change,
|
||
// max-replicas-per-model changes) or when the cooldown expires.
|
||
UnsatisfiableUntil *time.Time `gorm:"column:unsatisfiable_until" json:"unsatisfiable_until,omitempty"`
|
||
// UnsatisfiableTicks is hysteresis: incremented each tick capacity==0,
|
||
// promoted to UnsatisfiableUntil once it crosses a small threshold to
|
||
// avoid one-tick flaps. Reset on any successful scale-up.
|
||
UnsatisfiableTicks int `gorm:"column:unsatisfiable_ticks;default:0" json:"unsatisfiable_ticks"`
|
||
CreatedAt time.Time `json:"created_at"`
|
||
UpdatedAt time.Time `json:"updated_at"`
|
||
}
|
||
|
||
// NodeWithExtras extends BackendNode with computed fields for list views.
|
||
type NodeWithExtras struct {
|
||
BackendNode
|
||
ModelCount int `json:"model_count"`
|
||
InFlightCount int `json:"in_flight_count"`
|
||
Labels map[string]string `json:"labels,omitempty"`
|
||
}
|
||
|
||
// PendingBackendOp is a durable intent for a backend lifecycle operation
|
||
// (delete/install/upgrade) that needs to eventually apply on a specific node.
|
||
//
|
||
// Without this table, a backend delete against an offline node silently
|
||
// dropped: the frontend skipped the node, the node came back later with the
|
||
// backend still installed, and the operator saw a zombie. Now the intent is
|
||
// recorded regardless of node status; the state reconciler drains the queue
|
||
// whenever a node is healthy and removes the row on success. Reissuing the
|
||
// same operation while a row exists updates NextRetryAt instead of stacking
|
||
// duplicates (see the unique index).
|
||
type PendingBackendOp struct {
|
||
ID uint `gorm:"primaryKey;autoIncrement" json:"id"`
|
||
NodeID string `gorm:"index;size:36;not null;uniqueIndex:idx_pending_backend_op,priority:1" json:"node_id"`
|
||
Backend string `gorm:"index;size:255;not null;uniqueIndex:idx_pending_backend_op,priority:2" json:"backend"`
|
||
Op string `gorm:"size:16;not null;uniqueIndex:idx_pending_backend_op,priority:3" json:"op"`
|
||
Galleries []byte `gorm:"type:bytea" json:"-"` // serialized JSON for install/upgrade retries
|
||
Attempts int `gorm:"default:0" json:"attempts"`
|
||
LastError string `gorm:"type:text" json:"last_error,omitempty"`
|
||
CreatedAt time.Time `json:"created_at"`
|
||
NextRetryAt time.Time `gorm:"index" json:"next_retry_at"`
|
||
}
|
||
|
||
// Op constants mirror the operation names used by DistributedBackendManager
|
||
// so callers don't repeat stringly-typed values.
|
||
const (
|
||
OpBackendDelete = "delete"
|
||
OpBackendInstall = "install"
|
||
OpBackendUpgrade = "upgrade"
|
||
)
|
||
|
||
// NodeRegistry manages backend node registration and lookup in PostgreSQL.
|
||
type NodeRegistry struct {
|
||
db *gorm.DB
|
||
// replicaRemovedHook is invoked after a replica row for (modelName, nodeID)
|
||
// is removed. It is the single chokepoint that lets the prefix-cache index
|
||
// be invalidated no matter which removal path (router eviction, reconciler
|
||
// scale-down, probe reaper, health-monitor reap, RemoteUnloaderAdapter) ran.
|
||
// The replicaIndex argument is the SPECIFIC replica removed, or negative to
|
||
// signal "all replicas of (modelName, nodeID)". Stored in an atomic.Pointer
|
||
// so the startup wiring (setter) and request / reconcile handling (fire) are
|
||
// race-free.
|
||
replicaRemovedHook atomic.Pointer[func(modelName, nodeID string, replicaIndex int)]
|
||
}
|
||
|
||
// SetReplicaRemovedHook registers a callback invoked after a replica row for
|
||
// (modelName, nodeID) is removed from the registry. replicaIndex is the
|
||
// specific replica removed, or negative to mean "all replicas of the node".
|
||
// Used to invalidate the prefix-cache index so it never points at a replica
|
||
// that no longer hosts the model. Set once at startup before serving. Safe to
|
||
// leave unset (no-op).
|
||
func (r *NodeRegistry) SetReplicaRemovedHook(fn func(modelName, nodeID string, replicaIndex int)) {
|
||
r.replicaRemovedHook.Store(&fn)
|
||
}
|
||
|
||
// fireReplicaRemoved invokes the replica-removed hook if one is set. A negative
|
||
// replicaIndex means all replicas of (modelName, nodeID). Nil-safe.
|
||
func (r *NodeRegistry) fireReplicaRemoved(modelName, nodeID string, replicaIndex int) {
|
||
if fn := r.replicaRemovedHook.Load(); fn != nil && *fn != nil {
|
||
(*fn)(modelName, nodeID, replicaIndex)
|
||
}
|
||
}
|
||
|
||
// nodeModelNames returns the DISTINCT model names that have node_models rows
|
||
// for nodeID, using db (which may be a transaction handle). Used by the bulk
|
||
// node-scoped delete paths (Register re-register cleanup, MarkOffline,
|
||
// MarkDraining, Deregister) to capture what will be removed BEFORE the delete
|
||
// so they can fire the replica-removed hook once per distinct model afterwards
|
||
// and keep the prefix-cache index from pointing at a node that no longer hosts
|
||
// the model. Skips the query entirely when no hook is set (these are lifecycle
|
||
// ops, not the request hot path, but the query is pure overhead with no hook).
|
||
func (r *NodeRegistry) nodeModelNames(ctx context.Context, db *gorm.DB, nodeID string) []string {
|
||
if fn := r.replicaRemovedHook.Load(); fn == nil || *fn == nil {
|
||
return nil
|
||
}
|
||
var names []string
|
||
if err := db.WithContext(ctx).Model(&NodeModel{}).
|
||
Where("node_id = ?", nodeID).
|
||
Distinct().
|
||
Pluck("model_name", &names).Error; err != nil {
|
||
// Non-fatal: proceed with the delete, just skip hook invalidation.
|
||
// A stale prefix-cache entry self-heals on the next routing miss.
|
||
xlog.Warn("Failed to enumerate node models before bulk delete; skipping prefix-cache invalidation", "node", nodeID, "error", err)
|
||
return nil
|
||
}
|
||
return names
|
||
}
|
||
|
||
// NewNodeRegistry creates a NodeRegistry and auto-migrates the schema.
|
||
// Uses a PostgreSQL advisory lock to prevent concurrent migration races
|
||
// when multiple instances (frontend + workers) start at the same time.
|
||
func NewNodeRegistry(db *gorm.DB) (*NodeRegistry, error) {
|
||
if err := advisorylock.WithLockCtx(context.Background(), db, advisorylock.KeySchemaMigrate, func() error {
|
||
return db.AutoMigrate(&BackendNode{}, &NodeModel{}, &NodeLabel{}, &ModelSchedulingConfig{}, &PendingBackendOp{}, &ModelLoadInfo{})
|
||
}); err != nil {
|
||
return nil, fmt.Errorf("migrating node tables: %w", err)
|
||
}
|
||
|
||
// One-shot cleanup of queue rows that can never drain: ops targeted at
|
||
// agent workers (wrong subscription set), at non-existent nodes, or with
|
||
// an empty backend name. The guard in enqueueAndDrainBackendOp prevents
|
||
// new ones from being written, but rows persisted by earlier versions
|
||
// keep the reconciler busy retrying a permanently-failing NATS request
|
||
// every 30s. Guarded by the same migration advisory lock so only one
|
||
// frontend runs it.
|
||
_ = advisorylock.WithLockCtx(context.Background(), db, advisorylock.KeySchemaMigrate, func() error {
|
||
res := db.Exec(`
|
||
DELETE FROM pending_backend_ops
|
||
WHERE backend = ''
|
||
OR node_id NOT IN (SELECT id FROM backend_nodes WHERE node_type = ? OR node_type = '')
|
||
`, NodeTypeBackend)
|
||
if res.Error != nil {
|
||
xlog.Warn("Failed to prune malformed pending_backend_ops rows", "error", res.Error)
|
||
return res.Error
|
||
}
|
||
if res.RowsAffected > 0 {
|
||
xlog.Info("Pruned pending_backend_ops rows (wrong node type or empty backend)", "count", res.RowsAffected)
|
||
}
|
||
return nil
|
||
})
|
||
|
||
return &NodeRegistry{db: db}, nil
|
||
}
|
||
|
||
// Register adds or updates a backend node.
|
||
// If autoApprove is true, the node goes directly to "healthy" status.
|
||
// If false, new nodes start in "pending" status and must be approved by an admin.
|
||
// On re-registration (same name), previously approved nodes return to "healthy";
|
||
// nodes that were never approved stay in "pending".
|
||
func (r *NodeRegistry) Register(ctx context.Context, node *BackendNode, autoApprove bool) error {
|
||
node.LastHeartbeat = time.Now()
|
||
|
||
// Try to find existing node by name
|
||
var existing BackendNode
|
||
err := r.db.WithContext(ctx).Where("name = ?", node.Name).First(&existing).Error
|
||
if err == nil {
|
||
// Re-registration (node restart): preserve ID, respect approval history
|
||
node.ID = existing.ID
|
||
if autoApprove || existing.Status != StatusPending {
|
||
// Auto-approve enabled, or node was previously approved — restore healthy
|
||
node.Status = StatusHealthy
|
||
} else {
|
||
// Node was never approved — keep pending
|
||
node.Status = StatusPending
|
||
}
|
||
// Preserve admin overrides from re-registration. Without this,
|
||
// every worker restart silently reverts the UI-set value back to
|
||
// the worker's CLI flag (default 1) — a footgun for operators who
|
||
// configure capacity from the UI without touching the worker flag.
|
||
updateDB := r.db.WithContext(ctx).Model(&existing)
|
||
if existing.MaxReplicasPerModelManuallySet {
|
||
updateDB = updateDB.Omit("max_replicas_per_model", "max_replicas_per_model_manually_set")
|
||
// Reflect the persisted value back so the caller sees what the
|
||
// scheduler will actually use.
|
||
node.MaxReplicasPerModel = existing.MaxReplicasPerModel
|
||
node.MaxReplicasPerModelManuallySet = true
|
||
}
|
||
if err := updateDB.Updates(node).Error; err != nil {
|
||
return fmt.Errorf("updating node %s: %w", node.Name, err)
|
||
}
|
||
// Preserve auth references from existing record.
|
||
// GORM Updates(struct) skips zero-value fields, so the DB retains
|
||
// the old auth_user_id/api_key_id but the caller's struct is empty.
|
||
// Copy them back so the caller can revoke old credentials on re-registration.
|
||
if node.AuthUserID == "" {
|
||
node.AuthUserID = existing.AuthUserID
|
||
}
|
||
if node.APIKeyID == "" {
|
||
node.APIKeyID = existing.APIKeyID
|
||
}
|
||
// Clear stale model records — the node restarted and has nothing loaded.
|
||
// Capture the distinct models and run the bulk delete inside a single
|
||
// transaction so the set of fired hooks equals exactly the set of rows
|
||
// deleted: a SetNodeModel landing between the capture and the delete can
|
||
// no longer be deleted without its hook firing (no interleaving gap).
|
||
// Fire the hooks only after the transaction commits so a rollback does
|
||
// not invalidate the prefix-cache index for a removal that did not
|
||
// persist (the single chokepoint must cover this path too).
|
||
var removedModels []string
|
||
if err := r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
|
||
removedModels = r.nodeModelNames(ctx, tx, existing.ID)
|
||
return tx.Where("node_id = ?", existing.ID).Delete(&NodeModel{}).Error
|
||
}); err != nil {
|
||
xlog.Warn("Failed to clear stale model records on re-register", "node", node.Name, "error", err)
|
||
} else {
|
||
for _, m := range removedModels {
|
||
r.fireReplicaRemoved(m, existing.ID, -1)
|
||
}
|
||
}
|
||
} else if errors.Is(err, gorm.ErrRecordNotFound) {
|
||
// Create new node
|
||
if node.ID == "" {
|
||
node.ID = uuid.New().String()
|
||
}
|
||
if autoApprove {
|
||
node.Status = StatusHealthy
|
||
} else {
|
||
node.Status = StatusPending
|
||
}
|
||
if err := r.db.WithContext(ctx).Create(node).Error; err != nil {
|
||
return fmt.Errorf("creating node %s: %w", node.Name, err)
|
||
}
|
||
} else {
|
||
return fmt.Errorf("looking up node %s: %w", node.Name, err)
|
||
}
|
||
|
||
xlog.Info("Node registered", "name", node.Name, "address", node.Address, "status", node.Status)
|
||
// Cluster capacity may have changed: a new healthy node, a returning
|
||
// node, or one with different MaxReplicasPerModel. Wake any configs the
|
||
// reconciler put in cooldown — the next tick will re-flag if still
|
||
// unsatisfiable. Best-effort; logged but non-fatal.
|
||
if err := r.ClearAllUnsatisfiable(ctx); err != nil {
|
||
xlog.Warn("Failed to clear unsatisfiable scheduling flags on register", "error", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// UpdateAuthRefs stores the auto-provisioned user and API key IDs on a node.
|
||
func (r *NodeRegistry) UpdateAuthRefs(ctx context.Context, nodeID, authUserID, apiKeyID string) error {
|
||
return r.db.WithContext(ctx).Model(&BackendNode{}).Where("id = ?", nodeID).Updates(map[string]any{
|
||
"auth_user_id": authUserID,
|
||
"api_key_id": apiKeyID,
|
||
}).Error
|
||
}
|
||
|
||
// ApproveNode sets a pending node's status to healthy.
|
||
func (r *NodeRegistry) ApproveNode(ctx context.Context, nodeID string) error {
|
||
result := r.db.WithContext(ctx).Model(&BackendNode{}).
|
||
Where("id = ? AND status = ?", nodeID, StatusPending).
|
||
Update("status", StatusHealthy)
|
||
if result.Error != nil {
|
||
return fmt.Errorf("approving node %s: %w", nodeID, result.Error)
|
||
}
|
||
if result.RowsAffected == 0 {
|
||
return fmt.Errorf("node %s not found or not in pending status", nodeID)
|
||
}
|
||
// pending → healthy adds cluster capacity; clear any cooldown flags so
|
||
// the next reconciler tick can use the new node.
|
||
if err := r.ClearAllUnsatisfiable(ctx); err != nil {
|
||
xlog.Warn("Failed to clear unsatisfiable scheduling flags on approve", "error", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// setStatus updates a node's status column in the database.
|
||
func (r *NodeRegistry) setStatus(ctx context.Context, nodeID, status string) error {
|
||
result := r.db.WithContext(ctx).Model(&BackendNode{}).
|
||
Where("id = ?", nodeID).Update("status", status)
|
||
if result.Error != nil {
|
||
return fmt.Errorf("setting node %s to %s: %w", nodeID, status, result.Error)
|
||
}
|
||
if result.RowsAffected == 0 {
|
||
return fmt.Errorf("node %s not found", nodeID)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// MarkOffline sets a node to offline status and clears its model records.
|
||
// Used on graceful shutdown — preserves the node row so re-registration
|
||
// can restore the previous approval status.
|
||
func (r *NodeRegistry) MarkOffline(ctx context.Context, nodeID string) error {
|
||
if err := r.setStatus(ctx, nodeID, StatusOffline); err != nil {
|
||
return err
|
||
}
|
||
// Clear model records — node is shutting down. Capture the distinct models
|
||
// and run the bulk delete inside a single transaction so the set of fired
|
||
// hooks equals exactly the set of rows deleted: a SetNodeModel landing
|
||
// between the capture and the delete can no longer be deleted without its
|
||
// hook firing (no interleaving gap). The status flip above is a separate,
|
||
// pre-existing operation and routing already filters non-healthy nodes, so
|
||
// it stays outside this transaction. Fire hooks only after commit so a
|
||
// rollback does not invalidate the index for a removal that did not persist.
|
||
var removedModels []string
|
||
if err := r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
|
||
removedModels = r.nodeModelNames(ctx, tx, nodeID)
|
||
return tx.Where("node_id = ?", nodeID).Delete(&NodeModel{}).Error
|
||
}); err != nil {
|
||
xlog.Warn("Failed to clear model records on offline", "node", nodeID, "error", err)
|
||
} else {
|
||
for _, m := range removedModels {
|
||
r.fireReplicaRemoved(m, nodeID, -1)
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// FindNodeWithVRAM returns healthy nodes with at least minBytes effectively-
|
||
// available VRAM (available_vram - reserved_vram), ordered idle-first then
|
||
// least-loaded. The reserved_vram subtraction is the in-tick soft reservation
|
||
// that prevents two scheduling decisions in the same heartbeat window from
|
||
// over-committing the same node.
|
||
func (r *NodeRegistry) FindNodeWithVRAM(ctx context.Context, minBytes uint64) (*BackendNode, error) {
|
||
db := r.db.WithContext(ctx)
|
||
|
||
loadedModels := db.Model(&NodeModel{}).
|
||
Select("node_id").
|
||
Where("state = ?", "loaded").
|
||
Group("node_id")
|
||
|
||
subquery := db.Model(&NodeModel{}).
|
||
Select("node_id, COALESCE(SUM(in_flight), 0) as total_inflight").
|
||
Group("node_id")
|
||
|
||
// Try idle nodes with enough effectively-free VRAM first, prefer the one
|
||
// with most free VRAM (after deducting the in-tick reservation).
|
||
var node BackendNode
|
||
err := db.Where("status = ? AND node_type = ? AND (available_vram - reserved_vram) >= ? AND id NOT IN (?)",
|
||
StatusHealthy, NodeTypeBackend, minBytes, loadedModels).
|
||
Order("(available_vram - reserved_vram) DESC").
|
||
First(&node).Error
|
||
if err == nil {
|
||
return &node, nil
|
||
}
|
||
|
||
// Fall back to least-loaded nodes with enough effectively-free VRAM
|
||
err = db.Where("status = ? AND node_type = ? AND (available_vram - reserved_vram) >= ?",
|
||
StatusHealthy, NodeTypeBackend, minBytes).
|
||
Joins("LEFT JOIN (?) AS load ON load.node_id = backend_nodes.id", subquery).
|
||
Order("COALESCE(load.total_inflight, 0) ASC, (backend_nodes.available_vram - backend_nodes.reserved_vram) DESC").
|
||
First(&node).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("no healthy nodes with %d bytes available VRAM: %w", minBytes, err)
|
||
}
|
||
return &node, nil
|
||
}
|
||
|
||
// ErrInsufficientVRAM signals that ReserveVRAM could not deduct the requested
|
||
// amount because the node's effectively-free VRAM has dropped below it
|
||
// (raced with another scheduler tick or with a heartbeat reset).
|
||
var ErrInsufficientVRAM = errors.New("insufficient effectively-free VRAM on node")
|
||
|
||
// ReserveVRAM atomically deducts `bytes` from the node's effectively-free
|
||
// VRAM (available_vram - reserved_vram). The UPDATE's WHERE clause does the
|
||
// admission check inside the database so two concurrent scheduling ticks
|
||
// can't both succeed when only one fits — whichever lands first reserves
|
||
// the slot, the other gets ErrInsufficientVRAM and falls through to the
|
||
// next candidate node.
|
||
//
|
||
// `bytes` may be 0 (e.g. when the model size estimator declines), in which
|
||
// case ReserveVRAM is a no-op — leaving accounting alone is preferable to
|
||
// reserving 0 (which would still bump no rows but is conceptually wrong).
|
||
//
|
||
// Worker heartbeats reset reserved_vram to 0 because the worker is the
|
||
// authoritative source for actual free VRAM. This is what makes the
|
||
// "soft" in soft-reservation: it's only honored within one heartbeat
|
||
// window; longer-term accounting comes from the worker's own readings.
|
||
func (r *NodeRegistry) ReserveVRAM(ctx context.Context, nodeID string, bytes uint64) error {
|
||
if bytes == 0 {
|
||
return nil
|
||
}
|
||
res := r.db.WithContext(ctx).Model(&BackendNode{}).
|
||
Where("id = ? AND (available_vram - reserved_vram) >= ?", nodeID, bytes).
|
||
UpdateColumn(ColReservedVRAM, gorm.Expr("reserved_vram + ?", bytes))
|
||
if res.Error != nil {
|
||
return fmt.Errorf("reserving %d bytes on node %s: %w", bytes, nodeID, res.Error)
|
||
}
|
||
if res.RowsAffected == 0 {
|
||
return ErrInsufficientVRAM
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// ReleaseVRAM returns previously-reserved bytes to the pool. Called from the
|
||
// scheduler's deferred rollback path when LoadModel fails after a successful
|
||
// reservation, so the failed in-flight reservation doesn't linger until the
|
||
// next heartbeat.
|
||
//
|
||
// Guarded by `reserved_vram >= bytes` so a duplicate Release can't underflow
|
||
// past zero (the column is uint64 — wrap-around would be catastrophic for
|
||
// scheduler decisions).
|
||
func (r *NodeRegistry) ReleaseVRAM(ctx context.Context, nodeID string, bytes uint64) error {
|
||
if bytes == 0 {
|
||
return nil
|
||
}
|
||
return r.db.WithContext(ctx).Model(&BackendNode{}).
|
||
Where("id = ? AND reserved_vram >= ?", nodeID, bytes).
|
||
UpdateColumn(ColReservedVRAM, gorm.Expr("reserved_vram - ?", bytes)).Error
|
||
}
|
||
|
||
// Deregister removes a backend node, its model associations, and any auto-provisioned auth credentials.
|
||
func (r *NodeRegistry) Deregister(ctx context.Context, nodeID string) error {
|
||
db := r.db.WithContext(ctx)
|
||
|
||
var node BackendNode
|
||
if err := db.Where("id = ?", nodeID).First(&node).Error; err != nil {
|
||
return fmt.Errorf("node %s not found: %w", nodeID, err)
|
||
}
|
||
|
||
// Capture the distinct models removed so the prefix-cache index can be
|
||
// invalidated once the transaction commits.
|
||
var removedModels []string
|
||
if err := db.Transaction(func(tx *gorm.DB) error {
|
||
removedModels = r.nodeModelNames(ctx, tx, nodeID)
|
||
if err := tx.Where("node_id = ?", nodeID).Delete(&NodeModel{}).Error; err != nil {
|
||
return fmt.Errorf("deleting node models for %s: %w", nodeID, err)
|
||
}
|
||
if err := tx.Where("id = ?", nodeID).Delete(&BackendNode{}).Error; err != nil {
|
||
return fmt.Errorf("deleting node %s: %w", nodeID, err)
|
||
}
|
||
// Clean up auto-provisioned auth user (cascades to API keys via FK)
|
||
if node.AuthUserID != "" {
|
||
if err := tx.Exec("DELETE FROM users WHERE id = ?", node.AuthUserID).Error; err != nil {
|
||
xlog.Warn("Failed to clean up agent worker user", "node", node.Name, "userID", node.AuthUserID, "error", err)
|
||
// non-fatal: don't rollback the whole deregistration for auth cleanup
|
||
}
|
||
}
|
||
return nil
|
||
}); err != nil {
|
||
return err
|
||
}
|
||
for _, m := range removedModels {
|
||
r.fireReplicaRemoved(m, nodeID, -1)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// HeartbeatUpdate contains optional fields to update on heartbeat.
|
||
type HeartbeatUpdate struct {
|
||
AvailableVRAM *uint64 `json:"available_vram,omitempty"`
|
||
TotalVRAM *uint64 `json:"total_vram,omitempty"`
|
||
AvailableRAM *uint64 `json:"available_ram,omitempty"`
|
||
GPUVendor string `json:"gpu_vendor,omitempty"`
|
||
}
|
||
|
||
// Heartbeat updates the heartbeat timestamp and status for a node.
|
||
// Nodes in "pending" or "offline" status stay in their current status —
|
||
// they must be approved or re-register respectively.
|
||
func (r *NodeRegistry) Heartbeat(ctx context.Context, nodeID string, update *HeartbeatUpdate) error {
|
||
db := r.db.WithContext(ctx)
|
||
|
||
updates := map[string]any{
|
||
ColLastHeartbeat: time.Now(),
|
||
}
|
||
|
||
if update != nil {
|
||
if update.AvailableVRAM != nil {
|
||
updates[ColAvailableVRAM] = *update.AvailableVRAM
|
||
// The worker is the source of truth for actual free VRAM.
|
||
// Whenever it sends us a fresh reading, the in-tick soft
|
||
// reservation is no longer needed — clear it. (See ReserveVRAM.)
|
||
updates[ColReservedVRAM] = uint64(0)
|
||
}
|
||
if update.TotalVRAM != nil {
|
||
updates[ColTotalVRAM] = *update.TotalVRAM
|
||
}
|
||
if update.AvailableRAM != nil {
|
||
updates[ColAvailableRAM] = *update.AvailableRAM
|
||
}
|
||
if update.GPUVendor != "" {
|
||
updates[ColGPUVendor] = update.GPUVendor
|
||
}
|
||
}
|
||
|
||
// Only update all fields (including status promotion) for active nodes.
|
||
// Pending and offline nodes must go through approval or re-registration.
|
||
result := db.Model(&BackendNode{}).
|
||
Where("id = ? AND status NOT IN ?", nodeID, []string{StatusPending, StatusOffline}).
|
||
Updates(updates)
|
||
if result.Error != nil {
|
||
return fmt.Errorf("heartbeat for %s: %w", nodeID, result.Error)
|
||
}
|
||
if result.RowsAffected == 0 {
|
||
// May be pending or offline — still update heartbeat timestamp
|
||
result = db.Model(&BackendNode{}).Where("id = ?", nodeID).Update(ColLastHeartbeat, time.Now())
|
||
if result.Error != nil {
|
||
return fmt.Errorf("heartbeat for %s: %w", nodeID, result.Error)
|
||
}
|
||
if result.RowsAffected == 0 {
|
||
return fmt.Errorf("node %s not found", nodeID)
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// List returns all registered nodes.
|
||
func (r *NodeRegistry) List(ctx context.Context) ([]BackendNode, error) {
|
||
var nodes []BackendNode
|
||
if err := r.db.WithContext(ctx).Order("name").Find(&nodes).Error; err != nil {
|
||
return nil, fmt.Errorf("listing nodes: %w", err)
|
||
}
|
||
return nodes, nil
|
||
}
|
||
|
||
// Get returns a single node by ID.
|
||
func (r *NodeRegistry) Get(ctx context.Context, nodeID string) (*BackendNode, error) {
|
||
var node BackendNode
|
||
if err := r.db.WithContext(ctx).First(&node, "id = ?", nodeID).Error; err != nil {
|
||
return nil, fmt.Errorf("getting node %s: %w", nodeID, err)
|
||
}
|
||
return &node, nil
|
||
}
|
||
|
||
// GetByName returns a single node by name.
|
||
func (r *NodeRegistry) GetByName(ctx context.Context, name string) (*BackendNode, error) {
|
||
var node BackendNode
|
||
if err := r.db.WithContext(ctx).First(&node, "name = ?", name).Error; err != nil {
|
||
return nil, fmt.Errorf("getting node by name %s: %w", name, err)
|
||
}
|
||
return &node, nil
|
||
}
|
||
|
||
// MarkUnhealthy sets a node status to unhealthy. Deliberately status-only:
|
||
// callers fire this on transient triggers (a single nats.ErrNoResponders from
|
||
// managers_distributed / reconciler) where the next heartbeat is expected to
|
||
// flip the node back to healthy, and cascade-deleting node_models here would
|
||
// force a full model reload on every brief NATS hiccup. Stale rows are reaped
|
||
// by the per-model health probe (on by default; see HealthMonitor) and by
|
||
// MarkOffline when the heartbeat really has gone away.
|
||
func (r *NodeRegistry) MarkUnhealthy(ctx context.Context, nodeID string) error {
|
||
return r.setStatus(ctx, nodeID, StatusUnhealthy)
|
||
}
|
||
|
||
// MarkHealthy sets a node status to healthy.
|
||
func (r *NodeRegistry) MarkHealthy(ctx context.Context, nodeID string) error {
|
||
return r.setStatus(ctx, nodeID, StatusHealthy)
|
||
}
|
||
|
||
// MarkDraining sets a node status to draining (no new requests) and clears its
|
||
// model records. Routing already filters out non-healthy nodes, so removing
|
||
// the rows on drain doesn't change new-request behavior — but it does stop the
|
||
// Models UI from showing the node's models as "running" while the box has been
|
||
// taken out of rotation, and it prevents stale rows from being selected if
|
||
// (re)scheduling logic gets relaxed elsewhere. In-flight requests already hold
|
||
// their gRPC client through Route() and will finish normally; the only
|
||
// observable effect is that the per-call IncrementInFlight bookkeeping logs a
|
||
// non-fatal warning, which is acceptable for a drain.
|
||
func (r *NodeRegistry) MarkDraining(ctx context.Context, nodeID string) error {
|
||
if err := r.setStatus(ctx, nodeID, StatusDraining); err != nil {
|
||
return err
|
||
}
|
||
// Capture the distinct models and run the bulk delete inside a single
|
||
// transaction so the set of fired hooks equals exactly the set of rows
|
||
// deleted: a SetNodeModel landing between the capture and the delete can no
|
||
// longer be deleted without its hook firing (no interleaving gap). The
|
||
// status flip above is a separate, pre-existing operation and stays outside
|
||
// this transaction. Fire hooks only after commit so a rollback does not
|
||
// invalidate the index for a removal that did not persist.
|
||
var removedModels []string
|
||
if err := r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
|
||
removedModels = r.nodeModelNames(ctx, tx, nodeID)
|
||
return tx.Where("node_id = ?", nodeID).Delete(&NodeModel{}).Error
|
||
}); err != nil {
|
||
xlog.Warn("Failed to clear model records on draining", "node", nodeID, "error", err)
|
||
} else {
|
||
for _, m := range removedModels {
|
||
r.fireReplicaRemoved(m, nodeID, -1)
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// FindStaleNodes returns nodes that haven't sent a heartbeat within the given threshold.
|
||
// Excludes unhealthy, offline, and pending nodes since they're not actively participating.
|
||
func (r *NodeRegistry) FindStaleNodes(ctx context.Context, threshold time.Duration) ([]BackendNode, error) {
|
||
var nodes []BackendNode
|
||
cutoff := time.Now().Add(-threshold)
|
||
if err := r.db.WithContext(ctx).Where("last_heartbeat < ? AND status NOT IN ?", cutoff,
|
||
[]string{StatusUnhealthy, StatusOffline, StatusPending}).
|
||
Find(&nodes).Error; err != nil {
|
||
return nil, fmt.Errorf("finding stale nodes: %w", err)
|
||
}
|
||
return nodes, nil
|
||
}
|
||
|
||
// --- NodeModel operations ---
|
||
|
||
// SetNodeModel records that a replica of a model is loaded on a node.
|
||
// replicaIndex identifies which slot on the node this replica occupies
|
||
// (0..MaxReplicasPerModel-1). Pass 0 for single-replica scheduling.
|
||
func (r *NodeRegistry) SetNodeModel(ctx context.Context, nodeID, modelName string, replicaIndex int, state, address string, initialInFlight int) error {
|
||
now := time.Now()
|
||
// Use Attrs for creation-only fields (ID) and Assign for update-only fields.
|
||
// Attrs is applied only when creating a new record. Assign is applied on
|
||
// both create and update. This prevents overwriting the primary key on
|
||
// subsequent calls for the same (node, model, replica_index).
|
||
var nm NodeModel
|
||
result := r.db.WithContext(ctx).Where("node_id = ? AND model_name = ? AND replica_index = ?", nodeID, modelName, replicaIndex).
|
||
Attrs(NodeModel{ID: uuid.New().String(), NodeID: nodeID, ModelName: modelName, ReplicaIndex: replicaIndex}).
|
||
Assign(map[string]any{"address": address, "state": state, "last_used": now, "in_flight": initialInFlight}).
|
||
FirstOrCreate(&nm)
|
||
return result.Error
|
||
}
|
||
|
||
// SetNodeModelLoadInfo stores the backend type and serialized model options on
|
||
// an existing NodeModel record. This metadata is used by the reconciler to
|
||
// replicate model loads during scale-up.
|
||
func (r *NodeRegistry) SetNodeModelLoadInfo(ctx context.Context, nodeID, modelName string, replicaIndex int, backendType string, optsBlob []byte) error {
|
||
return r.db.WithContext(ctx).Model(&NodeModel{}).
|
||
Where("node_id = ? AND model_name = ? AND replica_index = ?", nodeID, modelName, replicaIndex).
|
||
Updates(map[string]any{"backend_type": backendType, "model_opts_blob": optsBlob}).Error
|
||
}
|
||
|
||
// UpsertModelLoadInfo records or replaces the per-model load info in the
|
||
// dedicated ModelLoadInfo table. Unlike SetNodeModelLoadInfo (which writes the
|
||
// blob onto a specific replica row and dies with it), this survives every
|
||
// NodeModel row being removed and so lets the reconciler recover replicas
|
||
// after worker death + frontend restart (Bug-1).
|
||
//
|
||
// ON CONFLICT updates backend_type, model_opts_blob, and updated_at. Two
|
||
// frontends dispatching the same model concurrently with slightly different
|
||
// opts converge on whichever transaction committed last; that matches the
|
||
// existing per-replica blob semantics today.
|
||
func (r *NodeRegistry) UpsertModelLoadInfo(ctx context.Context, modelName, backendType string, optsBlob []byte) error {
|
||
if modelName == "" {
|
||
return fmt.Errorf("model name is required")
|
||
}
|
||
now := time.Now()
|
||
rec := ModelLoadInfo{
|
||
ModelName: modelName,
|
||
BackendType: backendType,
|
||
ModelOptsBlob: optsBlob,
|
||
CreatedAt: now,
|
||
UpdatedAt: now,
|
||
}
|
||
return r.db.WithContext(ctx).Clauses(clause.OnConflict{
|
||
Columns: []clause.Column{{Name: "model_name"}},
|
||
DoUpdates: clause.Assignments(map[string]any{
|
||
"backend_type": backendType,
|
||
"model_opts_blob": optsBlob,
|
||
"updated_at": now,
|
||
}),
|
||
}).Create(&rec).Error
|
||
}
|
||
|
||
// GetModelLoadInfo retrieves the stored backend type and serialized model
|
||
// options. Reads from the dedicated ModelLoadInfo table first (survives every
|
||
// NodeModel row being deleted); falls back to scanning loaded NodeModel rows
|
||
// for the load info stamped before any frontend in this cluster ran an
|
||
// UpsertModelLoadInfo (rolling-upgrade transition). Returns
|
||
// gorm.ErrRecordNotFound when neither source has an entry.
|
||
func (r *NodeRegistry) GetModelLoadInfo(ctx context.Context, modelName string) (backendType string, optsBlob []byte, err error) {
|
||
var info ModelLoadInfo
|
||
err = r.db.WithContext(ctx).Where("model_name = ?", modelName).First(&info).Error
|
||
if err == nil {
|
||
return info.BackendType, info.ModelOptsBlob, nil
|
||
}
|
||
if !errors.Is(err, gorm.ErrRecordNotFound) {
|
||
return "", nil, err
|
||
}
|
||
|
||
var nm NodeModel
|
||
err = r.db.WithContext(ctx).
|
||
Where("model_name = ? AND state = ? AND model_opts_blob IS NOT NULL", modelName, "loaded").
|
||
First(&nm).Error
|
||
if err != nil {
|
||
return "", nil, err
|
||
}
|
||
return nm.BackendType, nm.ModelOptsBlob, nil
|
||
}
|
||
|
||
// RemoveNodeModel removes a single replica of a model from a node.
|
||
// replicaIndex must match the row to delete; passing 0 for single-replica
|
||
// scheduling preserves historical behavior. Removing siblings requires
|
||
// separate calls per index — there is no "remove all replicas" shortcut here
|
||
// to keep the contract explicit (probeLoadedModels and scaleDownIdle iterate
|
||
// per-row and must not orphan healthy siblings).
|
||
func (r *NodeRegistry) RemoveNodeModel(ctx context.Context, nodeID, modelName string, replicaIndex int) error {
|
||
if err := r.db.WithContext(ctx).Where("node_id = ? AND model_name = ? AND replica_index = ?", nodeID, modelName, replicaIndex).
|
||
Delete(&NodeModel{}).Error; err != nil {
|
||
return err
|
||
}
|
||
r.fireReplicaRemoved(modelName, nodeID, replicaIndex)
|
||
return nil
|
||
}
|
||
|
||
// RemoveAllNodeModelReplicas removes every replica of modelName on nodeID.
|
||
// Used by callers (e.g. node deregistration, full backend stop) that genuinely
|
||
// want to clear all replicas, not just one.
|
||
func (r *NodeRegistry) RemoveAllNodeModelReplicas(ctx context.Context, nodeID, modelName string) error {
|
||
if err := r.db.WithContext(ctx).Where("node_id = ? AND model_name = ?", nodeID, modelName).
|
||
Delete(&NodeModel{}).Error; err != nil {
|
||
return err
|
||
}
|
||
// Negative index signals "all replicas of (modelName, nodeID)".
|
||
r.fireReplicaRemoved(modelName, nodeID, -1)
|
||
return nil
|
||
}
|
||
|
||
// FindNodesWithModel returns nodes that have the given model loaded.
|
||
func (r *NodeRegistry) FindNodesWithModel(ctx context.Context, modelName string) ([]BackendNode, error) {
|
||
var nodes []BackendNode
|
||
if err := r.db.WithContext(ctx).Joins("JOIN node_models ON node_models.node_id = backend_nodes.id").
|
||
Where("node_models.model_name = ? AND node_models.state = ? AND backend_nodes.status = ?",
|
||
modelName, "loaded", StatusHealthy).
|
||
Order("node_models.in_flight ASC").
|
||
Find(&nodes).Error; err != nil {
|
||
return nil, fmt.Errorf("finding nodes with model %s: %w", modelName, err)
|
||
}
|
||
return nodes, nil
|
||
}
|
||
|
||
// RoutePreference biases FindAndLockNodeWithModel. PreferredNodeID +
|
||
// PreferredReplica, when set and the exact (node, replica) row is still
|
||
// loaded/healthy, is locked instead of the default ORDER BY pick. The caller
|
||
// (the prefix-cache router) has already applied the load guard, so the lock
|
||
// targets the EXACT replica it chose, not the least-loaded replica on the node.
|
||
// Nil preference => unchanged behavior.
|
||
type RoutePreference struct {
|
||
PreferredNodeID string
|
||
PreferredReplica int
|
||
}
|
||
|
||
// FindAndLockNodeWithModel atomically finds the best loaded replica of the
|
||
// given model and increments its in-flight counter within a single
|
||
// transaction. The SELECT FOR UPDATE row lock prevents concurrent eviction
|
||
// from removing the NodeModel row between the find and increment operations,
|
||
// and serializes contending routers so concurrent picks distribute across
|
||
// replicas instead of all landing on the same row.
|
||
//
|
||
// **Policy:** the SQL ORDER BY below MUST mirror PickBestReplica
|
||
// (replicapicker.go). PickBestReplica is the canonical Go implementation of
|
||
// the same rule — the per-frontend rotating-replica cache (TODO, see
|
||
// pkg/model/loader.go) will eventually use it against in-memory snapshots so
|
||
// hot inference requests don't pay this DB round-trip. If you change the
|
||
// ordering here, change both sides; the TestFindAndLockNodeWithModelMirror
|
||
// spec ("agrees with PickBestReplica on a seeded dataset") fails fast if they
|
||
// drift.
|
||
//
|
||
// When candidateNodeIDs is non-empty, only nodes in that set are considered.
|
||
// Pass nil (or empty) to consider any node. This lets callers pre-filter by
|
||
// NodeSelector so a cached replica on a now-excluded node isn't picked over a
|
||
// matching replica elsewhere — the selector-mismatch fall-through path used to
|
||
// trigger an eviction-busy loop when both sides had the model loaded.
|
||
func (r *NodeRegistry) FindAndLockNodeWithModel(ctx context.Context, modelName string, candidateNodeIDs []string, pref *RoutePreference) (*BackendNode, *NodeModel, error) {
|
||
var nm NodeModel
|
||
var node BackendNode
|
||
|
||
err := r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
|
||
// Mirror of PickBestReplica's policy (see replicapicker.go):
|
||
// 1. in_flight ASC — least busy replica.
|
||
// 2. last_used ASC — round-robin between equally-loaded replicas.
|
||
// Every successful pick refreshes last_used below, so the
|
||
// "oldest" tier naturally rotates through the candidate set.
|
||
// Without this tier, in_flight ties collapsed to "fattest GPU
|
||
// wins every time" and one node took nearly all the load.
|
||
// 3. available_vram DESC — final tiebreaker for cold starts where
|
||
// last_used is identical across replicas.
|
||
//
|
||
// Filter on backend_nodes.status = healthy in the inner JOIN itself,
|
||
// not only in the later node-fetch step. The previous version picked
|
||
// a (node_id, replica) pair purely on node_models state, then bailed
|
||
// out when the second query couldn't find a healthy node row — but
|
||
// any concurrent reader of node_models could still pick the same
|
||
// stale row in the same window, and other helpers that mirror this
|
||
// JOIN need the same invariant. Belt-and-braces: status filter here
|
||
// AND the status-checked node fetch below.
|
||
base := tx.Clauses(clause.Locking{Strength: "UPDATE"}).
|
||
Joins("JOIN backend_nodes ON backend_nodes.id = node_models.node_id").
|
||
Where("node_models.model_name = ? AND node_models.state = ? AND backend_nodes.status = ?",
|
||
modelName, "loaded", StatusHealthy)
|
||
if len(candidateNodeIDs) > 0 {
|
||
base = base.Where("node_models.node_id IN ?", candidateNodeIDs)
|
||
}
|
||
|
||
picked := false
|
||
if pref != nil && pref.PreferredNodeID != "" {
|
||
// Lock the EXACT (node_id, replica_index) row the caller chose. The
|
||
// caller (prefix-cache router) has already applied the load guard
|
||
// per replica, so here we only require that exact replica still be
|
||
// loaded+healthy. Fall through to the default ORDER BY when that
|
||
// specific replica is not found/loaded.
|
||
q := base.Session(&gorm.Session{}).
|
||
Where("node_models.node_id = ? AND node_models.replica_index = ?", pref.PreferredNodeID, pref.PreferredReplica)
|
||
if err := q.First(&nm).Error; err == nil {
|
||
picked = true
|
||
}
|
||
}
|
||
if !picked {
|
||
if err := base.
|
||
Order("node_models.in_flight ASC, node_models.last_used ASC, backend_nodes.available_vram DESC").
|
||
First(&nm).Error; err != nil {
|
||
return err
|
||
}
|
||
}
|
||
|
||
if err := tx.Model(&nm).Updates(map[string]any{
|
||
"in_flight": gorm.Expr("in_flight + 1"),
|
||
"last_used": time.Now(),
|
||
}).Error; err != nil {
|
||
return err
|
||
}
|
||
|
||
if err := tx.Where("id = ? AND status = ?", nm.NodeID, StatusHealthy).
|
||
First(&node).Error; err != nil {
|
||
return err
|
||
}
|
||
|
||
return nil
|
||
})
|
||
|
||
if err != nil {
|
||
return nil, nil, err
|
||
}
|
||
return &node, &nm, nil
|
||
}
|
||
|
||
// LoadedReplicaStats returns one ReplicaCandidate per loaded+healthy replica of
|
||
// modelName, carrying its current in-flight count. It is a read used by the
|
||
// prefix-cache router to apply the load guard when choosing a preferred node.
|
||
// When candidateNodeIDs is non-empty, only replicas on those nodes are
|
||
// returned; pass nil to consider any healthy node. The result is never nil;
|
||
// an empty slice means no loaded replica exists.
|
||
func (r *NodeRegistry) LoadedReplicaStats(ctx context.Context, modelName string, candidateNodeIDs []string) ([]ReplicaCandidate, error) {
|
||
type row struct {
|
||
NodeID string
|
||
Address string
|
||
ReplicaIndex int
|
||
InFlight int
|
||
LastUsed time.Time
|
||
AvailableVRAM uint64
|
||
}
|
||
q := r.db.WithContext(ctx).Model(&NodeModel{}).
|
||
Joins("JOIN backend_nodes ON backend_nodes.id = node_models.node_id").
|
||
Where("node_models.model_name = ? AND node_models.state = ? AND backend_nodes.status = ?",
|
||
modelName, "loaded", StatusHealthy)
|
||
if len(candidateNodeIDs) > 0 {
|
||
q = q.Where("node_models.node_id IN ?", candidateNodeIDs)
|
||
}
|
||
|
||
// Narrow to only the columns the sole consumer (router buildPreference)
|
||
// reads: NodeID and InFlight. The other ReplicaCandidate fields stay at
|
||
// their zero value, which the consumer does not read. This avoids the
|
||
// JOIN-side available_vram fetch and the extra column transfer.
|
||
var rows []row
|
||
err := q.Select("node_models.node_id AS node_id, node_models.in_flight AS in_flight").
|
||
Scan(&rows).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("loading replica stats for %s: %w", modelName, err)
|
||
}
|
||
|
||
out := make([]ReplicaCandidate, 0, len(rows))
|
||
for _, rw := range rows {
|
||
out = append(out, ReplicaCandidate(rw))
|
||
}
|
||
return out, nil
|
||
}
|
||
|
||
// TouchNodeModel updates the last_used timestamp for LRU tracking on a single
|
||
// replica row.
|
||
func (r *NodeRegistry) TouchNodeModel(ctx context.Context, nodeID, modelName string, replicaIndex int) {
|
||
r.db.WithContext(ctx).Model(&NodeModel{}).
|
||
Where("node_id = ? AND model_name = ? AND replica_index = ?", nodeID, modelName, replicaIndex).
|
||
Update("last_used", time.Now())
|
||
}
|
||
|
||
// GetNodeModel returns the NodeModel record for a specific (node, model, replica_index) combination.
|
||
func (r *NodeRegistry) GetNodeModel(ctx context.Context, nodeID, modelName string, replicaIndex int) (*NodeModel, error) {
|
||
var nm NodeModel
|
||
err := r.db.WithContext(ctx).
|
||
Where("node_id = ? AND model_name = ? AND replica_index = ?", nodeID, modelName, replicaIndex).
|
||
First(&nm).Error
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return &nm, nil
|
||
}
|
||
|
||
// CountReplicasOnNode returns how many replicas of modelName are currently
|
||
// recorded for nodeID (across all states). Used by NextFreeReplicaIndex and
|
||
// by capacity checks.
|
||
func (r *NodeRegistry) CountReplicasOnNode(ctx context.Context, nodeID, modelName string) (int, error) {
|
||
var count int64
|
||
if err := r.db.WithContext(ctx).Model(&NodeModel{}).
|
||
Where("node_id = ? AND model_name = ?", nodeID, modelName).
|
||
Count(&count).Error; err != nil {
|
||
return 0, err
|
||
}
|
||
return int(count), nil
|
||
}
|
||
|
||
// ErrNoFreeSlot is returned by NextFreeReplicaIndex when the node already has
|
||
// MaxReplicasPerModel replicas of this model and cannot host another.
|
||
var ErrNoFreeSlot = errors.New("no free replica slot on node")
|
||
|
||
// NextFreeReplicaIndex returns the lowest replica_index in [0, maxSlots) that
|
||
// is not currently occupied by a row for (nodeID, modelName). Returns
|
||
// ErrNoFreeSlot if every index is taken.
|
||
//
|
||
// Allocating the lowest free index (rather than always appending) keeps slot
|
||
// numbers compact across scale-down/scale-up cycles, which matches the worker
|
||
// supervisor's port-recycling behavior in core/cli/worker.go (freePorts).
|
||
func (r *NodeRegistry) NextFreeReplicaIndex(ctx context.Context, nodeID, modelName string, maxSlots int) (int, error) {
|
||
if maxSlots <= 0 {
|
||
return 0, ErrNoFreeSlot
|
||
}
|
||
var taken []int
|
||
if err := r.db.WithContext(ctx).Model(&NodeModel{}).
|
||
Where("node_id = ? AND model_name = ?", nodeID, modelName).
|
||
Pluck("replica_index", &taken).Error; err != nil {
|
||
return 0, err
|
||
}
|
||
occupied := make(map[int]struct{}, len(taken))
|
||
for _, idx := range taken {
|
||
occupied[idx] = struct{}{}
|
||
}
|
||
for idx := 0; idx < maxSlots; idx++ {
|
||
if _, ok := occupied[idx]; !ok {
|
||
return idx, nil
|
||
}
|
||
}
|
||
return 0, ErrNoFreeSlot
|
||
}
|
||
|
||
// FindLeastLoadedNode returns the healthy node with the fewest in-flight requests.
|
||
func (r *NodeRegistry) FindLeastLoadedNode(ctx context.Context) (*BackendNode, error) {
|
||
db := r.db.WithContext(ctx)
|
||
|
||
var node BackendNode
|
||
query := db.Where("status = ? AND node_type = ?", StatusHealthy, NodeTypeBackend)
|
||
// Order by total in-flight across all models on the node
|
||
subquery := db.Model(&NodeModel{}).
|
||
Select("node_id, COALESCE(SUM(in_flight), 0) as total_inflight").
|
||
Group("node_id")
|
||
|
||
err := query.Joins("LEFT JOIN (?) AS load ON load.node_id = backend_nodes.id", subquery).
|
||
Order("COALESCE(load.total_inflight, 0) ASC, backend_nodes.available_vram DESC").
|
||
First(&node).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("finding least loaded node: %w", err)
|
||
}
|
||
return &node, nil
|
||
}
|
||
|
||
// FindIdleNode returns a healthy node with zero in-flight requests and zero loaded models.
|
||
// Used by the scheduler to prefer truly idle nodes for new backend assignments.
|
||
func (r *NodeRegistry) FindIdleNode(ctx context.Context) (*BackendNode, error) {
|
||
db := r.db.WithContext(ctx)
|
||
|
||
var node BackendNode
|
||
loadedModels := db.Model(&NodeModel{}).
|
||
Select("node_id").
|
||
Where("state = ?", "loaded").
|
||
Group("node_id")
|
||
err := db.Where("status = ? AND node_type = ? AND id NOT IN (?)", StatusHealthy, NodeTypeBackend, loadedModels).
|
||
Order("available_vram DESC").
|
||
First(&node).Error
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return &node, nil
|
||
}
|
||
|
||
// IncrementInFlight atomically increments the in-flight counter on a single replica row.
|
||
func (r *NodeRegistry) IncrementInFlight(ctx context.Context, nodeID, modelName string, replicaIndex int) error {
|
||
result := r.db.WithContext(ctx).Model(&NodeModel{}).
|
||
Where("node_id = ? AND model_name = ? AND replica_index = ?", nodeID, modelName, replicaIndex).
|
||
Updates(map[string]any{
|
||
"in_flight": gorm.Expr("in_flight + 1"),
|
||
"last_used": time.Now(),
|
||
})
|
||
if result.Error != nil {
|
||
return result.Error
|
||
}
|
||
if result.RowsAffected == 0 {
|
||
return fmt.Errorf("node model %s/%s replica %d not found", nodeID, modelName, replicaIndex)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// DecrementInFlight atomically decrements the in-flight counter on a single replica row.
|
||
// Guarded by `in_flight > 0` so that double-decrements don't go negative.
|
||
func (r *NodeRegistry) DecrementInFlight(ctx context.Context, nodeID, modelName string, replicaIndex int) error {
|
||
result := r.db.WithContext(ctx).Model(&NodeModel{}).
|
||
Where("node_id = ? AND model_name = ? AND replica_index = ? AND in_flight > 0", nodeID, modelName, replicaIndex).
|
||
UpdateColumn("in_flight", gorm.Expr("in_flight - 1"))
|
||
if result.Error != nil {
|
||
return result.Error
|
||
}
|
||
if result.RowsAffected == 0 {
|
||
xlog.Warn("DecrementInFlight: no matching row or already zero", "node", nodeID, "model", modelName, "replica", replicaIndex)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// GetNodeModels returns all models loaded on a given node.
|
||
func (r *NodeRegistry) GetNodeModels(ctx context.Context, nodeID string) ([]NodeModel, error) {
|
||
var models []NodeModel
|
||
if err := r.db.WithContext(ctx).Where("node_id = ?", nodeID).Find(&models).Error; err != nil {
|
||
return nil, fmt.Errorf("getting models for node %s: %w", nodeID, err)
|
||
}
|
||
return models, nil
|
||
}
|
||
|
||
// ListAllLoadedModels returns all models that are loaded on healthy nodes.
|
||
// Used by DistributedModelStore.Range() to discover models not in local cache.
|
||
func (r *NodeRegistry) ListAllLoadedModels(ctx context.Context) ([]NodeModel, error) {
|
||
var models []NodeModel
|
||
err := r.db.WithContext(ctx).Joins("JOIN backend_nodes ON backend_nodes.id = node_models.node_id").
|
||
Where("node_models.state = ? AND backend_nodes.status = ?", "loaded", StatusHealthy).
|
||
Find(&models).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("listing all loaded models: %w", err)
|
||
}
|
||
return models, nil
|
||
}
|
||
|
||
// FindNodeForModel returns the first healthy node that has the given model loaded.
|
||
// Returns the node and true if found, nil and false otherwise.
|
||
func (r *NodeRegistry) FindNodeForModel(ctx context.Context, modelName string) (*BackendNode, bool) {
|
||
nodes, err := r.FindNodesWithModel(ctx, modelName)
|
||
if err != nil || len(nodes) == 0 {
|
||
return nil, false
|
||
}
|
||
return &nodes[0], true
|
||
}
|
||
|
||
// FindLRUModel returns the least-recently-used model on a node.
|
||
func (r *NodeRegistry) FindLRUModel(ctx context.Context, nodeID string) (*NodeModel, error) {
|
||
var nm NodeModel
|
||
err := r.db.WithContext(ctx).Where("node_id = ? AND state = ? AND in_flight = 0", nodeID, "loaded").
|
||
Order("last_used ASC").First(&nm).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("finding LRU model on node %s: %w", nodeID, err)
|
||
}
|
||
return &nm, nil
|
||
}
|
||
|
||
// FindGlobalLRUModelWithZeroInFlight returns the least-recently-used model
|
||
// across all healthy backend nodes that has zero in-flight requests.
|
||
// Used by the router for preemptive eviction when no node has free VRAM.
|
||
func (r *NodeRegistry) FindGlobalLRUModelWithZeroInFlight(ctx context.Context) (*NodeModel, error) {
|
||
var nm NodeModel
|
||
err := r.db.WithContext(ctx).Joins("JOIN backend_nodes ON backend_nodes.id = node_models.node_id").
|
||
Where("node_models.state = ? AND node_models.in_flight = 0 AND backend_nodes.status = ? AND backend_nodes.node_type = ?",
|
||
"loaded", StatusHealthy, NodeTypeBackend).
|
||
Order("node_models.last_used ASC").
|
||
First(&nm).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("no evictable model found: %w", err)
|
||
}
|
||
return &nm, nil
|
||
}
|
||
|
||
// --- NodeLabel operations ---
|
||
|
||
// SetNodeLabel upserts a single label on a node.
|
||
//
|
||
// A label change can change which models match a NodeSelector, so any
|
||
// scheduling cooldown flag is cleared as a side effect — the next reconciler
|
||
// tick will re-flag if the new label set still doesn't satisfy capacity.
|
||
func (r *NodeRegistry) SetNodeLabel(ctx context.Context, nodeID, key, value string) error {
|
||
label := NodeLabel{
|
||
ID: uuid.New().String(),
|
||
NodeID: nodeID,
|
||
Key: key,
|
||
Value: value,
|
||
}
|
||
if err := r.db.WithContext(ctx).
|
||
Clauses(clause.OnConflict{
|
||
Columns: []clause.Column{{Name: "node_id"}, {Name: "key"}},
|
||
DoUpdates: clause.AssignmentColumns([]string{"value"}),
|
||
}).
|
||
Create(&label).Error; err != nil {
|
||
return err
|
||
}
|
||
if err := r.ClearAllUnsatisfiable(ctx); err != nil {
|
||
xlog.Warn("Failed to clear unsatisfiable scheduling flags on SetNodeLabel", "error", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// SetNodeLabels replaces all labels for a node with the given map.
|
||
func (r *NodeRegistry) SetNodeLabels(ctx context.Context, nodeID string, labels map[string]string) error {
|
||
if err := r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
|
||
if err := tx.Where("node_id = ?", nodeID).Delete(&NodeLabel{}).Error; err != nil {
|
||
return err
|
||
}
|
||
for k, v := range labels {
|
||
label := NodeLabel{ID: uuid.New().String(), NodeID: nodeID, Key: k, Value: v}
|
||
if err := tx.Create(&label).Error; err != nil {
|
||
return err
|
||
}
|
||
}
|
||
return nil
|
||
}); err != nil {
|
||
return err
|
||
}
|
||
if err := r.ClearAllUnsatisfiable(ctx); err != nil {
|
||
xlog.Warn("Failed to clear unsatisfiable scheduling flags on SetNodeLabels", "error", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// RemoveNodeLabel removes a single label from a node.
|
||
func (r *NodeRegistry) RemoveNodeLabel(ctx context.Context, nodeID, key string) error {
|
||
if err := r.db.WithContext(ctx).Where("node_id = ? AND key = ?", nodeID, key).Delete(&NodeLabel{}).Error; err != nil {
|
||
return err
|
||
}
|
||
if err := r.ClearAllUnsatisfiable(ctx); err != nil {
|
||
xlog.Warn("Failed to clear unsatisfiable scheduling flags on RemoveNodeLabel", "error", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// GetNodeLabels returns all labels for a node.
|
||
func (r *NodeRegistry) GetNodeLabels(ctx context.Context, nodeID string) ([]NodeLabel, error) {
|
||
var labels []NodeLabel
|
||
err := r.db.WithContext(ctx).Where("node_id = ?", nodeID).Find(&labels).Error
|
||
return labels, err
|
||
}
|
||
|
||
// GetAllNodeLabelsMap returns all labels grouped by node ID.
|
||
func (r *NodeRegistry) GetAllNodeLabelsMap(ctx context.Context) (map[string]map[string]string, error) {
|
||
var labels []NodeLabel
|
||
if err := r.db.WithContext(ctx).Find(&labels).Error; err != nil {
|
||
return nil, err
|
||
}
|
||
result := make(map[string]map[string]string)
|
||
for _, l := range labels {
|
||
if result[l.NodeID] == nil {
|
||
result[l.NodeID] = make(map[string]string)
|
||
}
|
||
result[l.NodeID][l.Key] = l.Value
|
||
}
|
||
return result, nil
|
||
}
|
||
|
||
// --- Selector-based queries ---
|
||
|
||
// FindNodesBySelector returns healthy backend nodes matching ALL key-value pairs in the selector.
|
||
func (r *NodeRegistry) FindNodesBySelector(ctx context.Context, selector map[string]string) ([]BackendNode, error) {
|
||
if len(selector) == 0 {
|
||
// Empty selector matches all healthy backend nodes
|
||
var nodes []BackendNode
|
||
err := r.db.WithContext(ctx).Where("status = ? AND node_type = ?", StatusHealthy, NodeTypeBackend).Find(&nodes).Error
|
||
return nodes, err
|
||
}
|
||
|
||
db := r.db.WithContext(ctx).Where("status = ? AND node_type = ?", StatusHealthy, NodeTypeBackend)
|
||
for k, v := range selector {
|
||
db = db.Where("EXISTS (SELECT 1 FROM node_labels WHERE node_labels.node_id = backend_nodes.id AND node_labels.key = ? AND node_labels.value = ?)", k, v)
|
||
}
|
||
|
||
var nodes []BackendNode
|
||
err := db.Find(&nodes).Error
|
||
return nodes, err
|
||
}
|
||
|
||
// FindNodeWithVRAMFromSet is like FindNodeWithVRAM but restricted to the given node IDs.
|
||
func (r *NodeRegistry) FindNodeWithVRAMFromSet(ctx context.Context, minBytes uint64, nodeIDs []string) (*BackendNode, error) {
|
||
db := r.db.WithContext(ctx)
|
||
|
||
loadedModels := db.Model(&NodeModel{}).
|
||
Select("node_id").
|
||
Where("state = ?", "loaded").
|
||
Group("node_id")
|
||
|
||
subquery := db.Model(&NodeModel{}).
|
||
Select("node_id, COALESCE(SUM(in_flight), 0) as total_inflight").
|
||
Group("node_id")
|
||
|
||
// Try idle nodes with enough effectively-free VRAM first.
|
||
var node BackendNode
|
||
err := db.Where("status = ? AND node_type = ? AND (available_vram - reserved_vram) >= ? AND id NOT IN (?) AND id IN ?",
|
||
StatusHealthy, NodeTypeBackend, minBytes, loadedModels, nodeIDs).
|
||
Order("(available_vram - reserved_vram) DESC").
|
||
First(&node).Error
|
||
if err == nil {
|
||
return &node, nil
|
||
}
|
||
|
||
// Fall back to least-loaded nodes with enough effectively-free VRAM
|
||
err = db.Where("status = ? AND node_type = ? AND (available_vram - reserved_vram) >= ? AND backend_nodes.id IN ?",
|
||
StatusHealthy, NodeTypeBackend, minBytes, nodeIDs).
|
||
Joins("LEFT JOIN (?) AS load ON load.node_id = backend_nodes.id", subquery).
|
||
Order("COALESCE(load.total_inflight, 0) ASC, (backend_nodes.available_vram - backend_nodes.reserved_vram) DESC").
|
||
First(&node).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("no healthy nodes in set with %d bytes available VRAM: %w", minBytes, err)
|
||
}
|
||
return &node, nil
|
||
}
|
||
|
||
// FindIdleNodeFromSet is like FindIdleNode but restricted to the given node IDs.
|
||
func (r *NodeRegistry) FindIdleNodeFromSet(ctx context.Context, nodeIDs []string) (*BackendNode, error) {
|
||
db := r.db.WithContext(ctx)
|
||
|
||
var node BackendNode
|
||
loadedModels := db.Model(&NodeModel{}).
|
||
Select("node_id").
|
||
Where("state = ?", "loaded").
|
||
Group("node_id")
|
||
err := db.Where("status = ? AND node_type = ? AND id NOT IN (?) AND id IN ?", StatusHealthy, NodeTypeBackend, loadedModels, nodeIDs).
|
||
Order("available_vram DESC").
|
||
First(&node).Error
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return &node, nil
|
||
}
|
||
|
||
// FindLeastLoadedNodeFromSet is like FindLeastLoadedNode but restricted to the given node IDs.
|
||
func (r *NodeRegistry) FindLeastLoadedNodeFromSet(ctx context.Context, nodeIDs []string) (*BackendNode, error) {
|
||
db := r.db.WithContext(ctx)
|
||
|
||
var node BackendNode
|
||
query := db.Where("status = ? AND node_type = ? AND backend_nodes.id IN ?", StatusHealthy, NodeTypeBackend, nodeIDs)
|
||
// Order by total in-flight across all models on the node
|
||
subquery := db.Model(&NodeModel{}).
|
||
Select("node_id, COALESCE(SUM(in_flight), 0) as total_inflight").
|
||
Group("node_id")
|
||
|
||
err := query.Joins("LEFT JOIN (?) AS load ON load.node_id = backend_nodes.id", subquery).
|
||
Order("COALESCE(load.total_inflight, 0) ASC, backend_nodes.available_vram DESC").
|
||
First(&node).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("finding least loaded node in set: %w", err)
|
||
}
|
||
return &node, nil
|
||
}
|
||
|
||
// --- ModelSchedulingConfig operations ---
|
||
|
||
// SetModelScheduling creates or updates a scheduling config for a model.
|
||
func (r *NodeRegistry) SetModelScheduling(ctx context.Context, config *ModelSchedulingConfig) error {
|
||
if config.ID == "" {
|
||
config.ID = uuid.New().String()
|
||
}
|
||
return r.db.WithContext(ctx).
|
||
Clauses(clause.OnConflict{
|
||
Columns: []clause.Column{{Name: "model_name"}},
|
||
DoUpdates: clause.AssignmentColumns([]string{
|
||
"node_selector", "min_replicas", "max_replicas", "spread_all",
|
||
"route_policy", "balance_abs_threshold", "balance_rel_threshold", "min_prefix_match",
|
||
"updated_at",
|
||
}),
|
||
}).
|
||
Create(config).Error
|
||
}
|
||
|
||
// SeedModelScheduling authoritatively applies a batch of scheduling configs at
|
||
// startup. Each config is upserted (full-replace on model_name), overwriting any
|
||
// prior row for that model. Models not present in configs are left untouched.
|
||
func (r *NodeRegistry) SeedModelScheduling(ctx context.Context, configs []ModelSchedulingConfig) error {
|
||
for i := range configs {
|
||
if err := r.SetModelScheduling(ctx, &configs[i]); err != nil {
|
||
return fmt.Errorf("seeding scheduling config for model %q: %w", configs[i].ModelName, err)
|
||
}
|
||
xlog.Info("Seeded model scheduling config", "model", configs[i].ModelName,
|
||
"spread_all", configs[i].SpreadAll, "min", configs[i].MinReplicas, "max", configs[i].MaxReplicas)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// GetModelScheduling returns the scheduling config for a model, or nil if none exists.
|
||
func (r *NodeRegistry) GetModelScheduling(ctx context.Context, modelName string) (*ModelSchedulingConfig, error) {
|
||
var config ModelSchedulingConfig
|
||
err := r.db.WithContext(ctx).Where("model_name = ?", modelName).First(&config).Error
|
||
if errors.Is(err, gorm.ErrRecordNotFound) {
|
||
return nil, nil
|
||
}
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
return &config, nil
|
||
}
|
||
|
||
// ListModelSchedulings returns all scheduling configs.
|
||
func (r *NodeRegistry) ListModelSchedulings(ctx context.Context) ([]ModelSchedulingConfig, error) {
|
||
var configs []ModelSchedulingConfig
|
||
err := r.db.WithContext(ctx).Order("model_name ASC").Find(&configs).Error
|
||
return configs, err
|
||
}
|
||
|
||
// ListAutoScalingConfigs returns scheduling configs where auto-scaling is enabled.
|
||
func (r *NodeRegistry) ListAutoScalingConfigs(ctx context.Context) ([]ModelSchedulingConfig, error) {
|
||
var configs []ModelSchedulingConfig
|
||
err := r.db.WithContext(ctx).Where("min_replicas > 0 OR max_replicas > 0 OR spread_all = ?", true).Find(&configs).Error
|
||
return configs, err
|
||
}
|
||
|
||
// DeleteModelScheduling removes a scheduling config by model name.
|
||
func (r *NodeRegistry) DeleteModelScheduling(ctx context.Context, modelName string) error {
|
||
return r.db.WithContext(ctx).Where("model_name = ?", modelName).Delete(&ModelSchedulingConfig{}).Error
|
||
}
|
||
|
||
// CountLoadedReplicas returns the number of loaded replicas for a model.
|
||
func (r *NodeRegistry) CountLoadedReplicas(ctx context.Context, modelName string) (int64, error) {
|
||
var count int64
|
||
err := r.db.WithContext(ctx).Model(&NodeModel{}).Where("model_name = ? AND state = ?", modelName, "loaded").Count(&count).Error
|
||
return count, err
|
||
}
|
||
|
||
// FindNodesWithFreeSlot returns healthy backend nodes that have at least one
|
||
// free replica slot for modelName (i.e. count(node_models.*) for this model
|
||
// is strictly less than the node's MaxReplicasPerModel cap). When
|
||
// candidateNodeIDs is non-empty, only those nodes are considered.
|
||
//
|
||
// This is the candidate-pool used by SmartRouter.scheduleNewModel — without
|
||
// it, the scheduler would happily pick the same node for replica #2 even
|
||
// when that node already hosts replica #1, re-creating the original flap.
|
||
func (r *NodeRegistry) FindNodesWithFreeSlot(ctx context.Context, modelName string, candidateNodeIDs []string) ([]BackendNode, error) {
|
||
q := r.db.WithContext(ctx).Model(&BackendNode{}).
|
||
Where("status = ? AND node_type = ?", StatusHealthy, NodeTypeBackend)
|
||
if len(candidateNodeIDs) > 0 {
|
||
q = q.Where("id IN ?", candidateNodeIDs)
|
||
}
|
||
// Subquery: per-node count of loaded+loading replicas of this model.
|
||
// We count any non-removed row (state != deleted) so a load in progress
|
||
// counts against the cap and a second concurrent scale-up can't overshoot.
|
||
subq := r.db.Model(&NodeModel{}).
|
||
Select("node_id, COUNT(*) as cnt").
|
||
Where("model_name = ?", modelName).
|
||
Group("node_id")
|
||
|
||
var out []BackendNode
|
||
err := q.Joins("LEFT JOIN (?) AS rc ON rc.node_id = backend_nodes.id", subq).
|
||
Where("COALESCE(rc.cnt, 0) < backend_nodes.max_replicas_per_model").
|
||
Find(&out).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("finding nodes with free slot for %s: %w", modelName, err)
|
||
}
|
||
return out, nil
|
||
}
|
||
|
||
// ClusterCapacityForModel returns the total free replica capacity for
|
||
// modelName across the candidate node set: Σ (max_replicas_per_model −
|
||
// current_replicas[n,m]). When candidateNodeIDs is empty all healthy backend
|
||
// nodes are considered.
|
||
//
|
||
// The reconciler uses this to bound MinReplicas at what the cluster can
|
||
// actually host, preventing the "scale-up forever" loop from #9XXX where a
|
||
// MinReplicas=2 with one worker × one slot churned the model every 30s.
|
||
func (r *NodeRegistry) ClusterCapacityForModel(ctx context.Context, modelName string, candidateNodeIDs []string) (int, error) {
|
||
q := r.db.WithContext(ctx).Model(&BackendNode{}).
|
||
Where("status = ? AND node_type = ?", StatusHealthy, NodeTypeBackend)
|
||
if len(candidateNodeIDs) > 0 {
|
||
q = q.Where("id IN ?", candidateNodeIDs)
|
||
}
|
||
subq := r.db.Model(&NodeModel{}).
|
||
Select("node_id, COUNT(*) as cnt").
|
||
Where("model_name = ?", modelName).
|
||
Group("node_id")
|
||
|
||
var nodes []struct {
|
||
MaxReplicasPerModel int
|
||
Loaded int
|
||
}
|
||
err := q.Select("backend_nodes.max_replicas_per_model AS max_replicas_per_model, COALESCE(rc.cnt, 0) AS loaded").
|
||
Joins("LEFT JOIN (?) AS rc ON rc.node_id = backend_nodes.id", subq).
|
||
Scan(&nodes).Error
|
||
if err != nil {
|
||
return 0, fmt.Errorf("computing cluster capacity for %s: %w", modelName, err)
|
||
}
|
||
total := 0
|
||
for _, n := range nodes {
|
||
free := n.MaxReplicasPerModel - n.Loaded
|
||
if free > 0 {
|
||
total += free
|
||
}
|
||
}
|
||
return total, nil
|
||
}
|
||
|
||
// BumpUnsatisfiableTicks increments the per-config hysteresis counter when
|
||
// the reconciler tries to scale up but cluster capacity is exhausted.
|
||
// Returns the new value.
|
||
func (r *NodeRegistry) BumpUnsatisfiableTicks(ctx context.Context, modelName string) (int, error) {
|
||
res := r.db.WithContext(ctx).Model(&ModelSchedulingConfig{}).
|
||
Where("model_name = ?", modelName).
|
||
UpdateColumn("unsatisfiable_ticks", gorm.Expr("unsatisfiable_ticks + 1"))
|
||
if res.Error != nil {
|
||
return 0, res.Error
|
||
}
|
||
var cfg ModelSchedulingConfig
|
||
if err := r.db.WithContext(ctx).Where("model_name = ?", modelName).First(&cfg).Error; err != nil {
|
||
return 0, err
|
||
}
|
||
return cfg.UnsatisfiableTicks, nil
|
||
}
|
||
|
||
// MarkUnsatisfiable sets UnsatisfiableUntil to a future time, so the
|
||
// reconciler skips scale-up attempts for this model until the cooldown
|
||
// expires (or a cluster event clears the flag — see ClearAllUnsatisfiable).
|
||
func (r *NodeRegistry) MarkUnsatisfiable(ctx context.Context, modelName string, until time.Time) error {
|
||
return r.db.WithContext(ctx).Model(&ModelSchedulingConfig{}).
|
||
Where("model_name = ?", modelName).
|
||
Update("unsatisfiable_until", until).Error
|
||
}
|
||
|
||
// ClearUnsatisfiable resets both the cooldown timestamp and the hysteresis
|
||
// counter for a single model. Called on a successful scale-up so the next
|
||
// transient capacity dip starts the hysteresis from zero.
|
||
func (r *NodeRegistry) ClearUnsatisfiable(ctx context.Context, modelName string) error {
|
||
return r.db.WithContext(ctx).Model(&ModelSchedulingConfig{}).
|
||
Where("model_name = ?", modelName).
|
||
Updates(map[string]any{
|
||
"unsatisfiable_until": gorm.Expr("NULL"),
|
||
"unsatisfiable_ticks": 0,
|
||
}).Error
|
||
}
|
||
|
||
// UpdateMaxReplicasPerModel sets a node's per-model replica cap as an admin
|
||
// override (sticky across worker restarts) and refreshes the mirrored
|
||
// `node.replica-slots` auto-label so selectors reflect the new value.
|
||
// Capacity may have just changed, so cooldown flags are cleared too — the
|
||
// next reconciler tick will re-flag if still unsatisfiable.
|
||
//
|
||
// The override is preserved on worker re-registration (see Register). To
|
||
// hand control back to the worker flag, call ResetMaxReplicasPerModel.
|
||
func (r *NodeRegistry) UpdateMaxReplicasPerModel(ctx context.Context, nodeID string, n int) error {
|
||
if n < 1 {
|
||
return fmt.Errorf("max_replicas_per_model must be >= 1, got %d", n)
|
||
}
|
||
res := r.db.WithContext(ctx).Model(&BackendNode{}).
|
||
Where("id = ?", nodeID).
|
||
Updates(map[string]any{
|
||
ColMaxReplicasPerModel: n,
|
||
"max_replicas_per_model_manually_set": true,
|
||
})
|
||
if res.Error != nil {
|
||
return fmt.Errorf("updating max_replicas_per_model on %s: %w", nodeID, res.Error)
|
||
}
|
||
if res.RowsAffected == 0 {
|
||
return fmt.Errorf("node %s not found", nodeID)
|
||
}
|
||
// Keep the auto-label in sync so existing AND-selectors keep matching.
|
||
if err := r.SetNodeLabel(ctx, nodeID, "node.replica-slots", fmt.Sprintf("%d", n)); err != nil {
|
||
xlog.Warn("Failed to refresh node.replica-slots label", "node", nodeID, "error", err)
|
||
}
|
||
if err := r.ClearAllUnsatisfiable(ctx); err != nil {
|
||
xlog.Warn("Failed to clear unsatisfiable scheduling flags after capacity update", "error", err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// ResetMaxReplicasPerModel clears the admin override flag so the next worker
|
||
// re-registration is allowed to update the value again. The current value is
|
||
// left in place — the worker will overwrite it on its next register call.
|
||
//
|
||
// This is the "Reset to worker default" affordance in the UI: it doesn't
|
||
// require knowing what the worker flag is set to (the worker tells us on
|
||
// re-register), it just hands ownership back.
|
||
func (r *NodeRegistry) ResetMaxReplicasPerModel(ctx context.Context, nodeID string) error {
|
||
res := r.db.WithContext(ctx).Model(&BackendNode{}).
|
||
Where("id = ?", nodeID).
|
||
Update("max_replicas_per_model_manually_set", false)
|
||
if res.Error != nil {
|
||
return fmt.Errorf("clearing max_replicas_per_model override on %s: %w", nodeID, res.Error)
|
||
}
|
||
if res.RowsAffected == 0 {
|
||
return fmt.Errorf("node %s not found", nodeID)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// ClearAllUnsatisfiable clears the cooldown flag on every scheduling config.
|
||
// Called from cluster-events that could plausibly increase capacity (new
|
||
// node registers, node approves pending→healthy, node labels change,
|
||
// MaxReplicasPerModel changes). The reconciler's own loop will re-flag any
|
||
// config whose target is still unsatisfiable, so over-clearing is cheap and
|
||
// correct.
|
||
func (r *NodeRegistry) ClearAllUnsatisfiable(ctx context.Context) error {
|
||
return r.db.WithContext(ctx).Model(&ModelSchedulingConfig{}).
|
||
Where("unsatisfiable_until IS NOT NULL OR unsatisfiable_ticks > 0").
|
||
Updates(map[string]any{
|
||
"unsatisfiable_until": gorm.Expr("NULL"),
|
||
"unsatisfiable_ticks": 0,
|
||
}).Error
|
||
}
|
||
|
||
// --- Composite queries ---
|
||
|
||
// ListWithExtras returns all nodes with model counts and labels.
|
||
func (r *NodeRegistry) ListWithExtras(ctx context.Context) ([]NodeWithExtras, error) {
|
||
// Get all nodes
|
||
var nodes []BackendNode
|
||
if err := r.db.WithContext(ctx).Order("name ASC").Find(&nodes).Error; err != nil {
|
||
return nil, err
|
||
}
|
||
|
||
// Get model counts per node
|
||
type modelCount struct {
|
||
NodeID string
|
||
Count int
|
||
}
|
||
var counts []modelCount
|
||
if err := r.db.WithContext(ctx).Model(&NodeModel{}).
|
||
Select("node_id, COUNT(*) as count").
|
||
Where("state = ?", "loaded").
|
||
Group("node_id").
|
||
Find(&counts).Error; err != nil {
|
||
xlog.Warn("ListWithExtras: failed to get model counts", "error", err)
|
||
}
|
||
|
||
countMap := make(map[string]int)
|
||
for _, c := range counts {
|
||
countMap[c.NodeID] = c.Count
|
||
}
|
||
|
||
// Get in-flight counts per node
|
||
type inFlightCount struct {
|
||
NodeID string
|
||
Total int
|
||
}
|
||
var inFlights []inFlightCount
|
||
if err := r.db.WithContext(ctx).Model(&NodeModel{}).
|
||
Select("node_id, COALESCE(SUM(in_flight), 0) as total").
|
||
Where("state IN ?", []string{"loaded", "unloading"}).
|
||
Group("node_id").
|
||
Find(&inFlights).Error; err != nil {
|
||
xlog.Warn("ListWithExtras: failed to get in-flight counts", "error", err)
|
||
}
|
||
|
||
inFlightMap := make(map[string]int)
|
||
for _, f := range inFlights {
|
||
inFlightMap[f.NodeID] = f.Total
|
||
}
|
||
|
||
// Get all labels
|
||
labelsMap, err := r.GetAllNodeLabelsMap(ctx)
|
||
if err != nil {
|
||
xlog.Warn("ListWithExtras: failed to get labels", "error", err)
|
||
}
|
||
|
||
// Build result
|
||
result := make([]NodeWithExtras, len(nodes))
|
||
for i, n := range nodes {
|
||
result[i] = NodeWithExtras{
|
||
BackendNode: n,
|
||
ModelCount: countMap[n.ID],
|
||
InFlightCount: inFlightMap[n.ID],
|
||
Labels: labelsMap[n.ID],
|
||
}
|
||
}
|
||
return result, nil
|
||
}
|
||
|
||
// ApplyAutoLabels sets automatic labels based on node hardware info.
|
||
func (r *NodeRegistry) ApplyAutoLabels(ctx context.Context, nodeID string, node *BackendNode) {
|
||
if node.GPUVendor != "" {
|
||
_ = r.SetNodeLabel(ctx, nodeID, "gpu.vendor", node.GPUVendor)
|
||
}
|
||
if node.TotalVRAM > 0 {
|
||
gb := node.TotalVRAM / (1024 * 1024 * 1024)
|
||
var bucket string
|
||
switch {
|
||
case gb >= 80:
|
||
bucket = "80GB+"
|
||
case gb >= 48:
|
||
bucket = "48GB"
|
||
case gb >= 24:
|
||
bucket = "24GB"
|
||
case gb >= 16:
|
||
bucket = "16GB"
|
||
case gb >= 8:
|
||
bucket = "8GB"
|
||
default:
|
||
bucket = fmt.Sprintf("%dGB", gb)
|
||
}
|
||
_ = r.SetNodeLabel(ctx, nodeID, "gpu.vram", bucket)
|
||
}
|
||
if node.Name != "" {
|
||
_ = r.SetNodeLabel(ctx, nodeID, "node.name", node.Name)
|
||
}
|
||
// Mirror the typed MaxReplicasPerModel field as a label so the existing
|
||
// AND-selector machinery in ModelSchedulingConfig can target high-capacity
|
||
// nodes (e.g. {"node.replica-slots": "4"}). Always set it (default 1) so
|
||
// selectors don't have to special-case missing labels.
|
||
slots := node.MaxReplicasPerModel
|
||
if slots < 1 {
|
||
slots = 1
|
||
}
|
||
_ = r.SetNodeLabel(ctx, nodeID, "node.replica-slots", fmt.Sprintf("%d", slots))
|
||
}
|
||
|
||
// UpsertPendingBackendOp records or refreshes a pending backend operation for
|
||
// a node. If a row already exists for (nodeID, backend, op) we keep its
|
||
// Attempts/LastError but reset NextRetryAt to now, so reissuing the same
|
||
// delete/upgrade nudges it to the front of the queue instead of stacking a
|
||
// duplicate intent.
|
||
func (r *NodeRegistry) UpsertPendingBackendOp(ctx context.Context, nodeID, backend, op string, galleries []byte) error {
|
||
row := PendingBackendOp{
|
||
NodeID: nodeID,
|
||
Backend: backend,
|
||
Op: op,
|
||
Galleries: galleries,
|
||
NextRetryAt: time.Now(),
|
||
}
|
||
return r.db.WithContext(ctx).Clauses(clause.OnConflict{
|
||
Columns: []clause.Column{{Name: "node_id"}, {Name: "backend"}, {Name: "op"}},
|
||
DoUpdates: clause.AssignmentColumns([]string{"galleries", "next_retry_at"}),
|
||
}).Create(&row).Error
|
||
}
|
||
|
||
// ListDuePendingBackendOps returns queued ops whose NextRetryAt has passed
|
||
// AND whose node is currently healthy. The reconciler drains this list; we
|
||
// filter by node status in the query so a tick doesn't hammer NATS for
|
||
// nodes that obviously can't answer.
|
||
func (r *NodeRegistry) ListDuePendingBackendOps(ctx context.Context) ([]PendingBackendOp, error) {
|
||
var ops []PendingBackendOp
|
||
err := r.db.WithContext(ctx).
|
||
Joins("JOIN backend_nodes ON backend_nodes.id = pending_backend_ops.node_id").
|
||
Where("pending_backend_ops.next_retry_at <= ? AND backend_nodes.status = ?", time.Now(), StatusHealthy).
|
||
Order("pending_backend_ops.next_retry_at ASC").
|
||
Find(&ops).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("listing due pending backend ops: %w", err)
|
||
}
|
||
return ops, nil
|
||
}
|
||
|
||
// ListPendingBackendOps returns every queued row (for the UI "pending on N
|
||
// nodes" chip and the pre-delete ConfirmDialog).
|
||
func (r *NodeRegistry) ListPendingBackendOps(ctx context.Context) ([]PendingBackendOp, error) {
|
||
var ops []PendingBackendOp
|
||
if err := r.db.WithContext(ctx).Order("backend ASC, created_at ASC").Find(&ops).Error; err != nil {
|
||
return nil, fmt.Errorf("listing pending backend ops: %w", err)
|
||
}
|
||
return ops, nil
|
||
}
|
||
|
||
// DeletePendingBackendOp removes a queue row — called after the op succeeds.
|
||
func (r *NodeRegistry) DeletePendingBackendOp(ctx context.Context, id uint) error {
|
||
if err := r.db.WithContext(ctx).Delete(&PendingBackendOp{}, id).Error; err != nil {
|
||
return fmt.Errorf("deleting pending backend op %d: %w", id, err)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// DeleteStalePendingBackendOps garbage-collects pending backend ops whose target
|
||
// node can never drain them. ListDuePendingBackendOps only returns rows behind a
|
||
// StatusHealthy node, so ops behind a node that went offline or draining are
|
||
// otherwise never retried, aged out, or deleted — they leak forever and keep the
|
||
// UI operation spinning. Draining nodes are cleared immediately (an explicit
|
||
// admin action; their model rows are already purged). Offline nodes are cleared
|
||
// only once their last heartbeat is older than `grace`, so a brief heartbeat blip
|
||
// does not nuke an install that is still legitimately in flight. Returns the
|
||
// number of rows deleted.
|
||
func (r *NodeRegistry) DeleteStalePendingBackendOps(ctx context.Context, grace time.Duration) (int64, error) {
|
||
cutoff := time.Now().Add(-grace)
|
||
// Draining nodes are cleared immediately (admin action; model rows already
|
||
// purged). Offline AND unhealthy nodes are cleared only once their heartbeat
|
||
// is older than the grace window: a node marked unhealthy on a NATS
|
||
// ErrNoResponders never transitions to offline (health.go skips re-marking
|
||
// it), so without including unhealthy here its ops would leak exactly like
|
||
// the offline case. A node with a fresh heartbeat (last_heartbeat > cutoff)
|
||
// is recovering and keeps its op for retry.
|
||
res := r.db.WithContext(ctx).
|
||
Where(`node_id IN (SELECT id FROM backend_nodes WHERE status = ?)
|
||
OR node_id IN (SELECT id FROM backend_nodes WHERE status IN ? AND last_heartbeat <= ?)`,
|
||
StatusDraining, []string{StatusOffline, StatusUnhealthy}, cutoff).
|
||
Delete(&PendingBackendOp{})
|
||
if res.Error != nil {
|
||
return 0, fmt.Errorf("deleting stale pending backend ops: %w", res.Error)
|
||
}
|
||
if res.RowsAffected > 0 {
|
||
xlog.Info("Cleared pending backend ops behind non-healthy nodes", "deleted", res.RowsAffected)
|
||
}
|
||
return res.RowsAffected, nil
|
||
}
|
||
|
||
// RecordPendingBackendOpFailure bumps Attempts, captures the error, and
|
||
// pushes NextRetryAt out with exponential backoff capped at 15 minutes.
|
||
func (r *NodeRegistry) RecordPendingBackendOpFailure(ctx context.Context, id uint, errMsg string) error {
|
||
return r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
|
||
var row PendingBackendOp
|
||
if err := tx.First(&row, id).Error; err != nil {
|
||
return err
|
||
}
|
||
row.Attempts++
|
||
row.LastError = errMsg
|
||
row.NextRetryAt = time.Now().Add(backoffForAttempt(row.Attempts))
|
||
return tx.Save(&row).Error
|
||
})
|
||
}
|
||
|
||
// RecordPendingBackendOpInFlight is the "soft failure" cousin of
|
||
// RecordPendingBackendOpFailure. Used when a NATS install round-trip timed
|
||
// out but the worker is still installing in the background. Stores the
|
||
// message in LastError and pushes NextRetryAt out by `retryDelay` (typically
|
||
// the install timeout) so the reconciler does not immediately re-fire
|
||
// another install while the worker is still busy.
|
||
//
|
||
// Attempts is intentionally NOT incremented: an in-flight timeout is not a
|
||
// failed attempt, it is a still-in-progress one. Incrementing it would let a
|
||
// genuinely-progressing slow install (e.g. 30 GB CUDA image on Wi-Fi) trip
|
||
// the maxPendingBackendOpAttempts cap in the reconciler and dead-letter the
|
||
// row while the worker is still legitimately working.
|
||
func (r *NodeRegistry) RecordPendingBackendOpInFlight(ctx context.Context, id uint, lastError string, retryDelay time.Duration) error {
|
||
return r.db.WithContext(ctx).Model(&PendingBackendOp{}).
|
||
Where("id = ?", id).
|
||
Updates(map[string]any{
|
||
"last_error": lastError,
|
||
"next_retry_at": time.Now().Add(retryDelay),
|
||
}).Error
|
||
}
|
||
|
||
// backoffForAttempt is exponential from 30s doubling up to a 15m cap. The
|
||
// reconciler tick is 30s so anything shorter would just re-fire immediately.
|
||
func backoffForAttempt(attempts int) time.Duration {
|
||
const cap = 15 * time.Minute
|
||
base := 30 * time.Second
|
||
shift := attempts - 1
|
||
if shift < 0 {
|
||
shift = 0
|
||
}
|
||
if shift > 10 { // 2^10 * 30s already exceeds the cap
|
||
shift = 10
|
||
}
|
||
d := base << shift
|
||
if d > cap {
|
||
return cap
|
||
}
|
||
return d
|
||
}
|
||
|
||
// CountPendingBackendOpsByBackend returns a map of backend name to the count
|
||
// of pending rows. Used to decorate Manage → Backends with a "pending on N
|
||
// nodes" chip without exposing the full queue.
|
||
func (r *NodeRegistry) CountPendingBackendOpsByBackend(ctx context.Context) (map[string]int, error) {
|
||
type row struct {
|
||
Backend string
|
||
Count int
|
||
}
|
||
var rows []row
|
||
err := r.db.WithContext(ctx).Model(&PendingBackendOp{}).
|
||
Select("backend, COUNT(*) as count").
|
||
Group("backend").
|
||
Scan(&rows).Error
|
||
if err != nil {
|
||
return nil, fmt.Errorf("counting pending backend ops: %w", err)
|
||
}
|
||
out := make(map[string]int, len(rows))
|
||
for _, r := range rows {
|
||
out[r.Backend] = r.Count
|
||
}
|
||
return out, nil
|
||
}
|