fix(distributed): don't increment Attempts on in-flight install timeout

An in-flight timeout (worker still pulling the OCI image) is not a failed attempt; it is a delayed one. Incrementing Attempts let genuinely progressing slow installs (e.g. 30 GB CUDA images on Wi-Fi) trip the reconciler's maxPendingBackendOpAttempts cap and dead-letter the queue row while the worker was still legitimately working. RecordPendingBackendOpInFlight now updates only LastError and NextRetryAt. Also documents "running_on_worker" in the NodeOpStatus.Status enum comment so Task 6 implementers see the full surface. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-24 00:26:34 -04:00 · 2026-05-22 20:32:59 +00:00
parent 169ff75633
commit 4b66c3ad45
2 changed files with 11 additions and 6 deletions
--- a/core/services/nodes/managers_distributed.go
+++ b/core/services/nodes/managers_distributed.go
@@ -75,7 +75,7 @@ func NewDistributedBackendManager(appConfig *config.ApplicationConfig, ml *model
 type NodeOpStatus struct {
 	NodeID   string `json:"node_id"`
 	NodeName string `json:"node_name"`
-	Status   string `json:"status"` // "success" | "queued" | "error"
+	Status   string `json:"status"` // "success" | "queued" | "error" | "running_on_worker"
 	Error    string `json:"error,omitempty"`
 }

--- a/core/services/nodes/registry.go
+++ b/core/services/nodes/registry.go
@@ -1517,15 +1517,20 @@ func (r *NodeRegistry) RecordPendingBackendOpFailure(ctx context.Context, id uin

 // RecordPendingBackendOpInFlight is the "soft failure" cousin of
 // RecordPendingBackendOpFailure. Used when a NATS install round-trip timed
-// out but the worker is still installing in the background. Increments
-// Attempts and stores the message in LastError, but pushes NextRetryAt out
-// by `retryDelay` (typically the install timeout) so the reconciler does
-// not immediately re-fire another install while the worker is still busy.
+// out but the worker is still installing in the background. Stores the
+// message in LastError and pushes NextRetryAt out by `retryDelay` (typically
+// the install timeout) so the reconciler does not immediately re-fire
+// another install while the worker is still busy.
+//
+// Attempts is intentionally NOT incremented: an in-flight timeout is not a
+// failed attempt, it is a still-in-progress one. Incrementing it would let a
+// genuinely-progressing slow install (e.g. 30 GB CUDA image on Wi-Fi) trip
+// the maxPendingBackendOpAttempts cap in the reconciler and dead-letter the
+// row while the worker is still legitimately working.
 func (r *NodeRegistry) RecordPendingBackendOpInFlight(ctx context.Context, id uint, lastError string, retryDelay time.Duration) error {
 	return r.db.WithContext(ctx).Model(&PendingBackendOp{}).
 		Where("id = ?", id).
 		Updates(map[string]any{
-			"attempts":      gorm.Expr("attempts + 1"),
 			"last_error":    lastError,
 			"next_retry_at": time.Now().Add(retryDelay),
 		}).Error