From 4b66c3ad45bbd2b13472bd0178e1bc45ccaa7641 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Fri, 22 May 2026 20:32:59 +0000 Subject: [PATCH] fix(distributed): don't increment Attempts on in-flight install timeout An in-flight timeout (worker still pulling the OCI image) is not a failed attempt, it's a delayed one. Incrementing Attempts let genuinely-progressing slow installs (e.g. 30 GB CUDA images on Wi-Fi) trip the reconciler's maxPendingBackendOpAttempts cap and dead-letter the queue row while the worker was still legitimately working. RecordPendingBackendOpInFlight now only updates LastError and NextRetryAt. Also documents "running_on_worker" in the NodeOpStatus.Status enum comment so Task 6 implementers see the full surface. Signed-off-by: Ettore Di Giacinto --- core/services/nodes/managers_distributed.go | 2 +- core/services/nodes/registry.go | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/core/services/nodes/managers_distributed.go b/core/services/nodes/managers_distributed.go index 94fee2997..c605ee600 100644 --- a/core/services/nodes/managers_distributed.go +++ b/core/services/nodes/managers_distributed.go @@ -75,7 +75,7 @@ func NewDistributedBackendManager(appConfig *config.ApplicationConfig, ml *model type NodeOpStatus struct { NodeID string `json:"node_id"` NodeName string `json:"node_name"` - Status string `json:"status"` // "success" | "queued" | "error" + Status string `json:"status"` // "success" | "queued" | "error" | "running_on_worker" Error string `json:"error,omitempty"` } diff --git a/core/services/nodes/registry.go b/core/services/nodes/registry.go index 5869f8e6a..ed742d599 100644 --- a/core/services/nodes/registry.go +++ b/core/services/nodes/registry.go @@ -1517,15 +1517,20 @@ func (r *NodeRegistry) RecordPendingBackendOpFailure(ctx context.Context, id uin // RecordPendingBackendOpInFlight is the "soft failure" cousin of // RecordPendingBackendOpFailure. Used when a NATS install round-trip timed -// out but the worker is still installing in the background. Increments -// Attempts and stores the message in LastError, but pushes NextRetryAt out -// by `retryDelay` (typically the install timeout) so the reconciler does -// not immediately re-fire another install while the worker is still busy. +// out but the worker is still installing in the background. Stores the +// message in LastError and pushes NextRetryAt out by `retryDelay` (typically +// the install timeout) so the reconciler does not immediately re-fire +// another install while the worker is still busy. +// +// Attempts is intentionally NOT incremented: an in-flight timeout is not a +// failed attempt, it is a still-in-progress one. Incrementing it would let a +// genuinely-progressing slow install (e.g. 30 GB CUDA image on Wi-Fi) trip +// the maxPendingBackendOpAttempts cap in the reconciler and dead-letter the +// row while the worker is still legitimately working. func (r *NodeRegistry) RecordPendingBackendOpInFlight(ctx context.Context, id uint, lastError string, retryDelay time.Duration) error { return r.db.WithContext(ctx).Model(&PendingBackendOp{}). Where("id = ?", id). Updates(map[string]any{ - "attempts": gorm.Expr("attempts + 1"), "last_error": lastError, "next_retry_at": time.Now().Add(retryDelay), }).Error