fix(backend): don't let a client disconnect cancel the model load

Image generation (and the tts/transcript/embeddings/vad/rerank/llm helpers) passes the request context to loader.Load so distributed routing decisions reach the request's X-LocalAI-Node holder. That context also governs cancellation of the load, so when a client disconnects mid-load the LoadModel RPC is aborted, stopLoadProcess tears down the backend process, and every retry restarts from scratch. Heavy diffusers/LLM models on a slow host (e.g. a shared-memory iGPU) take long enough to load that the request routinely ends first, so the model never finishes loading and the UI shows "NetworkError when attempting to fetch resource".

Wrap the load context with context.WithoutCancel: the routing holder value still propagates, but the request's cancellation no longer aborts the load, so it runs to completion and caches for the next request. Inference keeps the cancellable request context, so a disconnect still stops generation.

Add a regression spec asserting that a canceled request context does not cancel the model load while the routing holder still reaches the router.

Fixes #10636

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
2026-07-03 04:46:54 -04:00 · 2026-07-02 20:52:51 +00:00
8 changed files with 84 additions and 26 deletions
--- a/core/backend/ctx_propagation_test.go
+++ b/core/backend/ctx_propagation_test.go
@@ -157,6 +157,33 @@ var _ = Describe("X-LocalAI-Node ctx propagation contract", func() {
 		stampViaRouterCtx()
 	})

+	// Regression for #10636: a canceled request context must NOT cancel the
+	// model LOAD. The heavy image/audio backends bind the load to the request
+	// context so the routing holder reaches the SmartRouter; but a large
+	// diffusers/LLM model on a slow (e.g. shared-memory iGPU) host can take
+	// far longer to load than the client stays connected. If the request's
+	// cancellation propagates to the load, the LoadModel RPC is aborted, the
+	// backend process is torn down, and every retry restarts from scratch and
+	// never converges. The load must instead run to completion and cache while
+	// still carrying the request's routing holder value.
+	It("ImageGeneration does not propagate request cancellation to the model load", func() {
+		canceledCtx, cancel := context.WithCancel(reqCtx)
+		cancel() // canceled before the call: the client is already gone by the time the load starts
+
+		_, err := backend.ImageGeneration(canceledCtx, 64, 64, 1, 0, "p", "", "", "/tmp/out.png", loader, modelCfg, appCfg, nil)
+		// Getting the router's short-circuit sentinel back means the load got as
+		// far as the router instead of bailing out on the canceled request ctx.
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("router short-circuit (test)"))
+
+		routerCtx := routerCtxOf()
+		Expect(routerCtx).ToNot(BeNil(), "router callback must have been invoked")
+		Expect(routerCtx.Err()).To(BeNil(),
+			"a canceled request must not cancel the model load")
+		// The routing holder value must still reach the router despite the decoupling.
+		stampViaRouterCtx()
+	})
+
 	It("does NOT leak the holder when the app context is used instead", func() {
 		// Sanity: the bug being fixed manifests as the router getting
 		// appCfg.Context (no holder) instead of reqCtx (holder). A direct
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -40,10 +40,14 @@ func (e *modelEmbedder) Embed(ctx context.Context, text string) ([]float32, erro

 func ModelEmbedding(ctx context.Context, s string, tokens []int, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {

-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops the embedding computation.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))

 	inferenceModel, err := loader.Load(opts...)
 	if err != nil {
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -13,10 +13,14 @@ import (

 func ImageGeneration(ctx context.Context, height, width, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, refImages []string) (func() error, error) {

-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	inferenceModel, err := loader.Load(
 		opts...,
 	)
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -111,7 +111,12 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
 	}
 	ctx = distributedhdr.MaybeWithPrefixChain(ctx, c.ModelID(), chainSource)

-	opts := ModelOptions(*c, o, model.WithContext(ctx))
+	// context.WithoutCancel decouples the model load from the request's
+	// cancellation while preserving its routing values, so a slow load still
+	// completes and caches if the client disconnects instead of aborting the
+	// LoadModel RPC mid-load (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(*c, o, model.WithContext(context.WithoutCancel(ctx)))
 	inferenceModel, err := loader.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(o, c.Name, c.Backend, err, map[string]any{"model_file": modelFile})
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -57,10 +57,14 @@ func (r *modelReranker) Rerank(ctx context.Context, query string, documents []st
 }

 func Rerank(ctx context.Context, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, modelConfig config.ModelConfig) (*proto.RerankResult, error) {
-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops reranking.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	rerankModel, err := loader.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -45,10 +45,14 @@ func loadTranscriptionModel(ctx context.Context, ml *model.ModelLoader, modelCon
 	if modelConfig.Backend == "" {
 		modelConfig.Backend = model.WhisperBackend
 	}
-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops transcription.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	transcriptionModel, err := ml.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -50,10 +50,14 @@ func ModelTTS(
 	appConfig *config.ApplicationConfig,
 	modelConfig config.ModelConfig,
 ) (string, *proto.Result, error) {
-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops generation.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	ttsModel, err := loader.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
@@ -153,7 +157,9 @@ func ModelTTSStream(
 	modelConfig config.ModelConfig,
 	audioCallback func([]byte) error,
 ) error {
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// See ModelTTS above: WithoutCancel decouples the load from request
+	// cancellation while preserving routing values (issue #10636).
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	ttsModel, err := loader.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
--- a/core/backend/vad.go
+++ b/core/backend/vad.go
@@ -14,10 +14,14 @@ func VAD(request *schema.VADRequest,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	modelConfig config.ModelConfig) (*schema.VADResponse, error) {
-	// model.WithContext(ctx) overrides the app-context default set in
-	// ModelOptions so distributed routing decisions reach the request's
-	// X-LocalAI-Node holder via distributedhdr.Stamp.
-	opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
+	// model.WithContext carries the request context into the load so distributed
+	// routing decisions reach the request's X-LocalAI-Node holder via
+	// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
+	// the request's cancellation, so a slow first load still completes and
+	// caches if the client disconnects instead of aborting the LoadModel RPC and
+	// tearing down the backend process (issue #10636). Inference below keeps the
+	// cancellable ctx, so a disconnect still stops voice-activity detection.
+	opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
 	vadModel, err := ml.Load(opts...)
 	if err != nil {
 		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)