From 4ec39bb7766423730619ae415a7d71c4e3f74fce Mon Sep 17 00:00:00 2001
From: Adira <dennisadira@gmail.com>
Date: Tue, 30 Jun 2026 23:14:01 +0300
Subject: [PATCH] fix(watchdog): don't log optional Free() as an error when
 backend returns Unimplemented (#10602) (#10607)

* fix(watchdog): don't log optional Free() as an error when backend returns Unimplemented (#10602)

When the watchdog evicts a model, deleteProcess calls the backend's gRPC
Free() to release VRAM before stopping the process. Free is optional:
backends that don't override it -- the generated UnimplementedBackendServer
stub, many Python/external backends, or a federation proxy in distributed
mode -- return gRPC Unimplemented. That is expected, not a failure: VRAM is
reclaimed when the local process is stopped, or by the remote unloader for
remote backends. Logging it as "WARN Error freeing GPU resources" made a
benign, optional RPC look like a fault (the alarming line in #10602, seen
in distributed mode where the model is remote and Free hits a stub).

Treat gRPC Unimplemented from Free() as a no-op logged at Debug; genuine
failures still Warn. Free() is still attempted for every backend, so any
backend that does implement it is unaffected.

Add a reusable grpcerrors.IsUnimplemented helper following the package's
existing code-based detection idiom (prefer the typed status code, fall
back to the message across non-gRPC boundaries), with table tests.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Adira Denis Muhando <dennisadira@gmail.com>

* fix(watchdog): log a non-Unimplemented Free() failure at error level

Per review: now that the expected gRPC Unimplemented case is split out and
logged at Debug, any remaining Free() error is a genuine failure to release
VRAM, so surface it at error level instead of warn.

Assisted-by: Claude:claude-opus-4-8 [Claude Code]

Signed-off-by: Adira Denis Muhando <dennisadira@gmail.com>

---------

Signed-off-by: Adira Denis Muhando <dennisadira@gmail.com>
---
 pkg/grpc/grpcerrors/errors.go      | 17 +++++++++++++++++
 pkg/grpc/grpcerrors/errors_test.go | 12 ++++++++++++
 pkg/model/process.go               | 16 ++++++++++++++--
 3 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/pkg/grpc/grpcerrors/errors.go b/pkg/grpc/grpcerrors/errors.go
index 724d63547..8cb57f416 100644
--- a/pkg/grpc/grpcerrors/errors.go
+++ b/pkg/grpc/grpcerrors/errors.go
@@ -58,6 +58,23 @@ func IsLiveTranscriptionUnsupported(err error) bool {
 	return strings.Contains(strings.ToLower(err.Error()), "unimplemented")
 }
 
+// IsUnimplemented reports whether err is a gRPC Unimplemented status — the
+// signal a backend gives for an RPC it does not implement. The generated
+// UnimplementedBackendServer stub returns exactly this for any RPC a backend
+// (e.g. a Python or external backend) has not overridden, so callers can treat
+// an optional RPC as a no-op rather than a failure. A nil err returns false.
+// Prefers the typed status code and falls back to a case-insensitive substring
+// match on the message for paths that lose the status (e.g. wrapped errors).
+func IsUnimplemented(err error) bool {
+	if err == nil {
+		return false
+	}
+	if status.Code(err) == codes.Unimplemented {
+		return true
+	}
+	return strings.Contains(strings.ToLower(err.Error()), "unimplemented") // broad on purpose: any message mentioning "unimplemented" matches, whatever its status code
+}
+
 // StreamTranscriptionUnsupported returns the canonical error a backend returns
 // when it (or the loaded model) cannot serve the server-streaming
 // AudioTranscriptionStream RPC. It carries codes.Unimplemented like the live
diff --git a/pkg/grpc/grpcerrors/errors_test.go b/pkg/grpc/grpcerrors/errors_test.go
index 932633510..7ce668226 100644
--- a/pkg/grpc/grpcerrors/errors_test.go
+++ b/pkg/grpc/grpcerrors/errors_test.go
@@ -55,6 +55,18 @@ var _ = Describe("grpcerrors", func() {
 		Expect(grpcerrors.IsModelNotLoaded(err)).To(BeFalse())
 	})
 
+	DescribeTable("IsUnimplemented",
+		func(err error, want bool) {
+			Expect(grpcerrors.IsUnimplemented(err)).To(Equal(want))
+		},
+		Entry("nil", nil, false),
+		Entry("typed code", status.Error(codes.Unimplemented, "method Free not implemented"), true),
+		Entry("stale stub message (Unknown code)", errors.New("rpc error: code = Unimplemented desc = "), true),
+		Entry("unrelated error", errors.New("context deadline exceeded"), false),
+		Entry("unrelated grpc code", status.Error(codes.Unavailable, "connection refused"), false),
+		Entry("model not loaded is NOT unimplemented", grpcerrors.ModelNotLoaded("parakeet-cpp"), false),
+	)
+
 	It("StreamTranscriptionUnsupported carries Unimplemented and is not ModelNotLoaded", func() {
 		err := grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp", "not a streaming model")
 		Expect(status.Code(err)).To(Equal(codes.Unimplemented))
diff --git a/pkg/model/process.go b/pkg/model/process.go
index 95e3e0758..8a796a4f5 100644
--- a/pkg/model/process.go
+++ b/pkg/model/process.go
@@ -11,6 +11,7 @@ import (
 	"time"
 
 	"github.com/hpcloud/tail"
+	"github.com/mudler/LocalAI/pkg/grpc/grpcerrors"
 	"github.com/mudler/LocalAI/pkg/signals"
 	process "github.com/mudler/go-processmanager"
 	"github.com/mudler/xlog"
@@ -52,10 +53,21 @@ func (ml *ModelLoader) deleteProcess(s string) error {
 		hook(s)
 	}
 
-	// Free GPU resources before stopping the process to ensure VRAM is released
+	// Free GPU resources before stopping the process to ensure VRAM is released.
+	// Free is optional: backends that don't override it (the generated stub, many
+	// Python/external backends, or a federation proxy in distributed mode) return
+	// gRPC Unimplemented. That is expected, not a failure — VRAM is reclaimed when
+	// the process is stopped below, or by the remote unloader for remote backends —
+	// so don't surface it as an error.
 	xlog.Debug("Calling Free() to release GPU resources", "model", s)
 	if err := model.GRPC(false, ml.wd).Free(context.Background()); err != nil {
-		xlog.Warn("Error freeing GPU resources", "error", err, "model", s)
+		if grpcerrors.IsUnimplemented(err) {
+			xlog.Debug("Backend does not implement Free(); GPU release handled on process stop", "model", s)
+		} else {
+			// Now that the expected Unimplemented case is filtered out above, a
+			// remaining error is a genuine failure to release VRAM — surface it.
+			xlog.Error("Error freeing GPU resources", "error", err, "model", s)
+		}
 	}
 
 	process := model.Process()