diff --git a/pkg/grpc/grpcerrors/errors.go b/pkg/grpc/grpcerrors/errors.go index 724d63547..8cb57f416 100644 --- a/pkg/grpc/grpcerrors/errors.go +++ b/pkg/grpc/grpcerrors/errors.go @@ -58,6 +58,23 @@ func IsLiveTranscriptionUnsupported(err error) bool { return strings.Contains(strings.ToLower(err.Error()), "unimplemented") } +// IsUnimplemented reports whether err is a gRPC Unimplemented status — the +// signal a backend gives for an RPC it does not implement. The generated +// UnimplementedBackendServer stub returns exactly this for any RPC a backend +// (e.g. a Python or external backend) has not overridden, so callers can treat +// an optional RPC as a no-op rather than a failure. Prefers the typed status +// code and falls back to the message for paths that lose the status (e.g. errors +// wrapped across non-gRPC boundaries). +func IsUnimplemented(err error) bool { + if err == nil { + return false + } + if status.Code(err) == codes.Unimplemented { + return true + } + return strings.Contains(strings.ToLower(err.Error()), "unimplemented") +} + // StreamTranscriptionUnsupported returns the canonical error a backend returns // when it (or the loaded model) cannot serve the server-streaming // AudioTranscriptionStream RPC. It carries codes.Unimplemented like the live diff --git a/pkg/grpc/grpcerrors/errors_test.go b/pkg/grpc/grpcerrors/errors_test.go index 932633510..7ce668226 100644 --- a/pkg/grpc/grpcerrors/errors_test.go +++ b/pkg/grpc/grpcerrors/errors_test.go @@ -55,6 +55,18 @@ var _ = Describe("grpcerrors", func() { Expect(grpcerrors.IsModelNotLoaded(err)).To(BeFalse()) }) + DescribeTable("IsUnimplemented", + func(err error, want bool) { + Expect(grpcerrors.IsUnimplemented(err)).To(Equal(want)) + }, + Entry("nil", nil, false), + Entry("typed code", status.Error(codes.Unimplemented, "method Free not implemented"), true), + Entry("stale stub message (Unknown code)", errors.New("rpc error: code = Unimplemented desc = "), true), + Entry("unrelated error", errors.New("context deadline exceeded"), false), + Entry("unrelated grpc code", status.Error(codes.Unavailable, "connection refused"), false), + Entry("model not loaded is NOT unimplemented", grpcerrors.ModelNotLoaded("parakeet-cpp"), false), + ) + It("StreamTranscriptionUnsupported carries Unimplemented and is not ModelNotLoaded", func() { err := grpcerrors.StreamTranscriptionUnsupported("parakeet-cpp", "not a streaming model") Expect(status.Code(err)).To(Equal(codes.Unimplemented)) diff --git a/pkg/model/process.go b/pkg/model/process.go index 95e3e0758..8a796a4f5 100644 --- a/pkg/model/process.go +++ b/pkg/model/process.go @@ -11,6 +11,7 @@ import ( "time" "github.com/hpcloud/tail" + "github.com/mudler/LocalAI/pkg/grpc/grpcerrors" "github.com/mudler/LocalAI/pkg/signals" process "github.com/mudler/go-processmanager" "github.com/mudler/xlog" @@ -52,10 +53,21 @@ func (ml *ModelLoader) deleteProcess(s string) error { hook(s) } - // Free GPU resources before stopping the process to ensure VRAM is released + // Free GPU resources before stopping the process to ensure VRAM is released. + // Free is optional: backends that don't override it (the generated stub, many + // Python/external backends, or a federation proxy in distributed mode) return + // gRPC Unimplemented. That is expected, not a failure — VRAM is reclaimed when + // the process is stopped below, or by the remote unloader for remote backends — + // so don't surface it as an error. xlog.Debug("Calling Free() to release GPU resources", "model", s) if err := model.GRPC(false, ml.wd).Free(context.Background()); err != nil { - xlog.Warn("Error freeing GPU resources", "error", err, "model", s) + if grpcerrors.IsUnimplemented(err) { + xlog.Debug("Backend does not implement Free(); GPU release handled on process stop", "model", s) + } else { + // Now that the expected Unimplemented case is filtered out above, a + // remaining error is a genuine failure to release VRAM — surface it. + xlog.Error("Error freeing GPU resources", "error", err, "model", s) + } } process := model.Process()