From 6e5a58ca704c88cb93d602c8ec4a1dd2bb216e2b Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Tue, 3 Mar 2026 12:39:06 +0100
Subject: [PATCH] feat: Add Free RPC to backend.proto for VRAM cleanup (#8751)

* fix: Add VRAM cleanup when stopping models

- Add Free() method to the AIModel interface for proper GPU resource cleanup
- Implement Free() in the llama backend to release llama.cpp model resources
- Add Free() stub implementations in the base and SingleThread backends
- Modify deleteProcess() to call Free() before stopping the process to
  ensure VRAM is properly released when models are unloaded

Fixes an issue where VRAM was not freed when stopping models, which could
lead to memory exhaustion when running multiple models sequentially.

* feat: Add Free RPC to backend.proto for VRAM cleanup

- Add rpc Free(HealthMessage) returns (Result) {} to backend.proto
- This RPC is required to properly expose the Free() method
  through the gRPC interface for VRAM resource cleanup

Refs: PR #8739

* Apply suggestion from @mudler

Signed-off-by: Ettore Di Giacinto

---------

Signed-off-by: Ettore Di Giacinto
Co-authored-by: localai-bot
Co-authored-by: Ettore Di Giacinto
---
 backend/backend.proto         |  1 +
 backend/go/llm/llama/llama.go | 14 ++++++++++++++
 pkg/grpc/base/base.go         |  3 +++
 pkg/grpc/base/singlethread.go |  7 +++++++
 pkg/grpc/interface.go         |  1 +
 pkg/model/process.go          |  9 ++++++++-
 6 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/backend/backend.proto b/backend/backend.proto
index 07cd095d2..be12dfab7 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -9,6 +9,7 @@ package backend;
 
 service Backend {
   rpc Health(HealthMessage) returns (Reply) {}
+  rpc Free(HealthMessage) returns (Result) {}
   rpc Predict(PredictOptions) returns (Reply) {}
   rpc LoadModel(ModelOptions) returns (Result) {}
   rpc PredictStream(PredictOptions) returns (stream Reply) {}
diff --git a/backend/go/llm/llama/llama.go b/backend/go/llm/llama/llama.go
index 011023fe7..ceca1fa5e 100644
--- a/backend/go/llm/llama/llama.go
+++ b/backend/go/llm/llama/llama.go
@@ -18,6 +18,20 @@ type LLM struct {
 	draftModel *llama.LLama
 }
 
+
+// Free releases GPU resources and frees the llama model.
+// This should be called when the model is being unloaded to properly release VRAM.
+func (llm *LLM) Free() error {
+	if llm.llama != nil {
+		llm.llama.Free()
+		llm.llama = nil
+	}
+	if llm.draftModel != nil {
+		llm.draftModel.Free()
+		llm.draftModel = nil
+	}
+	return nil
+}
 func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	ropeFreqBase := float32(10000)
 	ropeFreqScale := float32(1)
diff --git a/pkg/grpc/base/base.go b/pkg/grpc/base/base.go
index 6a72cc95c..0a96f0b26 100644
--- a/pkg/grpc/base/base.go
+++ b/pkg/grpc/base/base.go
@@ -130,3 +130,6 @@ func memoryUsage() *pb.MemoryUsageData {
 	}
 	return &mud
 }
+func (llm *Base) Free() error {
+	return nil
+}
diff --git a/pkg/grpc/base/singlethread.go b/pkg/grpc/base/singlethread.go
index e5da73edf..b40a6a1f1 100644
--- a/pkg/grpc/base/singlethread.go
+++ b/pkg/grpc/base/singlethread.go
@@ -50,3 +50,10 @@ func (llm *SingleThread) Status() (pb.StatusResponse, error) {
 		Memory: mud,
 	}, nil
 }
+
+
+// Free releases resources for SingleThread backends.
+// The base implementation does nothing; derived backends should override it if needed.
+func (llm *SingleThread) Free() error {
+	return llm.Base.Free()
+}
diff --git a/pkg/grpc/interface.go b/pkg/grpc/interface.go
index 9610b817e..333af8ab9 100644
--- a/pkg/grpc/interface.go
+++ b/pkg/grpc/interface.go
@@ -12,6 +12,7 @@ type AIModel interface {
 	Predict(*pb.PredictOptions) (string, error)
 	PredictStream(*pb.PredictOptions, chan string) error
 	Load(*pb.ModelOptions) error
+	Free() error
 	Embeddings(*pb.PredictOptions) ([]float32, error)
 	GenerateImage(*pb.GenerateImageRequest) error
 	GenerateVideo(*pb.GenerateVideoRequest) error
diff --git a/pkg/model/process.go b/pkg/model/process.go
index aab2e9aa3..e179a3265 100644
--- a/pkg/model/process.go
+++ b/pkg/model/process.go
@@ -46,6 +46,14 @@ func (ml *ModelLoader) deleteProcess(s string) error {
 
 	xlog.Debug("Deleting process", "model", s)
 
+	// Free GPU resources before stopping the process to ensure VRAM is released
+	if freeFunc, ok := model.GRPC(false, ml.wd).(interface{ Free() error }); ok {
+		xlog.Debug("Calling Free() to release GPU resources", "model", s)
+		if err := freeFunc.Free(); err != nil {
+			xlog.Warn("Error freeing GPU resources", "error", err, "model", s)
+		}
+	}
+
 	process := model.Process()
 	if process == nil {
 		xlog.Error("No process", "model", s)
@@ -65,7 +73,6 @@ func (ml *ModelLoader) deleteProcess(s string) error {
 	return err
 }
 
-
 func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
 	var err error = nil
 	ml.mu.Lock()
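
Note: the new rpc Free(HealthMessage) returns (Result) line in backend.proto only takes effect
once the Go stubs are regenerated (protoc with the go and go-grpc plugins). The sketch below
shows how a caller might invoke the new RPC through the regenerated client. It is illustrative
only: the pb import path, the NewBackendClient constructor, the placeholder address, and the
Result field accessors are assumptions based on the repository layout, not verified against the
generated code.

package main

import (
	"context"
	"log"
	"time"

	// Assumed import path for the stubs generated from backend.proto.
	pb "github.com/mudler/LocalAI/pkg/grpc/proto"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	// Connect to a running backend process; the address is a placeholder.
	conn, err := grpc.NewClient("127.0.0.1:50051",
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatalf("connect: %v", err)
	}
	defer conn.Close()

	client := pb.NewBackendClient(conn)

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// Free mirrors Health's request type: it takes a HealthMessage and
	// returns a Result, per the rpc added to backend.proto above.
	res, err := client.Free(ctx, &pb.HealthMessage{})
	if err != nil {
		log.Fatalf("Free RPC failed: %v", err)
	}
	log.Printf("freed: success=%v message=%q", res.GetSuccess(), res.GetMessage())
}

On the server side no extra wiring should be needed beyond this patch: the gRPC service wrapper
can forward Free to the AIModel implementation, and the Base/SingleThread stubs ensure every
backend satisfies the interface even when it has no GPU resources to release.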