diff --git a/backend/backend.proto b/backend/backend.proto index 07cd095d2..be12dfab7 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -9,6 +9,7 @@ package backend; service Backend { rpc Health(HealthMessage) returns (Reply) {} + rpc Free(HealthMessage) returns (Result) {} rpc Predict(PredictOptions) returns (Reply) {} rpc LoadModel(ModelOptions) returns (Result) {} rpc PredictStream(PredictOptions) returns (stream Reply) {} diff --git a/backend/go/llm/llama/llama.go b/backend/go/llm/llama/llama.go index 011023fe7..ceca1fa5e 100644 --- a/backend/go/llm/llama/llama.go +++ b/backend/go/llm/llama/llama.go @@ -18,6 +18,20 @@ type LLM struct { draftModel *llama.LLama } + +// Free releases GPU resources and frees the llama model +// This should be called when the model is being unloaded to properly release VRAM +func (llm *LLM) Free() error { + if llm.llama != nil { + llm.llama.Free() + llm.llama = nil + } + if llm.draftModel != nil { + llm.draftModel.Free() + llm.draftModel = nil + } + return nil +} func (llm *LLM) Load(opts *pb.ModelOptions) error { ropeFreqBase := float32(10000) ropeFreqScale := float32(1) diff --git a/pkg/grpc/base/base.go b/pkg/grpc/base/base.go index 6a72cc95c..0a96f0b26 100644 --- a/pkg/grpc/base/base.go +++ b/pkg/grpc/base/base.go @@ -130,3 +130,6 @@ func memoryUsage() *pb.MemoryUsageData { } return &mud } +func (llm *Base) Free() error { + return nil +} diff --git a/pkg/grpc/base/singlethread.go b/pkg/grpc/base/singlethread.go index e5da73edf..b40a6a1f1 100644 --- a/pkg/grpc/base/singlethread.go +++ b/pkg/grpc/base/singlethread.go @@ -50,3 +50,10 @@ func (llm *SingleThread) Status() (pb.StatusResponse, error) { Memory: mud, }, nil } + + +// Free releases resources for SingleThread backends +// The base implementation does nothing, embedding types should override if needed +func (llm *SingleThread) Free() error { + return llm.Base.Free() +} diff --git a/pkg/grpc/interface.go b/pkg/grpc/interface.go index 9610b817e..333af8ab9
100644 --- a/pkg/grpc/interface.go +++ b/pkg/grpc/interface.go @@ -12,6 +12,7 @@ type AIModel interface { Predict(*pb.PredictOptions) (string, error) PredictStream(*pb.PredictOptions, chan string) error Load(*pb.ModelOptions) error + Free() error Embeddings(*pb.PredictOptions) ([]float32, error) GenerateImage(*pb.GenerateImageRequest) error GenerateVideo(*pb.GenerateVideoRequest) error diff --git a/pkg/model/process.go b/pkg/model/process.go index aab2e9aa3..e179a3265 100644 --- a/pkg/model/process.go +++ b/pkg/model/process.go @@ -46,6 +46,14 @@ func (ml *ModelLoader) deleteProcess(s string) error { xlog.Debug("Deleting process", "model", s) + // Free GPU resources before stopping the process to ensure VRAM is released + if freeFunc, ok := model.GRPC(false, ml.wd).(interface{ Free() error }); ok { + xlog.Debug("Calling Free() to release GPU resources", "model", s) + if err := freeFunc.Free(); err != nil { + xlog.Warn("Error freeing GPU resources", "error", err, "model", s) + } + } + process := model.Process() if process == nil { xlog.Error("No process", "model", s) @@ -65,7 +73,6 @@ func (ml *ModelLoader) deleteProcess(s string) error { return err } - func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error { var err error = nil ml.mu.Lock()