mirror of
https://github.com/mudler/LocalAI.git
synced 2026-03-31 21:25:59 -04:00
feat: Add Free RPC to backend.proto for VRAM cleanup (#8751)
* fix: Add VRAM cleanup when stopping models
- Add Free() method to AIModel interface for proper GPU resource cleanup
- Implement Free() in llama backend to release llama.cpp model resources
- Add Free() stub implementations in base and SingleThread backends
- Modify deleteProcess() to call Free() before stopping the process
to ensure VRAM is properly released when models are unloaded
Fixes issue where VRAM was not freed when stopping models, which
could lead to memory exhaustion when running multiple models
sequentially.
* feat: Add Free RPC to backend.proto for VRAM cleanup

- Add `rpc Free(HealthMessage) returns (Result) {}` to backend.proto
- This RPC is required to properly expose the Free() method through the gRPC interface for VRAM resource cleanup

Refs: PR #8739
* Apply suggestion from @mudler
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
---------
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
This commit is contained in:
@@ -130,3 +130,6 @@ func memoryUsage() *pb.MemoryUsageData {
|
||||
}
|
||||
return &mud
|
||||
}
|
||||
// Free releases any resources held by the model.
// The base implementation is a no-op stub; per the commit notes, backends
// that actually hold GPU/VRAM resources (e.g. the llama backend) provide
// their own Free to release them when the model is unloaded.
func (llm *Base) Free() error {
	return nil
}
|
||||
|
||||
@@ -50,3 +50,10 @@ func (llm *SingleThread) Status() (pb.StatusResponse, error) {
|
||||
Memory: mud,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
||||
// Free releases resources for SingleThread backends.
// It simply delegates to the embedded Base implementation, which is a
// no-op; backends holding GPU/VRAM resources shadow this method with a
// real cleanup implementation.
func (llm *SingleThread) Free() error {
	return llm.Base.Free()
}
|
||||
|
||||
@@ -12,6 +12,7 @@ type AIModel interface {
|
||||
Predict(*pb.PredictOptions) (string, error)
|
||||
PredictStream(*pb.PredictOptions, chan string) error
|
||||
Load(*pb.ModelOptions) error
|
||||
Free() error
|
||||
Embeddings(*pb.PredictOptions) ([]float32, error)
|
||||
GenerateImage(*pb.GenerateImageRequest) error
|
||||
GenerateVideo(*pb.GenerateVideoRequest) error
|
||||
|
||||
@@ -46,6 +46,14 @@ func (ml *ModelLoader) deleteProcess(s string) error {
|
||||
|
||||
xlog.Debug("Deleting process", "model", s)
|
||||
|
||||
// Free GPU resources before stopping the process to ensure VRAM is released
|
||||
if freeFunc, ok := model.GRPC(false, ml.wd).(interface{ Free() error }); ok {
|
||||
xlog.Debug("Calling Free() to release GPU resources", "model", s)
|
||||
if err := freeFunc.Free(); err != nil {
|
||||
xlog.Warn("Error freeing GPU resources", "error", err, "model", s)
|
||||
}
|
||||
}
|
||||
|
||||
process := model.Process()
|
||||
if process == nil {
|
||||
xlog.Error("No process", "model", s)
|
||||
@@ -65,7 +73,6 @@ func (ml *ModelLoader) deleteProcess(s string) error {
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
|
||||
var err error = nil
|
||||
ml.mu.Lock()
|
||||
|
||||
Reference in New Issue
Block a user