Files
LocalAI/pkg/grpc/base/singlethread.go
LocalAI [bot] 6e5a58ca70 feat: Add Free RPC to backend.proto for VRAM cleanup (#8751)
* fix: Add VRAM cleanup when stopping models

- Add Free() method to AIModel interface for proper GPU resource cleanup
- Implement Free() in llama backend to release llama.cpp model resources
- Add Free() stub implementations in base and SingleThread backends
- Modify deleteProcess() to call Free() before stopping the process
  to ensure VRAM is properly released when models are unloaded

Fixes issue where VRAM was not freed when stopping models, which
could lead to memory exhaustion when running multiple models
sequentially.

* feat: Add Free RPC to backend.proto for VRAM cleanup

- Add rpc Free(HealthMessage) returns (Result) {} to backend.proto
- This RPC is required to properly expose the Free() method
  through the gRPC interface for VRAM resource cleanup

Refs: PR #8739

* Apply suggestion from @mudler

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-03-03 12:39:06 +01:00

60 lines
1.3 KiB
Go

package base
import (
"sync"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
// SingleThread is the embeddable base for backends that do not support
// multiple concurrent requests: only one request is served at a time.
// This is useful for models that are not thread safe and cannot run
// multiple requests at the same time.
type SingleThread struct {
	Base
	// backendBusy is held for the duration of a request; Busy/Status
	// probe it with TryLock without blocking.
	backendBusy sync.Mutex
}
// Locking reports whether the backend needs to lock resources around
// each request. SingleThread backends always do.
func (llm *SingleThread) Locking() bool {
	return true
}
// Lock marks the backend as busy, blocking until any in-flight
// request has released the lock.
func (llm *SingleThread) Lock() {
	llm.backendBusy.Lock()
}
// Unlock releases the busy lock once the current request completes,
// allowing the next request to be served.
func (llm *SingleThread) Unlock() {
	llm.backendBusy.Unlock()
}
// Busy reports whether the backend is currently serving a request.
//
// It probes the request mutex non-blockingly: sync.Mutex.TryLock
// returns true when the lock is FREE (it was acquired by the probe),
// so the result must be negated. The original code returned TryLock's
// result directly, which inverted the meaning — Busy() was true for an
// idle backend, making Status() report BUSY when idle and READY when
// busy.
func (llm *SingleThread) Busy() bool {
	if llm.backendBusy.TryLock() {
		// Probe acquired the lock, so nothing else holds it: the
		// backend is idle. Release the probe lock immediately.
		llm.backendBusy.Unlock()
		return false
	}
	return true
}
// Status reports the backend's readiness state (READY or BUSY) along
// with current process memory usage.
//
// NOTE(review): backends may wish to call this to capture the gopsutil
// info, then enhance the result with additional memory usage details.
func (llm *SingleThread) Status() (pb.StatusResponse, error) {
	memory := memoryUsage()

	state := pb.StatusResponse_READY
	if llm.Busy() {
		state = pb.StatusResponse_BUSY
	}

	return pb.StatusResponse{State: state, Memory: memory}, nil
}
// Free releases resources held by the backend. SingleThread owns no
// resources of its own, so the call is simply forwarded to the
// embedded Base implementation; backends with real cleanup needs
// (e.g. VRAM release) should override this method.
func (llm *SingleThread) Free() error {
	return llm.Base.Free()
}