mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-21 23:29:04 -04:00
* feat(watchdog): add size-aware LRU eviction mode When the model count hits the LRU limit or the memory reclaimer fires, evict the largest model by on-disk file size first rather than the least-recently-used one. For GGUF models the file size is a reliable proxy for GPU/RAM footprint, so evicting the largest candidate maximises freed memory per eviction round while keeping small utility models (embeddings, classifiers, rerankers) resident. Changes: - `pkg/model/watchdog.go`: add `sizeAwareEviction` flag and `modelSizes map[string]int64` to `WatchDog`; sort candidates by `sizeBytes` desc (LRU time as tiebreaker) when the flag is set; add `RegisterModelSize`, `SetSizeAwareEviction`, `GetSizeAwareEviction` - `pkg/model/watchdog_options.go`: add `WithSizeAwareEviction` option - `pkg/model/initializers.go`: stat model file after load and call `RegisterModelSize` so size data is available before the first eviction - `core/config/application_config.go`, `runtime_settings.go`: add `SizeAwareEviction` field and `WithSizeAwareEviction` app option; expose via `ToRuntimeSettings` / `ApplyRuntimeSettings` for the `POST /api/settings` live-reload path - `core/cli/run.go`: add `--size-aware-eviction` flag / `LOCALAI_SIZE_AWARE_EVICTION` env var - `core/application/startup.go`, `watchdog.go`: wire the new option through to `NewWatchDog` - `pkg/model/watchdog_test.go`: 5 new specs — option enable, dynamic toggle, largest-first ordering, equal-size LRU tiebreaker, no-size fallback to LRU, and size-map cleanup on eviction Closes #9375 Signed-off-by: supermario_leo <leo.stack@outlook.com> * refactor(watchdog): use vram estimation scaffolding for model size Replace the brittle os.Stat(modelFile) approach with a proper call to pkg/vram, which handles multi-file models (DownloadFiles, MMProj) and all weight file types, not just single GGUF files. 
- Add estimateModelSizeBytes() in core/backend/options.go that collects all weight file URIs from the model config, resolves them to file:// URIs, and calls vram.Estimate() with the shared DefaultCachedSizeResolver (15-min TTL cache avoids redundant stat calls on repeated loads) - Thread the result through via a new WithModelSizeBytes() loader option - In initializers.go, consume the pre-computed size instead of calling os.Stat; if no size was supplied (e.g. for external/router-dispatched models) the registration is simply skipped Signed-off-by: supermario_leo <leo.stack@outlook.com> * refactor(watchdog): use EstimateModel with HF fallback for size estimation Switch estimateModelSizeBytes from calling vram.Estimate directly to the unified vram.EstimateModel entry point, which adds automatic fallbacks: file-based GGUF metadata → HF API → size string. Also extract the HuggingFace repo ID from model URIs (huggingface://, hf://, https://huggingface.co/ and org/model short-form) and pass it as ModelEstimateInput.HFRepo, so models not yet downloaded locally can still get a size estimate via the HF API. Addresses @mudler's review feedback: "better to rely on EstimateModel and pass by the HF URL of the model extracted from the URI". Signed-off-by: supermario_leo <leo.stack@outlook.com> * feat(webui): add Size-Aware Eviction toggle to settings page The size-aware eviction setting was wired through the CLI flag and the RuntimeSettings live-reload path (POST /api/settings) but had no handle on the React settings page, so it could not be toggled from the UI. Add a Size-Aware Eviction toggle to the Watchdog section, next to the existing Force Eviction When Busy / LRU eviction handles. The settings page loads and saves the whole RuntimeSettings object, so the new size_aware_eviction key is picked up with no extra plumbing. Addresses @mudler's review feedback: the application config setting should land on the same UI settings page as the other handles. 
Signed-off-by: supermario_leo <leo.stack@outlook.com> --------- Signed-off-by: supermario_leo <leo.stack@outlook.com>
112 lines
2.1 KiB
Go
112 lines
2.1 KiB
Go
package model
|
|
|
|
import (
|
|
"context"
|
|
|
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
|
)
|
|
|
|
type Options struct {
|
|
backendString string
|
|
model string
|
|
modelID string
|
|
context context.Context
|
|
|
|
gRPCOptions *pb.ModelOptions
|
|
|
|
externalBackends map[string]string
|
|
|
|
grpcAttempts int
|
|
grpcAttemptsDelay int
|
|
parallelRequests bool
|
|
|
|
// modelSizeBytes is the estimated total weight size in bytes, pre-computed
|
|
// by the caller using the vram estimation scaffolding. When non-zero it is
|
|
// registered with the watchdog so size-aware eviction can rank models.
|
|
modelSizeBytes int64
|
|
}
|
|
|
|
// Option is a function that mutates an Options value. NewOptions applies each
// supplied Option, in order, to the Options it is building.
type Option func(opts *Options)
|
|
|
|
var EnableParallelRequests = func(o *Options) {
|
|
o.parallelRequests = true
|
|
}
|
|
|
|
func WithExternalBackend(name string, uri string) Option {
|
|
return func(o *Options) {
|
|
if o.externalBackends == nil {
|
|
o.externalBackends = make(map[string]string)
|
|
}
|
|
o.externalBackends[name] = uri
|
|
}
|
|
}
|
|
|
|
func WithGRPCAttempts(attempts int) Option {
|
|
return func(o *Options) {
|
|
o.grpcAttempts = attempts
|
|
}
|
|
}
|
|
|
|
func WithGRPCAttemptsDelay(delay int) Option {
|
|
return func(o *Options) {
|
|
o.grpcAttemptsDelay = delay
|
|
}
|
|
}
|
|
|
|
func WithBackendString(backend string) Option {
|
|
return func(o *Options) {
|
|
o.backendString = backend
|
|
}
|
|
}
|
|
|
|
func WithDefaultBackendString(backend string) Option {
|
|
return func(o *Options) {
|
|
if o.backendString == "" {
|
|
o.backendString = backend
|
|
}
|
|
}
|
|
}
|
|
|
|
func WithModel(modelFile string) Option {
|
|
return func(o *Options) {
|
|
o.model = modelFile
|
|
}
|
|
}
|
|
|
|
func WithLoadGRPCLoadModelOpts(opts *pb.ModelOptions) Option {
|
|
return func(o *Options) {
|
|
o.gRPCOptions = opts
|
|
}
|
|
}
|
|
|
|
func WithContext(ctx context.Context) Option {
|
|
return func(o *Options) {
|
|
o.context = ctx
|
|
}
|
|
}
|
|
|
|
func WithModelID(id string) Option {
|
|
return func(o *Options) {
|
|
o.modelID = id
|
|
}
|
|
}
|
|
|
|
func WithModelSizeBytes(bytes int64) Option {
|
|
return func(o *Options) {
|
|
o.modelSizeBytes = bytes
|
|
}
|
|
}
|
|
|
|
func NewOptions(opts ...Option) *Options {
|
|
o := &Options{
|
|
gRPCOptions: &pb.ModelOptions{},
|
|
context: context.Background(),
|
|
grpcAttempts: 20,
|
|
grpcAttemptsDelay: 2,
|
|
}
|
|
for _, opt := range opts {
|
|
opt(o)
|
|
}
|
|
return o
|
|
}
|