Files
LocalAI/core/services/modeladmin/vram.go
Richard Palethorpe 969005b2a1 feat(gallery): Speed up load times and clean gallery entries (#9211)
* feat: Rework VRAM estimation and use known_usecases in gallery

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Assisted-by: Claude:claude-opus-4-7[1m] [Claude Code]

* chore(gallery): regenerate gallery index and add known_usecases to model entries

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-05-06 14:51:38 +02:00

132 lines
3.7 KiB
Go

package modeladmin
import (
"context"
"fmt"
"path/filepath"
"strings"
"time"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/system"
"github.com/mudler/LocalAI/pkg/vram"
)
// VRAMRequest is the input for EstimateVRAM. JSON tags let the HTTP
// handler bind directly into this type instead of carrying a parallel
// private struct.
type VRAMRequest struct {
	// Model is the name of the installed model to estimate for (required;
	// EstimateVRAM rejects an empty value).
	Model string `json:"model"`
	// ContextSize overrides the context length used for the estimate.
	// Zero means "use the model config's context size, or 8192 if the
	// config does not specify one".
	ContextSize uint32 `json:"context_size,omitempty"`
	// GPULayers is passed through to vram.EstimateOptions unchanged.
	GPULayers int `json:"gpu_layers,omitempty"`
	// KVQuantBits is passed through to vram.EstimateOptions unchanged.
	KVQuantBits int `json:"kv_quant_bits,omitempty"`
}
// VRAMResponse embeds vram.EstimateResult and adds the context-defaulted
// note fields the HTTP endpoint surfaces.
type VRAMResponse struct {
	vram.EstimateResult
	// ContextNote carries a human-readable caveat: either that the
	// estimate fell back to the default context size, or that no weight
	// files were found to estimate from.
	ContextNote string `json:"context_note,omitempty"`
	// ModelMaxContext is the model's trained maximum context length as
	// reported by the estimator; zero when unknown.
	ModelMaxContext uint64 `json:"model_max_context,omitempty"`
}
// EstimateVRAM computes a VRAM estimate for an installed model. It mirrors
// VRAMEstimateEndpoint without any HTTP coupling.
//
// Weight files are gathered from the model's download list plus its primary
// model and multimodal-projector paths, deduplicated by resolved URI. The
// context length comes from the request, falling back to the model config
// and finally to defaultContextSize. Returns ErrNameRequired for an empty
// model name and ErrNotFound for an unknown model.
func EstimateVRAM(ctx context.Context, req VRAMRequest, cl *config.ModelConfigLoader, sysState *system.SystemState) (*VRAMResponse, error) {
	// Single source of truth for the fallback context length; the note
	// message below interpolates it so the two can never drift apart.
	const defaultContextSize = 8192

	if req.Model == "" {
		return nil, ErrNameRequired
	}
	cfg, exists := cl.GetModelConfig(req.Model)
	if !exists {
		return nil, ErrNotFound
	}
	modelsPath := sysState.Model.ModelsPath

	// Collect unique weight files to feed the estimator.
	var files []vram.FileInput
	seen := make(map[string]bool)
	for _, f := range cfg.DownloadFiles {
		addWeightFile(string(f.URI), modelsPath, &files, seen)
	}
	if cfg.Model != "" {
		addWeightFile(cfg.Model, modelsPath, &files, seen)
	}
	if cfg.MMProj != "" {
		addWeightFile(cfg.MMProj, modelsPath, &files, seen)
	}
	if len(files) == 0 {
		// No weight files: the caller (HTTP or MCP) reports this as a
		// non-error empty estimate. Returning a typed nil here lets both
		// layers format the message consistently.
		return &VRAMResponse{ContextNote: "no weight files found for estimation"}, nil
	}

	contextDefaulted := false
	ctxLen := req.ContextSize
	if ctxLen == 0 {
		// Only trust a positive configured size: zero would be used
		// verbatim and a negative value would wrap around through the
		// uint32 conversion, so both fall back to the default instead.
		if cfg.ContextSize != nil && *cfg.ContextSize > 0 {
			ctxLen = uint32(*cfg.ContextSize)
		} else {
			ctxLen = defaultContextSize
			contextDefaulted = true
		}
	}

	opts := vram.EstimateOptions{
		GPULayers:   req.GPULayers,
		KVQuantBits: req.KVQuantBits,
	}
	// Bound the estimation work (file stats / GGUF header reads) so a slow
	// resolver cannot hang the caller.
	subCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	multi, err := vram.EstimateMultiContext(subCtx, files, []uint32{ctxLen}, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
	if err != nil {
		return nil, fmt.Errorf("vram estimate: %w", err)
	}
	// Estimates is keyed by the decimal string of the context length.
	at := multi.Estimates[fmt.Sprint(ctxLen)]
	resp := &VRAMResponse{
		EstimateResult: vram.EstimateResult{
			SizeBytes:     multi.SizeBytes,
			SizeDisplay:   multi.SizeDisplay,
			ContextLength: at.ContextLength,
			VRAMBytes:     at.VRAMBytes,
			VRAMDisplay:   at.VRAMDisplay,
		},
		ModelMaxContext: multi.ModelMaxContext,
	}
	if contextDefaulted && multi.ModelMaxContext > 0 {
		resp.ContextNote = fmt.Sprintf(
			"Estimate used default context_size=%d. The model's trained maximum context is %d; VRAM usage will be higher at larger context sizes.",
			defaultContextSize,
			multi.ModelMaxContext,
		)
	}
	return resp, nil
}
// resolveModelURI converts a local model path to a file:// URI so the
// size resolver can stat it on disk. URIs that already have a scheme are
// returned unchanged, and absolute paths are used as-is — previously an
// absolute path was incorrectly re-rooted under modelsPath by the Join.
func resolveModelURI(uri, modelsPath string) string {
	if strings.Contains(uri, "://") {
		// Already a URI (http://, https://, file://, ...).
		return uri
	}
	if filepath.IsAbs(uri) {
		// Absolute local path: must not be joined onto modelsPath.
		return "file://" + uri
	}
	return "file://" + filepath.Join(modelsPath, uri)
}
// addWeightFile appends a weight file to files with its URI resolved
// against modelsPath. Non-weight URIs are ignored, and duplicates (after
// resolution) are skipped via the seen set.
func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, seen map[string]bool) {
	// Only GGUF-style weight files are relevant to the estimator.
	if !vram.IsWeightFile(uri) {
		return
	}
	key := resolveModelURI(uri, modelsPath)
	if !seen[key] {
		seen[key] = true
		*files = append(*files, vram.FileInput{URI: key, Size: 0})
	}
}