package modeladmin

import (
	"context"
	"fmt"
	"path/filepath"
	"strings"
	"time"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/pkg/system"
	"github.com/mudler/LocalAI/pkg/vram"
)

// VRAMRequest is the input for EstimateVRAM. JSON tags let the HTTP
// handler bind directly into this type instead of carrying a parallel
// private struct.
type VRAMRequest struct {
	// Model is the name used to look up the model configuration. Required.
	Model string `json:"model"`
	// ContextSize is the context length to estimate for. Zero means
	// "not specified": EstimateVRAM then falls back to the model config
	// and finally to a built-in default.
	ContextSize uint32 `json:"context_size,omitempty"`
	// GPULayers is forwarded unchanged to vram.EstimateOptions.
	GPULayers int `json:"gpu_layers,omitempty"`
	// KVQuantBits is forwarded unchanged to vram.EstimateOptions.
	KVQuantBits int `json:"kv_quant_bits,omitempty"`
}

// VRAMResponse embeds vram.EstimateResult and adds the context-defaulted
// note fields the HTTP endpoint surfaces.
type VRAMResponse struct {
	vram.EstimateResult
	// ContextNote is a human-readable remark about the estimate, set when
	// no weight files were found or when the default context size was used.
	ContextNote string `json:"context_note,omitempty"`
	// ModelMaxContext is copied from the estimator's ModelMaxContext result.
	ModelMaxContext uint64 `json:"model_max_context,omitempty"`
}

// EstimateVRAM computes a VRAM estimate for an installed model. It mirrors
// VRAMEstimateEndpoint without any HTTP coupling.
//
// The context length is chosen in this order: req.ContextSize when non-zero,
// then the model config's ContextSize when it is set and positive, then a
// default of 8192. A non-positive configured value is treated as unset so
// that a negative number cannot wrap around to a huge uint32.
//
// It returns ErrNameRequired when req.Model is empty and ErrNotFound when
// the loader has no config for that model. A failure of the underlying
// estimator is wrapped with the "vram estimate" prefix. When the model has
// no weight files, a non-nil response carrying only ContextNote is returned
// together with a nil error.
func EstimateVRAM(ctx context.Context, req VRAMRequest, cl *config.ModelConfigLoader, sysState *system.SystemState) (*VRAMResponse, error) {
	const (
		// defaultContextSize is used when neither the request nor the
		// model config provides a usable context length.
		defaultContextSize = 8192
		// estimateTimeout bounds the estimator call.
		estimateTimeout = 10 * time.Second
	)

	if req.Model == "" {
		return nil, ErrNameRequired
	}
	cfg, exists := cl.GetModelConfig(req.Model)
	if !exists {
		return nil, ErrNotFound
	}

	// Collect candidate weight files; addWeightFile filters out non-weight
	// entries and de-duplicates by resolved URI.
	modelsPath := sysState.Model.ModelsPath
	var files []vram.FileInput
	seen := make(map[string]bool)
	for _, f := range cfg.DownloadFiles {
		addWeightFile(string(f.URI), modelsPath, &files, seen)
	}
	if cfg.Model != "" {
		addWeightFile(cfg.Model, modelsPath, &files, seen)
	}
	if cfg.MMProj != "" {
		addWeightFile(cfg.MMProj, modelsPath, &files, seen)
	}
	if len(files) == 0 {
		// No weight files is not an error: return a response that carries
		// only an explanatory note, so every caller can report the same
		// message as an empty estimate.
		return &VRAMResponse{ContextNote: "no weight files found for estimation"}, nil
	}

	contextDefaulted := false
	ctxLen := req.ContextSize
	if ctxLen == 0 {
		if cfg.ContextSize != nil && *cfg.ContextSize > 0 {
			ctxLen = uint32(*cfg.ContextSize)
		} else {
			// Unset, zero or negative in the config: use the default and
			// remember that we did so the response can say so.
			ctxLen = defaultContextSize
			contextDefaulted = true
		}
	}

	opts := vram.EstimateOptions{
		GPULayers:   req.GPULayers,
		KVQuantBits: req.KVQuantBits,
	}

	subCtx, cancel := context.WithTimeout(ctx, estimateTimeout)
	defer cancel()

	multi, err := vram.EstimateMultiContext(subCtx, files, []uint32{ctxLen}, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
	if err != nil {
		return nil, fmt.Errorf("vram estimate: %w", err)
	}

	// Estimates is looked up by the decimal string form of the context
	// length; a missing entry yields the zero value.
	at := multi.Estimates[fmt.Sprint(ctxLen)]
	resp := &VRAMResponse{
		EstimateResult: vram.EstimateResult{
			SizeBytes:     multi.SizeBytes,
			SizeDisplay:   multi.SizeDisplay,
			ContextLength: at.ContextLength,
			VRAMBytes:     at.VRAMBytes,
			VRAMDisplay:   at.VRAMDisplay,
		},
		ModelMaxContext: multi.ModelMaxContext,
	}

	// Only attach the note when the default was used and the estimator
	// reported a maximum context to compare against.
	if contextDefaulted && multi.ModelMaxContext > 0 {
		resp.ContextNote = fmt.Sprintf(
			"Estimate used default context_size=%d. The model's trained maximum context is %d; VRAM usage will be higher at larger context sizes.",
			defaultContextSize,
			multi.ModelMaxContext,
		)
	}
	return resp, nil
}

// resolveModelURI converts a relative model path to a file:// URI so the
// size resolver can stat it on disk. URIs that already have a scheme
// (anything containing "://") are returned unchanged.
func resolveModelURI(uri, modelsPath string) string {
	if strings.Contains(uri, "://") {
		return uri
	}
	return "file://" + filepath.Join(modelsPath, uri)
}

// addWeightFile appends a resolved weight file to files. Entries that
// vram.IsWeightFile rejects are skipped, as are entries whose resolved URI
// is already recorded in seen. Size is left at 0 in the appended input.
func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, seen map[string]bool) {
	if !vram.IsWeightFile(uri) {
		return
	}
	resolved := resolveModelURI(uri, modelsPath)
	if seen[resolved] {
		return
	}
	seen[resolved] = true
	*files = append(*files, vram.FileInput{URI: resolved, Size: 0})
}