package vram

import (
	"context"
	"fmt"
	"path"
	"strings"

	"github.com/mudler/LocalAI/pkg/downloader"
	"github.com/mudler/xlog"
)

// weightExts is the set of lower-case file extensions that are counted as
// model weight files.
var weightExts = map[string]bool{
	".gguf":        true,
	".safetensors": true,
	".bin":         true,
	".pt":          true,
}

// IsWeightFile reports whether the last path element of nameOrURI has one of
// the extensions in weightExts. The comparison is case-insensitive.
func IsWeightFile(nameOrURI string) bool {
	ext := strings.ToLower(path.Ext(path.Base(nameOrURI)))
	return weightExts[ext]
}

// IsGGUF reports whether the last path element of nameOrURI ends in ".gguf"
// (case-insensitive).
func IsGGUF(nameOrURI string) bool {
	return strings.ToLower(path.Ext(path.Base(nameOrURI))) == ".gguf"
}

// modelProfile captures the "fixed" properties of a model after I/O.
// Everything except context length is constant for a given model.
type modelProfile struct {
	sizeBytes uint64    // total weight file size
	ggufSize  uint64    // GGUF file size (subset of sizeBytes)
	meta      *GGUFMeta // nil if no GGUF metadata available
}

// resolveProfile does all I/O: iterates files, fetches sizes and GGUF metadata.
//
// Non-weight files are ignored. A file whose size is not already known
// (Size <= 0) is looked up through sizeResolver when one is supplied; files
// whose lookup fails are skipped. Metadata is read from the first GGUF file
// seen, and only when the accumulated GGUF size is non-zero. Both lookups are
// best-effort: failures are logged at debug level and never abort the scan.
func resolveProfile(ctx context.Context, files []FileInput, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) modelProfile {
	var p modelProfile
	var firstGGUFURI string
	for i := range files {
		f := &files[i]
		if !IsWeightFile(f.URI) {
			continue
		}
		sz := f.Size
		if sz <= 0 && sizeResolver != nil {
			var err error
			sz, err = sizeResolver.ContentLength(ctx, f.URI)
			if err != nil {
				xlog.Debug("weight file size lookup failed", "error", err, "uri", f.URI)
				continue
			}
		}
		// A negative size (unknown length reported by the resolver, or a
		// negative Size with no resolver) would wrap around to an enormous
		// value when converted to uint64, so treat it as zero bytes.
		if sz < 0 {
			sz = 0
		}
		p.sizeBytes += uint64(sz)
		if IsGGUF(f.URI) {
			p.ggufSize += uint64(sz)
			if firstGGUFURI == "" {
				firstGGUFURI = f.URI
			}
		}
	}
	if p.ggufSize > 0 && ggufReader != nil && firstGGUFURI != "" {
		// Metadata only refines the estimate; without it callers fall back to
		// a size-only heuristic, so a read failure is not fatal.
		meta, err := ggufReader.ReadMetadata(ctx, firstGGUFURI)
		if err != nil {
			xlog.Debug("GGUF metadata read failed", "error", err, "uri", firstGGUFURI)
		} else {
			p.meta = meta
		}
	}
	return p
}

// computeVRAM is pure arithmetic — no I/O. Returns VRAM bytes for a given
// model profile and context length.
func computeVRAM(p modelProfile, ctxLen uint32, opts EstimateOptions) uint64 { kvQuantBits := opts.KVQuantBits if kvQuantBits == 0 { kvQuantBits = 16 } if p.ggufSize > 0 { meta := p.meta if meta != nil && (meta.BlockCount > 0 || meta.EmbeddingLength > 0) { nLayers := meta.BlockCount if nLayers == 0 { nLayers = 32 } dModel := meta.EmbeddingLength if dModel == 0 { dModel = 4096 } headCountKV := meta.HeadCountKV if headCountKV == 0 { headCountKV = meta.HeadCount } if headCountKV == 0 { headCountKV = 8 } gpuLayers := opts.GPULayers if gpuLayers <= 0 { gpuLayers = int(nLayers) } bKV := uint32(kvQuantBits / 8) if bKV == 0 { bKV = 4 } M_model := p.ggufSize M_KV := uint64(bKV) * uint64(dModel) * uint64(headCountKV) * uint64(ctxLen) P := M_model * 2 M_overhead := uint64(0.02*float64(P) + 0.15*1e9) vramBytes := M_model + M_KV + M_overhead if nLayers > 0 && gpuLayers < int(nLayers) { layerRatio := float64(gpuLayers) / float64(nLayers) vramBytes = uint64(layerRatio*float64(M_model)) + M_KV + M_overhead } return vramBytes } return sizeOnlyVRAM(p.ggufSize, ctxLen) } if p.sizeBytes > 0 { return sizeOnlyVRAM(p.sizeBytes, ctxLen) } return 0 } func sizeOnlyVRAM(sizeOnDisk uint64, ctxLen uint32) uint64 { k := uint64(1024) vram := sizeOnDisk + k*uint64(ctxLen)*2 if vram < sizeOnDisk { vram = sizeOnDisk } return vram } // buildEstimates computes VRAMAt entries for each context size from a profile. func buildEstimates(p modelProfile, contextSizes []uint32, opts EstimateOptions) map[string]VRAMAt { m := make(map[string]VRAMAt, len(contextSizes)) for _, ctxLen := range contextSizes { vramBytes := computeVRAM(p, ctxLen, opts) m[fmt.Sprint(ctxLen)] = VRAMAt{ ContextLength: ctxLen, VRAMBytes: vramBytes, VRAMDisplay: FormatBytes(vramBytes), } } return m } // EstimateMultiContext estimates model size and VRAM at multiple context sizes. // It performs I/O once (resolveProfile) then computes VRAM for each context size. 
func EstimateMultiContext(ctx context.Context, files []FileInput, contextSizes []uint32, opts EstimateOptions, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) (MultiContextEstimate, error) { if len(contextSizes) == 0 { contextSizes = []uint32{8192} } p := resolveProfile(ctx, files, sizeResolver, ggufReader) result := MultiContextEstimate{ SizeBytes: p.sizeBytes, SizeDisplay: FormatBytes(p.sizeBytes), Estimates: buildEstimates(p, contextSizes, opts), } if p.meta != nil && p.meta.MaximumContextLength > 0 { result.ModelMaxContext = p.meta.MaximumContextLength } return result, nil } // ParseSizeString parses a human-readable size string (e.g. "500MB", "14.5 GB", "2tb") // into bytes. Supports B, KB, MB, GB, TB, PB (case-insensitive, space optional). // Uses SI units (1 KB = 1000 B). func ParseSizeString(s string) (uint64, error) { s = strings.TrimSpace(s) if s == "" { return 0, fmt.Errorf("empty size string") } s = strings.ToUpper(s) i := 0 for i < len(s) && (s[i] == '.' || (s[i] >= '0' && s[i] <= '9')) { i++ } if i == 0 { return 0, fmt.Errorf("no numeric value in size string: %q", s) } numStr := s[:i] suffix := strings.TrimSpace(s[i:]) var num float64 if _, err := fmt.Sscanf(numStr, "%f", &num); err != nil { return 0, fmt.Errorf("invalid numeric value %q: %w", numStr, err) } if num < 0 { return 0, fmt.Errorf("negative size: %q", s) } multiplier := uint64(1) switch suffix { case "", "B": multiplier = 1 case "K", "KB": multiplier = 1000 case "M", "MB": multiplier = 1000 * 1000 case "G", "GB": multiplier = 1000 * 1000 * 1000 case "T", "TB": multiplier = 1000 * 1000 * 1000 * 1000 case "P", "PB": multiplier = 1000 * 1000 * 1000 * 1000 * 1000 default: return 0, fmt.Errorf("unknown size suffix: %q", suffix) } return uint64(num * float64(multiplier)), nil } func FormatBytes(n uint64) string { const unit = 1000 if n < unit { return fmt.Sprintf("%d B", n) } div, exp := uint64(unit), 0 for u := n / unit; u >= unit; u /= unit { div *= unit exp++ } return 
fmt.Sprintf("%.1f %cB", float64(n)/float64(div), "KMGTPE"[exp]) } type defaultSizeResolver struct{} func (defaultSizeResolver) ContentLength(ctx context.Context, uri string) (int64, error) { return downloader.URI(uri).ContentLength(ctx) } func DefaultSizeResolver() SizeResolver { return defaultSizeResolver{} } func DefaultGGUFReader() GGUFMetadataReader { return defaultGGUFReader{} } // ModelEstimateInput describes the inputs for a unified VRAM/size estimation. // The estimator cascades through available data: files -> size string -> HF repo -> zero. type ModelEstimateInput struct { Files []FileInput // weight files with optional pre-known sizes Size string // gallery hardcoded size (e.g. "14.5GB") HFRepo string // HF repo ID or URL Options EstimateOptions // GPU layers, KV quant bits } // EstimateModelMultiContext provides a unified VRAM estimation entry point // that returns estimates at multiple context sizes. // It tries (in order): // 1. Direct file-based estimation (GGUF metadata or file size heuristic) // 2. ParseSizeString from Size field // 3. HuggingFace repo file listing // 4. Zero result func EstimateModelMultiContext(ctx context.Context, input ModelEstimateInput, contextSizes []uint32) (MultiContextEstimate, error) { if len(contextSizes) == 0 { contextSizes = []uint32{8192} } // 1. Try direct file estimation if len(input.Files) > 0 { result, err := EstimateMultiContext(ctx, input.Files, contextSizes, input.Options, DefaultCachedSizeResolver(), DefaultCachedGGUFReader()) if err != nil { xlog.Debug("VRAM estimation from files failed", "error", err) } if err == nil && result.SizeBytes > 0 { return result, nil } } // 2. 
Try size string if input.Size != "" { if sizeBytes, err := ParseSizeString(input.Size); err != nil { xlog.Debug("VRAM estimation from size string failed", "error", err, "size", input.Size) } else if sizeBytes > 0 { return MultiContextEstimate{ SizeBytes: sizeBytes, SizeDisplay: FormatBytes(sizeBytes), Estimates: buildEstimates(modelProfile{sizeBytes: sizeBytes}, contextSizes, EstimateOptions{}), }, nil } } // 3. Try HF repo hfRepo := input.HFRepo if repoID, ok := ExtractHFRepoID(hfRepo); ok { hfRepo = repoID } if hfRepo != "" { totalBytes, err := hfRepoWeightSize(ctx, hfRepo) if err != nil { xlog.Debug("VRAM estimation from HF repo failed", "error", err, "repo", hfRepo) } if err == nil && totalBytes > 0 { return MultiContextEstimate{ SizeBytes: totalBytes, SizeDisplay: FormatBytes(totalBytes), Estimates: buildEstimates(modelProfile{sizeBytes: totalBytes}, contextSizes, EstimateOptions{}), }, nil } } // 4. No estimation possible return MultiContextEstimate{}, nil }