fix: drop gguf VRAM estimation (now redundant) (#8325)

fix: drop gguf VRAM estimation

Cleanup: this is now handled directly in llama.cpp, so there is no need to estimate it from Go.

VRAM estimation in general is tricky, but llama.cpp ( 41ea26144e/src/llama.cpp (L168) ) has recently added automatic "fitting" of models to the available VRAM. Since we already enable that fitting, we can drop the backend-specific GGUF VRAM estimation from our code instead of trying to guess from Go; it is enabled here:

 397f7f0862/backend/cpp/llama-cpp/grpc-server.cpp (L393)
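
To illustrate what remains on the Go side, a minimal sketch (not the actual LocalAI code: ModelConfig, defaultNGPULayers and the helper name are simplified from the first diff below, and the concrete default value is an assumption) of the only GPU-layer defaulting left, "offload everything unless the user pinned a value":

package config

// defaultNGPULayers is assumed to be a large sentinel meaning "offload all layers".
const defaultNGPULayers = 99999999

type ModelConfig struct {
	NGPULayers *int
}

// With the estimator gone, we only default to full offload when the user did
// not pin gpu_layers; llama.cpp's automatic fitting then decides how many
// layers actually end up in VRAM at load time.
func applyDefaultGPULayers(cfg *ModelConfig) {
	if cfg.NGPULayers == nil {
		defaultHigh := defaultNGPULayers
		cfg.NGPULayers = &defaultHigh
	}
}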

Fixes: https://github.com/mudler/LocalAI/issues/8302
See: https://github.com/mudler/LocalAI/issues/8302#issuecomment-3830773472
Ettore Di Giacinto
2026-02-01 17:33:28 +01:00
committed by GitHub
parent b6459ddd57
commit 800f749c7b
2 changed files with 0 additions and 84 deletions


@@ -38,30 +38,6 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) {
		}
	}

	// vram estimation
	vram, err := xsysinfo.TotalAvailableVRAM()
	if err != nil {
		xlog.Error("guessDefaultsFromFile(TotalAvailableVRAM)", "error", err)
	} else if vram > 0 {
		estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
		if err != nil {
			xlog.Error("guessDefaultsFromFile(EstimateGGUFVRAMUsage)", "error", err)
		} else {
			if estimate.IsFullOffload {
				xlog.Warn("guessDefaultsFromFile: full offload is recommended")
			}
			if estimate.EstimatedVRAM > vram {
				xlog.Warn("guessDefaultsFromFile: estimated VRAM usage is greater than available VRAM")
			}
			if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
				xlog.Debug("guessDefaultsFromFile: layers estimated", "layers", estimate.EstimatedLayers)
				cfg.NGPULayers = &estimate.EstimatedLayers
			}
		}
	}

	if cfg.NGPULayers == nil {
		// we assume we want to offload all layers
		defaultHigh := defaultNGPULayers


@@ -1,60 +0,0 @@
package xsysinfo

import (
	gguf "github.com/gpustack/gguf-parser-go"
)

type VRAMEstimate struct {
	TotalVRAM       uint64
	AvailableVRAM   uint64
	ModelSize       uint64
	EstimatedLayers int
	EstimatedVRAM   uint64
	IsFullOffload   bool
}

func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) {
	// Get model metadata
	m := f.Metadata()
	estimate := f.EstimateLLaMACppRun()

	lmes := estimate.SummarizeItem(true, 0, 0)
	estimatedVRAM := uint64(0)
	availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here

	for _, vram := range lmes.VRAMs {
		estimatedVRAM += uint64(vram.NonUMA)
	}

	// Calculate base model size
	modelSize := uint64(m.Size)

	if availableLayers == 0 {
		availableLayers = 1
	}
	if estimatedVRAM == 0 {
		estimatedVRAM = 1
	}

	// Estimate number of layers that can fit in VRAM
	// Each layer typically requires about 1/32 of the model size
	layerSize := estimatedVRAM / availableLayers
	estimatedLayers := int(availableVRAM / layerSize)
	if availableVRAM > estimatedVRAM {
		estimatedLayers = int(availableLayers)
	}

	// Calculate estimated VRAM usage
	return &VRAMEstimate{
		TotalVRAM:       availableVRAM,
		AvailableVRAM:   availableVRAM,
		ModelSize:       modelSize,
		EstimatedLayers: estimatedLayers,
		EstimatedVRAM:   estimatedVRAM,
		IsFullOffload:   availableVRAM > estimatedVRAM,
	}, nil
}
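
For context, the removed heuristic boiled down to a per-layer average of the full-offload VRAM estimate. A small standalone sketch with made-up numbers (8 GiB needed across 32 offloadable layers, 4 GiB free) shows how EstimatedLayers was derived:

package main

import "fmt"

func main() {
	// Hypothetical inputs: the values below are illustrative, not measured.
	var (
		estimatedVRAM   uint64 = 8 << 30 // VRAM a full offload would need
		availableLayers uint64 = 32      // offloadable layers reported by the parser
		availableVRAM   uint64 = 4 << 30 // VRAM actually free
	)

	layerSize := estimatedVRAM / availableLayers      // ~256 MiB per layer
	estimatedLayers := int(availableVRAM / layerSize) // 16 layers fit
	if availableVRAM > estimatedVRAM {
		estimatedLayers = int(availableLayers) // enough room: full offload
	}

	fmt.Println("estimated layers:", estimatedLayers) // prints 16
}

With llama.cpp fitting the model to VRAM itself, this rough per-layer average is no longer needed.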