diff --git a/core/config/gguf.go b/core/config/gguf.go index 0d788dad4..7b23c8ce9 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -38,30 +38,6 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { } } - // vram estimation - vram, err := xsysinfo.TotalAvailableVRAM() - if err != nil { - xlog.Error("guessDefaultsFromFile(TotalAvailableVRAM)", "error", err) - } else if vram > 0 { - estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram) - if err != nil { - xlog.Error("guessDefaultsFromFile(EstimateGGUFVRAMUsage)", "error", err) - } else { - if estimate.IsFullOffload { - xlog.Warn("guessDefaultsFromFile: full offload is recommended") - } - - if estimate.EstimatedVRAM > vram { - xlog.Warn("guessDefaultsFromFile: estimated VRAM usage is greater than available VRAM") - } - - if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 { - xlog.Debug("guessDefaultsFromFile: layers estimated", "layers", estimate.EstimatedLayers) - cfg.NGPULayers = &estimate.EstimatedLayers - } - } - } - if cfg.NGPULayers == nil { // we assume we want to offload all layers defaultHigh := defaultNGPULayers diff --git a/pkg/xsysinfo/gguf.go b/pkg/xsysinfo/gguf.go deleted file mode 100644 index 0ea9bca06..000000000 --- a/pkg/xsysinfo/gguf.go +++ /dev/null @@ -1,60 +0,0 @@ -package xsysinfo - -import ( - gguf "github.com/gpustack/gguf-parser-go" -) - -type VRAMEstimate struct { - TotalVRAM uint64 - AvailableVRAM uint64 - ModelSize uint64 - EstimatedLayers int - EstimatedVRAM uint64 - IsFullOffload bool -} - -func EstimateGGUFVRAMUsage(f *gguf.GGUFFile, availableVRAM uint64) (*VRAMEstimate, error) { - // Get model metadata - m := f.Metadata() - - estimate := f.EstimateLLaMACppRun() - - lmes := estimate.SummarizeItem(true, 0, 0) - estimatedVRAM := uint64(0) - availableLayers := lmes.OffloadLayers // TODO: check if we can just use OffloadLayers here - - for _, vram := range lmes.VRAMs { - estimatedVRAM += uint64(vram.NonUMA) - } - - // Calculate base model size - modelSize := uint64(m.Size) - - if availableLayers == 0 { - availableLayers = 1 - } - - if estimatedVRAM == 0 { - estimatedVRAM = 1 - } - - // Estimate number of layers that can fit in VRAM - // Each layer typically requires about 1/32 of the model size - layerSize := estimatedVRAM / availableLayers - - estimatedLayers := int(availableVRAM / layerSize) - if availableVRAM > estimatedVRAM { - estimatedLayers = int(availableLayers) - } - - // Calculate estimated VRAM usage - - return &VRAMEstimate{ - TotalVRAM: availableVRAM, - AvailableVRAM: availableVRAM, - ModelSize: modelSize, - EstimatedLayers: estimatedLayers, - EstimatedVRAM: estimatedVRAM, - IsFullOffload: availableVRAM > estimatedVRAM, - }, nil -}