Files
LocalAI/core/services/modeladmin/vram.go
Richard Palethorpe 969005b2a1 feat(gallery): Speed up load times and clean gallery entries (#9211)
* feat: Rework VRAM estimation and use known_usecases in gallery

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Assisted-by: Claude:claude-opus-4-7[1m] [Claude Code]

* chore(gallery): regenerate gallery index and add known_usecases to model entries

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-05-06 14:51:38 +02:00

132 lines
3.7 KiB
Go

package modeladmin
import (
"context"
"fmt"
"path/filepath"
"strings"
"time"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/system"
"github.com/mudler/LocalAI/pkg/vram"
)
// VRAMRequest is the input for EstimateVRAM. JSON tags let the HTTP
// handler bind directly into this type instead of carrying a parallel
// private struct.
type VRAMRequest struct {
	// Model is the name of the installed model to estimate for (required;
	// EstimateVRAM rejects an empty value).
	Model string `json:"model"`
	// ContextSize overrides the context length used for the estimate.
	// Zero means "use the model config's context size, or 8192 if the
	// config does not specify one".
	ContextSize uint32 `json:"context_size,omitempty"`
	// GPULayers is passed through to vram.EstimateOptions unchanged.
	GPULayers int `json:"gpu_layers,omitempty"`
	// KVQuantBits is passed through to vram.EstimateOptions unchanged.
	KVQuantBits int `json:"kv_quant_bits,omitempty"`
}
// VRAMResponse embeds vram.EstimateResult and adds the context-defaulted
// note fields the HTTP endpoint surfaces.
type VRAMResponse struct {
	vram.EstimateResult
	// ContextNote carries a human-readable caveat: either that the
	// estimate fell back to the default context size, or that no weight
	// files were found to estimate from.
	ContextNote string `json:"context_note,omitempty"`
	// ModelMaxContext is the model's trained maximum context length as
	// reported by the estimator; zero when unknown.
	ModelMaxContext uint64 `json:"model_max_context,omitempty"`
}
// EstimateVRAM computes a VRAM estimate for an installed model. It mirrors
// VRAMEstimateEndpoint without any HTTP coupling.
//
// Weight files are gathered from the model's download list plus its primary
// model and multimodal-projector paths, deduplicated by resolved URI. The
// context length comes from the request, falling back to the model config
// and finally to defaultContextSize. Returns ErrNameRequired for an empty
// model name and ErrNotFound for an unknown model.
func EstimateVRAM(ctx context.Context, req VRAMRequest, cl *config.ModelConfigLoader, sysState *system.SystemState) (*VRAMResponse, error) {
	// Single source of truth for the fallback context length; the note
	// message below interpolates it so the two can never drift apart.
	const defaultContextSize = 8192

	if req.Model == "" {
		return nil, ErrNameRequired
	}
	cfg, exists := cl.GetModelConfig(req.Model)
	if !exists {
		return nil, ErrNotFound
	}
	modelsPath := sysState.Model.ModelsPath

	// Collect unique weight files to feed the estimator.
	var files []vram.FileInput
	seen := make(map[string]bool)
	for _, f := range cfg.DownloadFiles {
		addWeightFile(string(f.URI), modelsPath, &files, seen)
	}
	if cfg.Model != "" {
		addWeightFile(cfg.Model, modelsPath, &files, seen)
	}
	if cfg.MMProj != "" {
		addWeightFile(cfg.MMProj, modelsPath, &files, seen)
	}
	if len(files) == 0 {
		// No weight files: the caller (HTTP or MCP) reports this as a
		// non-error empty estimate. Returning a typed nil here lets both
		// layers format the message consistently.
		return &VRAMResponse{ContextNote: "no weight files found for estimation"}, nil
	}

	contextDefaulted := false
	ctxLen := req.ContextSize
	if ctxLen == 0 {
		// Only trust a positive configured size: zero would be used
		// verbatim and a negative value would wrap around through the
		// uint32 conversion, so both fall back to the default instead.
		if cfg.ContextSize != nil && *cfg.ContextSize > 0 {
			ctxLen = uint32(*cfg.ContextSize)
		} else {
			ctxLen = defaultContextSize
			contextDefaulted = true
		}
	}

	opts := vram.EstimateOptions{
		GPULayers:   req.GPULayers,
		KVQuantBits: req.KVQuantBits,
	}
	// Bound the estimation work (file stats / GGUF header reads) so a slow
	// resolver cannot hang the caller.
	subCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	multi, err := vram.EstimateMultiContext(subCtx, files, []uint32{ctxLen}, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
	if err != nil {
		return nil, fmt.Errorf("vram estimate: %w", err)
	}
	// Estimates is keyed by the decimal string of the context length.
	at := multi.Estimates[fmt.Sprint(ctxLen)]
	resp := &VRAMResponse{
		EstimateResult: vram.EstimateResult{
			SizeBytes:     multi.SizeBytes,
			SizeDisplay:   multi.SizeDisplay,
			ContextLength: at.ContextLength,
			VRAMBytes:     at.VRAMBytes,
			VRAMDisplay:   at.VRAMDisplay,
		},
		ModelMaxContext: multi.ModelMaxContext,
	}
	if contextDefaulted && multi.ModelMaxContext > 0 {
		resp.ContextNote = fmt.Sprintf(
			"Estimate used default context_size=%d. The model's trained maximum context is %d; VRAM usage will be higher at larger context sizes.",
			defaultContextSize,
			multi.ModelMaxContext,
		)
	}
	return resp, nil
}
// resolveModelURI converts a local model path to a file:// URI so the
// size resolver can stat it on disk. URIs that already have a scheme are
// returned unchanged, and absolute paths are used as-is — previously an
// absolute path was incorrectly re-rooted under modelsPath by the Join.
func resolveModelURI(uri, modelsPath string) string {
	if strings.Contains(uri, "://") {
		// Already a URI (http://, https://, file://, ...).
		return uri
	}
	if filepath.IsAbs(uri) {
		// Absolute local path: must not be joined onto modelsPath.
		return "file://" + uri
	}
	return "file://" + filepath.Join(modelsPath, uri)
}
// addWeightFile appends a weight file to files with its URI resolved
// against modelsPath. Non-weight URIs are ignored, and duplicates (after
// resolution) are skipped via the seen set.
func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, seen map[string]bool) {
	// Only GGUF-style weight files are relevant to the estimator.
	if !vram.IsWeightFile(uri) {
		return
	}
	key := resolveModelURI(uri, modelsPath)
	if !seen[key] {
		seen[key] = true
		*files = append(*files, vram.FileInput{URI: key, Size: 0})
	}
}