feat(gallery): Speed up load times and clean gallery entries (#9211)

* feat: Rework VRAM estimation and use known_usecases in gallery

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Assisted-by: Claude:claude-opus-4-7 [Claude Code]

* chore(gallery): regenerate gallery index and add known_usecases to model entries

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Richard Palethorpe
2026-05-06 13:51:38 +01:00
committed by GitHub
parent 6d56bf98fe
commit 969005b2a1
47 changed files with 17089 additions and 5345 deletions

View File

@@ -17,6 +17,7 @@ import (
"github.com/mudler/LocalAI/core/services/jobs"
"github.com/mudler/LocalAI/core/services/nodes"
"github.com/mudler/LocalAI/core/services/storage"
"github.com/mudler/LocalAI/pkg/vram"
coreStartup "github.com/mudler/LocalAI/core/startup"
"github.com/mudler/LocalAI/internal"
@@ -251,6 +252,10 @@ func New(opts ...config.AppOption) (*Application, error) {
go uc.Run(options.Context)
}
// Wire gallery generation counter into VRAM caches so they invalidate
// when gallery data refreshes instead of using a fixed TTL.
vram.SetGalleryGenerationFunc(gallery.GalleryGeneration)
if options.ConfigFile != "" {
if err := application.ModelConfigLoader().LoadMultipleModelConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil {
xlog.Error("error loading config file", "error", err)

View File

@@ -0,0 +1,480 @@
package config
import (
"slices"
"strings"
)
// Usecase name constants — the canonical string values used in gallery entries,
// model configs (known_usecases), and UsecaseInfoMap keys.
const (
UsecaseChat = "chat"
UsecaseCompletion = "completion"
UsecaseEdit = "edit"
UsecaseVision = "vision"
UsecaseEmbeddings = "embeddings"
UsecaseTokenize = "tokenize"
UsecaseImage = "image"
UsecaseVideo = "video"
UsecaseTranscript = "transcript"
UsecaseTTS = "tts"
UsecaseSoundGeneration = "sound_generation"
UsecaseRerank = "rerank"
UsecaseDetection = "detection"
UsecaseVAD = "vad"
UsecaseAudioTransform = "audio_transform"
UsecaseDiarization = "diarization"
)
// GRPCMethod identifies a Backend service RPC from backend.proto.
type GRPCMethod string
const (
MethodPredict GRPCMethod = "Predict"
MethodPredictStream GRPCMethod = "PredictStream"
MethodEmbedding GRPCMethod = "Embedding"
MethodGenerateImage GRPCMethod = "GenerateImage"
MethodGenerateVideo GRPCMethod = "GenerateVideo"
MethodAudioTranscription GRPCMethod = "AudioTranscription"
MethodTTS GRPCMethod = "TTS"
MethodTTSStream GRPCMethod = "TTSStream"
MethodSoundGeneration GRPCMethod = "SoundGeneration"
MethodTokenizeString GRPCMethod = "TokenizeString"
MethodDetect GRPCMethod = "Detect"
MethodRerank GRPCMethod = "Rerank"
MethodVAD GRPCMethod = "VAD"
MethodAudioTransform GRPCMethod = "AudioTransform"
MethodDiarize GRPCMethod = "Diarize"
)
// UsecaseInfo describes a single known_usecase value and how it maps
// to the gRPC backend API.
type UsecaseInfo struct {
// Flag is the ModelConfigUsecase bitmask value.
Flag ModelConfigUsecase
// GRPCMethod is the primary Backend service RPC this usecase maps to.
GRPCMethod GRPCMethod
// IsModifier is true when this usecase doesn't map to its own gRPC RPC
// but modifies how another RPC behaves (e.g., vision uses Predict with images).
IsModifier bool
// DependsOn names the usecase(s) this modifier requires (e.g., "chat").
DependsOn string
// Description is a human/LLM-readable explanation of what this usecase means.
Description string
}
// UsecaseInfoMap maps each known_usecase string to its gRPC and semantic info.
var UsecaseInfoMap = map[string]UsecaseInfo{
UsecaseChat: {
Flag: FLAG_CHAT,
GRPCMethod: MethodPredict,
Description: "Conversational/instruction-following via the Predict RPC with chat templates.",
},
UsecaseCompletion: {
Flag: FLAG_COMPLETION,
GRPCMethod: MethodPredict,
Description: "Text completion via the Predict RPC with a completion template.",
},
UsecaseEdit: {
Flag: FLAG_EDIT,
GRPCMethod: MethodPredict,
Description: "Text editing via the Predict RPC with an edit template.",
},
UsecaseVision: {
Flag: FLAG_VISION,
GRPCMethod: MethodPredict,
IsModifier: true,
DependsOn: UsecaseChat,
Description: "The model accepts images alongside text in the Predict RPC. For llama-cpp this requires an mmproj file.",
},
UsecaseEmbeddings: {
Flag: FLAG_EMBEDDINGS,
GRPCMethod: MethodEmbedding,
Description: "Vector embedding generation via the Embedding RPC.",
},
UsecaseTokenize: {
Flag: FLAG_TOKENIZE,
GRPCMethod: MethodTokenizeString,
Description: "Tokenization via the TokenizeString RPC without running inference.",
},
UsecaseImage: {
Flag: FLAG_IMAGE,
GRPCMethod: MethodGenerateImage,
Description: "Image generation via the GenerateImage RPC (Stable Diffusion, Flux, etc.).",
},
UsecaseVideo: {
Flag: FLAG_VIDEO,
GRPCMethod: MethodGenerateVideo,
Description: "Video generation via the GenerateVideo RPC.",
},
UsecaseTranscript: {
Flag: FLAG_TRANSCRIPT,
GRPCMethod: MethodAudioTranscription,
Description: "Speech-to-text via the AudioTranscription RPC.",
},
UsecaseTTS: {
Flag: FLAG_TTS,
GRPCMethod: MethodTTS,
Description: "Text-to-speech via the TTS RPC.",
},
UsecaseSoundGeneration: {
Flag: FLAG_SOUND_GENERATION,
GRPCMethod: MethodSoundGeneration,
Description: "Music/sound generation via the SoundGeneration RPC (not speech).",
},
UsecaseRerank: {
Flag: FLAG_RERANK,
GRPCMethod: MethodRerank,
Description: "Document reranking via the Rerank RPC.",
},
UsecaseDetection: {
Flag: FLAG_DETECTION,
GRPCMethod: MethodDetect,
Description: "Object detection via the Detect RPC with bounding boxes.",
},
UsecaseVAD: {
Flag: FLAG_VAD,
GRPCMethod: MethodVAD,
Description: "Voice activity detection via the VAD RPC.",
},
UsecaseAudioTransform: {
Flag: FLAG_AUDIO_TRANSFORM,
GRPCMethod: MethodAudioTransform,
Description: "Audio-in / audio-out transformations (echo cancellation, noise suppression, dereverberation, voice conversion) via the AudioTransform RPC.",
},
UsecaseDiarization: {
Flag: FLAG_DIARIZATION,
GRPCMethod: MethodDiarize,
Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
},
}
// BackendCapability describes which gRPC methods and usecases a backend supports.
// Derived from reviewing actual implementations in backend/go/ and backend/python/.
type BackendCapability struct {
// GRPCMethods lists the Backend service RPCs this backend implements.
GRPCMethods []GRPCMethod
// PossibleUsecases lists all usecase strings this backend can support.
PossibleUsecases []string
// DefaultUsecases lists the conservative safe defaults.
DefaultUsecases []string
// AcceptsImages indicates multimodal image input in Predict.
AcceptsImages bool
// AcceptsVideos indicates multimodal video input in Predict.
AcceptsVideos bool
// AcceptsAudios indicates multimodal audio input in Predict.
AcceptsAudios bool
// Description is a human-readable summary of the backend.
Description string
}
// BackendCapabilities maps each backend name (as used in model configs and gallery
// entries) to its verified capabilities. This is the single source of truth for
// what each backend supports.
//
// Backend names use hyphens (e.g., "llama-cpp") matching the gallery convention.
// Use NormalizeBackendName() for names with dots (e.g., "llama.cpp").
var BackendCapabilities = map[string]BackendCapability{
// --- LLM / text generation backends ---
"llama-cpp": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding, MethodTokenizeString},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEdit, UsecaseEmbeddings, UsecaseTokenize, UsecaseVision},
DefaultUsecases: []string{UsecaseChat},
AcceptsImages: true, // requires mmproj
Description: "llama.cpp GGUF models — LLM inference with optional vision via mmproj",
},
"vllm": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings, UsecaseVision},
DefaultUsecases: []string{UsecaseChat},
AcceptsImages: true,
AcceptsVideos: true,
Description: "vLLM engine — high-throughput LLM serving with optional multimodal",
},
"vllm-omni": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodGenerateImage, MethodGenerateVideo, MethodTTS},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseImage, UsecaseVideo, UsecaseTTS, UsecaseVision},
DefaultUsecases: []string{UsecaseChat},
AcceptsImages: true,
AcceptsVideos: true,
AcceptsAudios: true,
Description: "vLLM omni-modal — supports text, image, video generation and TTS",
},
"transformers": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding, MethodTTS, MethodSoundGeneration},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings, UsecaseTTS, UsecaseSoundGeneration},
DefaultUsecases: []string{UsecaseChat},
Description: "HuggingFace transformers — general-purpose Python inference",
},
"mlx": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings},
DefaultUsecases: []string{UsecaseChat},
Description: "Apple MLX framework — optimized for Apple Silicon",
},
"mlx-distributed": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings},
DefaultUsecases: []string{UsecaseChat},
Description: "MLX distributed inference across multiple Apple Silicon devices",
},
"mlx-vlm": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings, UsecaseVision},
DefaultUsecases: []string{UsecaseChat, UsecaseVision},
AcceptsImages: true,
AcceptsAudios: true,
Description: "MLX vision-language models with multimodal input",
},
"mlx-audio": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodTTS},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseTTS},
DefaultUsecases: []string{UsecaseChat},
Description: "MLX audio models — text generation and TTS",
},
// --- Image/video generation backends ---
"diffusers": {
GRPCMethods: []GRPCMethod{MethodGenerateImage, MethodGenerateVideo},
PossibleUsecases: []string{UsecaseImage, UsecaseVideo},
DefaultUsecases: []string{UsecaseImage},
Description: "HuggingFace diffusers — Stable Diffusion, Flux, video generation",
},
"stablediffusion": {
GRPCMethods: []GRPCMethod{MethodGenerateImage},
PossibleUsecases: []string{UsecaseImage},
DefaultUsecases: []string{UsecaseImage},
Description: "Stable Diffusion native backend",
},
"stablediffusion-ggml": {
GRPCMethods: []GRPCMethod{MethodGenerateImage},
PossibleUsecases: []string{UsecaseImage},
DefaultUsecases: []string{UsecaseImage},
Description: "Stable Diffusion via GGML quantized models",
},
// --- Speech-to-text backends ---
"whisper": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription, MethodVAD},
PossibleUsecases: []string{UsecaseTranscript, UsecaseVAD},
DefaultUsecases: []string{UsecaseTranscript},
Description: "OpenAI Whisper — speech recognition and voice activity detection",
},
"faster-whisper": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "CTranslate2-accelerated Whisper for faster transcription",
},
"whisperx": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "WhisperX — Whisper with word-level timestamps and speaker diarization",
},
"moonshine": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "Moonshine speech recognition",
},
"nemo": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "NVIDIA NeMo speech recognition",
},
"qwen-asr": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "Qwen automatic speech recognition",
},
"voxtral": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "Voxtral speech recognition",
},
"vibevoice": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription, MethodTTS},
PossibleUsecases: []string{UsecaseTranscript, UsecaseTTS},
DefaultUsecases: []string{UsecaseTranscript, UsecaseTTS},
Description: "VibeVoice — bidirectional speech (transcription and synthesis)",
},
// --- TTS backends ---
"piper": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Piper — fast neural TTS optimized for Raspberry Pi",
},
"kokoro": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Kokoro TTS",
},
"coqui": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Coqui TTS — multi-speaker neural synthesis",
},
"kitten-tts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Kitten TTS",
},
"outetts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "OuteTTS",
},
"pocket-tts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Pocket TTS — lightweight text-to-speech",
},
"qwen-tts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Qwen TTS",
},
"faster-qwen3-tts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Faster Qwen3 TTS — accelerated Qwen TTS",
},
"fish-speech": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Fish Speech TTS",
},
"neutts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "NeuTTS — neural text-to-speech",
},
"chatterbox": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Chatterbox TTS",
},
"voxcpm": {
GRPCMethods: []GRPCMethod{MethodTTS, MethodTTSStream},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "VoxCPM TTS with streaming support",
},
// --- Sound generation backends ---
"ace-step": {
GRPCMethods: []GRPCMethod{MethodTTS, MethodSoundGeneration},
PossibleUsecases: []string{UsecaseTTS, UsecaseSoundGeneration},
DefaultUsecases: []string{UsecaseSoundGeneration},
Description: "ACE-Step — music and sound generation",
},
"acestep-cpp": {
GRPCMethods: []GRPCMethod{MethodSoundGeneration},
PossibleUsecases: []string{UsecaseSoundGeneration},
DefaultUsecases: []string{UsecaseSoundGeneration},
Description: "ACE-Step C++ — native sound generation",
},
"transformers-musicgen": {
GRPCMethods: []GRPCMethod{MethodTTS, MethodSoundGeneration},
PossibleUsecases: []string{UsecaseTTS, UsecaseSoundGeneration},
DefaultUsecases: []string{UsecaseSoundGeneration},
Description: "Meta MusicGen via transformers — music generation from text",
},
// --- Audio transform backends ---
"localvqe": {
GRPCMethods: []GRPCMethod{MethodAudioTransform},
PossibleUsecases: []string{UsecaseAudioTransform},
DefaultUsecases: []string{UsecaseAudioTransform},
Description: "LocalVQE — joint AEC, noise suppression, and dereverberation for 16 kHz mono speech",
},
// --- Utility backends ---
"rerankers": {
GRPCMethods: []GRPCMethod{MethodRerank},
PossibleUsecases: []string{UsecaseRerank},
DefaultUsecases: []string{UsecaseRerank},
Description: "Cross-encoder reranking models",
},
"rfdetr": {
GRPCMethods: []GRPCMethod{MethodDetect},
PossibleUsecases: []string{UsecaseDetection},
DefaultUsecases: []string{UsecaseDetection},
Description: "RF-DETR object detection",
},
"silero-vad": {
GRPCMethods: []GRPCMethod{MethodVAD},
PossibleUsecases: []string{UsecaseVAD},
DefaultUsecases: []string{UsecaseVAD},
Description: "Silero VAD — voice activity detection",
},
}
// NormalizeBackendName converts backend names to the canonical hyphenated form
// used in gallery entries (e.g., "llama.cpp" → "llama-cpp").
func NormalizeBackendName(backend string) string {
return strings.ReplaceAll(backend, ".", "-")
}
// GetBackendCapability returns the capability info for a backend, or nil if unknown.
// Handles backend name normalization.
func GetBackendCapability(backend string) *BackendCapability {
if cap, ok := BackendCapabilities[NormalizeBackendName(backend)]; ok {
return &cap
}
return nil
}
// PossibleUsecasesForBackend returns all usecases a backend can support.
// Returns nil if the backend is unknown.
func PossibleUsecasesForBackend(backend string) []string {
if cap := GetBackendCapability(backend); cap != nil {
return cap.PossibleUsecases
}
return nil
}
// DefaultUsecasesForBackendCap returns the conservative default usecases for a backend.
// Returns nil if the backend is unknown.
func DefaultUsecasesForBackendCap(backend string) []string {
if cap := GetBackendCapability(backend); cap != nil {
return cap.DefaultUsecases
}
return nil
}
// IsValidUsecaseForBackend checks whether a usecase is in a backend's possible set.
// Returns true for unknown backends (permissive fallback).
func IsValidUsecaseForBackend(backend, usecase string) bool {
cap := GetBackendCapability(backend)
if cap == nil {
return true // unknown backend — don't restrict
}
return slices.Contains(cap.PossibleUsecases, usecase)
}
// AllBackendNames returns a sorted list of all known backend names.
func AllBackendNames() []string {
names := make([]string, 0, len(BackendCapabilities))
for name := range BackendCapabilities {
names = append(names, name)
}
slices.Sort(names)
return names
}
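
A hedged usage sketch of the helpers above (hypothetical main, not part of this diff; assumes the core/config import path):

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/core/config"
)

func main() {
	// Dotted names resolve through NormalizeBackendName.
	if capInfo := config.GetBackendCapability("llama.cpp"); capInfo != nil {
		fmt.Println(capInfo.DefaultUsecases) // [chat]
	}
	// Declared usecases validate; out-of-set ones do not.
	fmt.Println(config.IsValidUsecaseForBackend("piper", config.UsecaseTTS))  // true
	fmt.Println(config.IsValidUsecaseForBackend("piper", config.UsecaseChat)) // false
	// Unknown backends are deliberately permissive.
	fmt.Println(config.IsValidUsecaseForBackend("some-new-backend", "anything")) // true
}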

View File

@@ -0,0 +1,95 @@
package config
import (
"slices"
"strings"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
var _ = Describe("BackendCapabilities", func() {
It("every backend declares possible/default usecases and gRPC methods", func() {
for name, cap := range BackendCapabilities {
Expect(cap.PossibleUsecases).NotTo(BeEmpty(), "backend %q has no possible usecases", name)
Expect(cap.DefaultUsecases).NotTo(BeEmpty(), "backend %q has no default usecases", name)
Expect(cap.GRPCMethods).NotTo(BeEmpty(), "backend %q has no gRPC methods", name)
}
})
It("default usecases are a subset of possible usecases", func() {
for name, cap := range BackendCapabilities {
for _, d := range cap.DefaultUsecases {
Expect(cap.PossibleUsecases).To(ContainElement(d), "backend %q: default %q not in possible %v", name, d, cap.PossibleUsecases)
}
}
})
It("every backend's possible usecases map to a known FLAG_*", func() {
allFlags := GetAllModelConfigUsecases()
for name, cap := range BackendCapabilities {
for _, u := range cap.PossibleUsecases {
info, ok := UsecaseInfoMap[u]
Expect(ok).To(BeTrue(), "backend %q: usecase %q not in UsecaseInfoMap", name, u)
flagName := "FLAG_" + strings.ToUpper(u)
if _, ok := allFlags[flagName]; ok {
continue
}
// Some usecase names don't transform exactly to FLAG_<UPPER>; fall back to flag value lookup.
found := false
for _, flag := range allFlags {
if flag == info.Flag {
found = true
break
}
}
Expect(found).To(BeTrue(), "backend %q: usecase %q flag %d not in GetAllModelConfigUsecases", name, u, info.Flag)
}
}
})
It("every UsecaseInfoMap entry has a non-zero flag and a gRPC method", func() {
for name, info := range UsecaseInfoMap {
Expect(info.Flag).NotTo(Equal(FLAG_ANY), "usecase %q has FLAG_ANY (zero) — should have a real flag", name)
Expect(info.GRPCMethod).NotTo(BeEmpty(), "usecase %q has no gRPC method", name)
}
})
})
var _ = Describe("GetBackendCapability", func() {
It("returns the capability for a known backend", func() {
cap := GetBackendCapability("llama-cpp")
Expect(cap).NotTo(BeNil())
Expect(cap.PossibleUsecases).To(ContainElement("chat"))
})
It("normalizes hyphenated names so llama.cpp resolves to llama-cpp", func() {
Expect(GetBackendCapability("llama.cpp")).NotTo(BeNil())
})
It("returns nil for unknown backends", func() {
Expect(GetBackendCapability("nonexistent")).To(BeNil())
})
})
var _ = Describe("IsValidUsecaseForBackend", func() {
It("accepts a backend's declared usecases", func() {
Expect(IsValidUsecaseForBackend("piper", "tts")).To(BeTrue())
})
It("rejects usecases outside a backend's possible set", func() {
Expect(IsValidUsecaseForBackend("piper", "chat")).To(BeFalse())
})
It("is permissive for unknown backends", func() {
Expect(IsValidUsecaseForBackend("unknown", "anything")).To(BeTrue())
})
})
var _ = Describe("AllBackendNames", func() {
It("returns 30+ backends in sorted order", func() {
names := AllBackendNames()
Expect(len(names)).To(BeNumerically(">=", 30))
Expect(slices.IsSorted(names)).To(BeTrue())
})
})

View File

@@ -630,16 +630,45 @@ const (
FLAG_TOKENIZE ModelConfigUsecase = 0b001000000000
FLAG_VAD ModelConfigUsecase = 0b010000000000
FLAG_VIDEO ModelConfigUsecase = 0b100000000000
FLAG_DETECTION ModelConfigUsecase = 0b1000000000000
FLAG_FACE_RECOGNITION ModelConfigUsecase = 0b10000000000000
FLAG_SPEAKER_RECOGNITION ModelConfigUsecase = 0b100000000000000
FLAG_AUDIO_TRANSFORM ModelConfigUsecase = 0b1000000000000000
FLAG_DIARIZATION ModelConfigUsecase = 0b10000000000000000
FLAG_DETECTION ModelConfigUsecase = 0b1000000000000
FLAG_VISION ModelConfigUsecase = 0b10000000000000
FLAG_FACE_RECOGNITION ModelConfigUsecase = 0b100000000000000
FLAG_SPEAKER_RECOGNITION ModelConfigUsecase = 0b1000000000000000
FLAG_AUDIO_TRANSFORM ModelConfigUsecase = 0b10000000000000000
FLAG_DIARIZATION ModelConfigUsecase = 0b100000000000000000
// Common Subsets
FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
)
// ModalityGroups defines groups of usecases that belong to the same modality.
// Flags within the same group are NOT orthogonal (e.g., chat and completion are
// both text/language). A model is multimodal when its usecases span 2+ groups.
var ModalityGroups = []ModelConfigUsecase{
FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT, // text/language
FLAG_VISION | FLAG_DETECTION, // visual understanding
FLAG_TRANSCRIPT, // speech input
FLAG_TTS | FLAG_SOUND_GENERATION, // audio output
FLAG_AUDIO_TRANSFORM, // audio in/out transforms
FLAG_IMAGE | FLAG_VIDEO, // visual generation
}
// IsMultimodal returns true if the given usecases span two or more orthogonal
// modality groups. For example chat+vision is multimodal, but chat+completion
// is not (both belong to the text/language group).
func IsMultimodal(usecases ModelConfigUsecase) bool {
groupCount := 0
for _, group := range ModalityGroups {
if usecases&group != 0 {
groupCount++
if groupCount >= 2 {
return true
}
}
}
return false
}
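
An illustrative example test for the group logic (hypothetical, would live in a _test.go file in this package alongside an fmt import):

func ExampleIsMultimodal() {
	// chat and completion share the text/language group: one group, not multimodal.
	fmt.Println(IsMultimodal(FLAG_CHAT | FLAG_COMPLETION))
	// chat plus vision spans text/language and visual understanding: multimodal.
	fmt.Println(IsMultimodal(FLAG_CHAT | FLAG_VISION))
	// Output:
	// false
	// true
}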
func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
return map[string]ModelConfigUsecase{
// Note: FLAG_ANY is intentionally excluded from this map
@@ -657,7 +686,8 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
"FLAG_VAD": FLAG_VAD,
"FLAG_LLM": FLAG_LLM,
"FLAG_VIDEO": FLAG_VIDEO,
"FLAG_DETECTION": FLAG_DETECTION,
"FLAG_DETECTION": FLAG_DETECTION,
"FLAG_VISION": FLAG_VISION,
"FLAG_FACE_RECOGNITION": FLAG_FACE_RECOGNITION,
"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
"FLAG_AUDIO_TRANSFORM": FLAG_AUDIO_TRANSFORM,

View File

@@ -7,6 +7,8 @@ import (
"path/filepath"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/lithammer/fuzzysearch/fuzzy"
@@ -92,6 +94,34 @@ func (gm GalleryElements[T]) Search(term string) GalleryElements[T] {
return filteredModels
}
// FilterGalleryModelsByUsecase returns models whose known_usecases include all
// the bits set in usecase. For example, passing FLAG_CHAT matches any model
// with the chat usecase; passing FLAG_CHAT|FLAG_VISION matches only models
// that have both.
func FilterGalleryModelsByUsecase(models GalleryElements[*GalleryModel], usecase config.ModelConfigUsecase) GalleryElements[*GalleryModel] {
var filtered GalleryElements[*GalleryModel]
for _, m := range models {
u := m.GetKnownUsecases()
if u != nil && (*u&usecase) == usecase {
filtered = append(filtered, m)
}
}
return filtered
}
// FilterGalleryModelsByMultimodal returns models whose known_usecases span two
// or more orthogonal modality groups (e.g. chat+vision, tts+transcript).
func FilterGalleryModelsByMultimodal(models GalleryElements[*GalleryModel]) GalleryElements[*GalleryModel] {
var filtered GalleryElements[*GalleryModel]
for _, m := range models {
u := m.GetKnownUsecases()
if u != nil && config.IsMultimodal(*u) {
filtered = append(filtered, m)
}
}
return filtered
}
func (gm GalleryElements[T]) FilterByTag(tag string) GalleryElements[T] {
var filtered GalleryElements[T]
for _, m := range gm {
@@ -267,6 +297,77 @@ func AvailableGalleryModels(galleries []config.Gallery, systemState *system.Syst
return models, nil
}
var (
availableModelsMu sync.RWMutex
availableModelsCache GalleryElements[*GalleryModel]
refreshing atomic.Bool
galleryGeneration atomic.Uint64
)
// GalleryGeneration returns a counter that increments each time the gallery
// model list is refreshed from upstream. VRAM estimation caches use this to
// invalidate entries when the gallery data changes.
func GalleryGeneration() uint64 { return galleryGeneration.Load() }
// AvailableGalleryModelsCached returns gallery models from an in-memory cache.
// Local-only fields (installed status) are refreshed on every call. A background
// goroutine is triggered to re-fetch the full model list (including network
// calls) so subsequent requests pick up changes without blocking the caller.
// The first call with an empty cache blocks until the initial load completes.
func AvailableGalleryModelsCached(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryModel], error) {
availableModelsMu.RLock()
cached := availableModelsCache
availableModelsMu.RUnlock()
if cached != nil {
// Refresh installed status under write lock to avoid races with
// concurrent readers and the background refresh goroutine.
availableModelsMu.Lock()
for _, m := range cached {
_, err := os.Stat(filepath.Join(systemState.Model.ModelsPath, fmt.Sprintf("%s.yaml", m.GetName())))
m.SetInstalled(err == nil)
}
availableModelsMu.Unlock()
// Trigger a background refresh if one is not already running.
triggerGalleryRefresh(galleries, systemState)
return cached, nil
}
// No cache yet — must do a blocking load.
models, err := AvailableGalleryModels(galleries, systemState)
if err != nil {
return nil, err
}
availableModelsMu.Lock()
availableModelsCache = models
galleryGeneration.Add(1)
availableModelsMu.Unlock()
return models, nil
}
// triggerGalleryRefresh starts a background goroutine that refreshes the
// gallery model cache. Only one refresh runs at a time; concurrent calls
// are no-ops.
func triggerGalleryRefresh(galleries []config.Gallery, systemState *system.SystemState) {
if !refreshing.CompareAndSwap(false, true) {
return
}
go func() {
defer refreshing.Store(false)
models, err := AvailableGalleryModels(galleries, systemState)
if err != nil {
xlog.Error("background gallery refresh failed", "error", err)
return
}
availableModelsMu.Lock()
availableModelsCache = models
galleryGeneration.Add(1)
availableModelsMu.Unlock()
}()
}
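
A caller-side sketch (refreshAwareHandler is a hypothetical in-package function, shown only to illustrate the intended access pattern):

func refreshAwareHandler(galleries []config.Gallery, systemState *system.SystemState) error {
	// First call with a cold cache blocks on the full fetch; later calls
	// return immediately and schedule at most one background refresh.
	models, err := AvailableGalleryModelsCached(galleries, systemState)
	if err != nil {
		return err
	}
	// Key any derived data on the generation counter and rebuild it lazily
	// when the counter advances after a background refresh completes.
	_ = GalleryGeneration()
	_ = models
	return nil
}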
// List available backends
func AvailableBackends(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryBackend], error) {
return availableBackendsWithFilter(galleries, systemState, true)

View File

@@ -581,4 +581,42 @@ var _ = Describe("Gallery", func() {
Expect(mergedParams["model"]).To(Equal("nanbeige4.1-3b-q4_k_m.gguf"))
})
})
Describe("GetKnownUsecases", func() {
It("uses explicit known_usecases from overrides when present", func() {
m := &GalleryModel{
Metadata: Metadata{Backend: "stablediffusion-ggml"},
Overrides: map[string]any{
"known_usecases": []any{"chat"},
},
}
u := m.GetKnownUsecases()
Expect(u).NotTo(BeNil())
// Override wins over the backend's image default.
Expect(*u & config.FLAG_CHAT).To(Equal(config.FLAG_CHAT))
Expect(*u & config.FLAG_IMAGE).To(Equal(config.ModelConfigUsecase(0)))
})
It("falls back to backend defaults when no override is set", func() {
m := &GalleryModel{Metadata: Metadata{Backend: "stablediffusion-ggml"}}
u := m.GetKnownUsecases()
Expect(u).NotTo(BeNil())
Expect(*u & config.FLAG_IMAGE).To(Equal(config.FLAG_IMAGE))
})
It("returns nil when neither overrides nor a known backend provide usecases", func() {
m := &GalleryModel{}
Expect(m.GetKnownUsecases()).To(BeNil())
})
It("filters models without explicit known_usecases via backend defaults", func() {
models := GalleryElements[*GalleryModel]{
&GalleryModel{Metadata: Metadata{Name: "sd-model", Backend: "stablediffusion-ggml"}},
&GalleryModel{Metadata: Metadata{Name: "whisper-model", Backend: "whisper"}},
}
filtered := FilterGalleryModelsByUsecase(models, config.FLAG_IMAGE)
Expect(filtered).To(HaveLen(1))
Expect(filtered[0].Name).To(Equal("sd-model"))
})
})
})

View File

@@ -97,7 +97,7 @@ func (i *DiffuserImporter) Import(details Details) (gallery.ModelConfig, error)
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"image"},
KnownUsecaseStrings: []string{config.UsecaseImage},
Backend: backend,
PredictionOptions: schema.PredictionOptions{
BasicModelRequest: schema.BasicModelRequest{

View File

@@ -135,7 +135,7 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Options: []string{"use_jinja:true"},
Backend: backend,
TemplateConfig: config.TemplateConfig{

View File

@@ -45,7 +45,7 @@ func ImportLocalPath(dirPath, name string) (*config.ModelConfig, error) {
cfg := &config.ModelConfig{
Name: name,
Backend: "llama-cpp",
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Options: []string{"use_jinja:true"},
}
cfg.Model = relPath(ggufFile)
@@ -104,7 +104,7 @@ func ImportLocalPath(dirPath, name string) (*config.ModelConfig, error) {
cfg := &config.ModelConfig{
Name: name,
Backend: "transformers",
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
}
cfg.Model = baseModel
cfg.TemplateConfig.UseTokenizerTemplate = true
@@ -120,7 +120,7 @@ func ImportLocalPath(dirPath, name string) (*config.ModelConfig, error) {
cfg := &config.ModelConfig{
Name: name,
Backend: "transformers",
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
}
cfg.Model = baseModel
cfg.TemplateConfig.UseTokenizerTemplate = true
@@ -135,7 +135,7 @@ func ImportLocalPath(dirPath, name string) (*config.ModelConfig, error) {
cfg := &config.ModelConfig{
Name: name,
Backend: "transformers",
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
}
cfg.Model = relPath(dirPath)
cfg.TemplateConfig.UseTokenizerTemplate = true

View File

@@ -73,7 +73,7 @@ func (i *MLXImporter) Import(details Details) (gallery.ModelConfig, error) {
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Backend: backend,
PredictionOptions: schema.PredictionOptions{
BasicModelRequest: schema.BasicModelRequest{

View File

@@ -87,7 +87,7 @@ func (i *TransformersImporter) Import(details Details) (gallery.ModelConfig, err
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Backend: backend,
PredictionOptions: schema.PredictionOptions{
BasicModelRequest: schema.BasicModelRequest{

View File

@@ -77,7 +77,7 @@ func (i *VLLMImporter) Import(details Details) (gallery.ModelConfig, error) {
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Backend: backend,
PredictionOptions: schema.PredictionOptions{
BasicModelRequest: schema.BasicModelRequest{

View File

@@ -52,3 +52,39 @@ func (m *GalleryModel) GetTags() []string {
func (m *GalleryModel) GetDescription() string {
return m.Description
}
// GetKnownUsecases returns the usecase flags declared by the gallery entry,
// falling back to the resolved backend's default usecases when the entry has
// none of its own. Returns nil only when neither source provides any.
//
// Why the fallback: many gallery entries omit known_usecases because their
// backend has only one sensible mode (e.g. stablediffusion-ggml is always
// image generation). Without this fallback such models silently disappear
// from usecase-based filtering in the UI.
func (m *GalleryModel) GetKnownUsecases() *config.ModelConfigUsecase {
if strs := overrideUsecaseStrings(m.Overrides); len(strs) > 0 {
return config.GetUsecasesFromYAML(strs)
}
if defaults := config.DefaultUsecasesForBackendCap(m.Backend); len(defaults) > 0 {
return config.GetUsecasesFromYAML(defaults)
}
return nil
}
func overrideUsecaseStrings(overrides map[string]any) []string {
raw, ok := overrides["known_usecases"]
if !ok {
return nil
}
list, ok := raw.([]any)
if !ok {
return nil
}
strs := make([]string, 0, len(list))
for _, v := range list {
if s, ok := v.(string); ok {
strs = append(strs, s)
}
}
return strs
}
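
An illustrative example of the fallback (hypothetical example test in this package, mirroring the specs above; needs fmt and the config import):

func ExampleGalleryModel_GetKnownUsecases() {
	// No known_usecases override, but whisper's DefaultUsecases is [transcript].
	m := &GalleryModel{Metadata: Metadata{Backend: "whisper"}}
	u := m.GetKnownUsecases()
	fmt.Println(u != nil && *u&config.FLAG_TRANSCRIPT != 0)
	// Output:
	// true
}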

View File

@@ -116,13 +116,13 @@ func AutocompleteEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, a
capability := strings.TrimPrefix(provider, "models:")
var filterFn config.ModelConfigFilterFn
switch capability {
case "chat":
case config.UsecaseChat:
filterFn = config.BuildUsecaseFilterFn(config.FLAG_CHAT)
case "tts":
case config.UsecaseTTS:
filterFn = config.BuildUsecaseFilterFn(config.FLAG_TTS)
case "vad":
case config.UsecaseVAD:
filterFn = config.BuildUsecaseFilterFn(config.FLAG_VAD)
case "transcript":
case config.UsecaseTranscript:
filterFn = config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)
default:
filterFn = config.NoFilterFn

View File

@@ -77,18 +77,17 @@ func ImportModelURIEndpoint(cl *config.ModelConfigLoader, appConfig *config.Appl
}
estCtx, cancel := context.WithTimeout(c.Request().Context(), 5*time.Second)
defer cancel()
result, err := vram.EstimateModel(estCtx, vram.ModelEstimateInput{
Files: files,
Options: vram.EstimateOptions{ContextLength: 8192},
})
result, err := vram.EstimateModelMultiContext(estCtx, vram.ModelEstimateInput{
Files: files,
}, []uint32{8192})
if err == nil {
if result.SizeBytes > 0 {
resp.EstimatedSizeBytes = result.SizeBytes
resp.EstimatedSizeDisplay = result.SizeDisplay
}
if result.VRAMBytes > 0 {
resp.EstimatedVRAMBytes = result.VRAMBytes
resp.EstimatedVRAMDisplay = result.VRAMDisplay
if v := result.VRAMForContext(8192); v > 0 {
resp.EstimatedVRAMBytes = v
resp.EstimatedVRAMDisplay = vram.FormatBytes(v)
}
}
}

View File

@@ -9,10 +9,9 @@ import (
)
// VRAMEstimateEndpoint returns a handler that estimates VRAM usage for an
// installed model configuration. For uninstalled models (gallery URLs), use
// the gallery-level estimates in /api/models instead.
// installed model configuration at multiple context sizes.
// @Summary Estimate VRAM usage for a model
// @Description Estimates VRAM based on model weight files, context size, and GPU layers
// @Description Estimates VRAM based on model weight files at multiple context sizes
// @Tags config
// @Accept json
// @Produce json

View File

@@ -121,13 +121,13 @@ var _ = Describe("VRAM Estimate Endpoint", func() {
Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
// The response should have non-zero size and vram estimates.
// JSON numbers unmarshal as float64.
sizeBytes, ok := resp["sizeBytes"].(float64)
Expect(ok).To(BeTrue(), "sizeBytes should be a number, got: %v (response: %s)", resp["sizeBytes"], rec.Body.String())
sizeBytes, ok := resp["size_bytes"].(float64)
Expect(ok).To(BeTrue(), "size_bytes should be a number, got: %v (response: %s)", resp["size_bytes"], rec.Body.String())
Expect(sizeBytes).To(BeNumerically(">", 0))
vramBytes, ok := resp["vramBytes"].(float64)
Expect(ok).To(BeTrue(), "vramBytes should be a number")
vramBytes, ok := resp["vram_bytes"].(float64)
Expect(ok).To(BeTrue(), "vram_bytes should be a number")
Expect(vramBytes).To(BeNumerically(">", 0))
Expect(resp["sizeDisplay"]).NotTo(BeEmpty())
Expect(resp["vramDisplay"]).NotTo(BeEmpty())
Expect(resp["size_display"]).NotTo(BeEmpty())
Expect(resp["vram_display"]).NotTo(BeEmpty())
})
})

View File

@@ -2,13 +2,13 @@ import { test, expect } from '@playwright/test'
const MOCK_MODELS_RESPONSE = {
models: [
{ name: 'llama-model', description: 'A llama model', backend: 'llama-cpp', installed: false, tags: ['llm'] },
{ name: 'whisper-model', description: 'A whisper model', backend: 'whisper', installed: true, tags: ['stt'] },
{ name: 'llama-model', description: 'A llama model', backend: 'llama-cpp', installed: false, tags: ['chat'] },
{ name: 'whisper-model', description: 'A whisper model', backend: 'whisper', installed: true, tags: ['transcript'] },
{ name: 'stablediffusion-model', description: 'An image model', backend: 'stablediffusion', installed: false, tags: ['sd'] },
{ name: 'unknown-model', description: 'No backend', backend: '', installed: false, tags: [] },
],
allBackends: ['llama-cpp', 'stablediffusion', 'whisper'],
allTags: ['llm', 'sd', 'stt'],
allTags: ['chat', 'sd', 'transcript'],
availableModels: 4,
installedModels: 1,
totalPages: 1,
@@ -78,3 +78,121 @@ test.describe('Models Gallery - Backend Features', () => {
await expect(detail.locator('text=llama-cpp')).toBeVisible()
})
})
const BACKEND_USECASES_MOCK = {
'llama-cpp': ['chat', 'embeddings', 'vision'],
'whisper': ['transcript'],
'stablediffusion': ['image'],
}
test.describe('Models Gallery - Multi-select Filters', () => {
test.beforeEach(async ({ page }) => {
await page.route('**/api/models*', (route) => {
route.fulfill({
contentType: 'application/json',
body: JSON.stringify(MOCK_MODELS_RESPONSE),
})
})
await page.route('**/api/backends/usecases', (route) => {
route.fulfill({
contentType: 'application/json',
body: JSON.stringify(BACKEND_USECASES_MOCK),
})
})
await page.goto('/app/models')
await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible({ timeout: 10_000 })
})
test('multi-select toggle: click Chat, TTS, then Chat again', async ({ page }) => {
const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
await chatBtn.click()
await expect(chatBtn).toHaveClass(/active/)
await ttsBtn.click()
await expect(chatBtn).toHaveClass(/active/)
await expect(ttsBtn).toHaveClass(/active/)
// Click Chat again to deselect it
await chatBtn.click()
await expect(chatBtn).not.toHaveClass(/active/)
await expect(ttsBtn).toHaveClass(/active/)
})
test('"All" clears selection', async ({ page }) => {
const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
const allBtn = page.locator('.filter-btn', { hasText: 'All' })
await chatBtn.click()
await expect(chatBtn).toHaveClass(/active/)
await allBtn.click()
await expect(allBtn).toHaveClass(/active/)
await expect(chatBtn).not.toHaveClass(/active/)
})
test('query param sent correctly with multiple filters', async ({ page }) => {
const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
// Click Chat and wait for its request to settle
await chatBtn.click()
await page.waitForResponse(resp => resp.url().includes('/api/models'))
// Now click TTS and capture the resulting request
const [request] = await Promise.all([
page.waitForRequest(req => {
if (!req.url().includes('/api/models')) return false
const u = new URL(req.url())
const tag = u.searchParams.get('tag')
return tag && tag.split(',').length >= 2
}),
ttsBtn.click(),
])
const url = new URL(request.url())
const tags = url.searchParams.get('tag').split(',').sort()
expect(tags).toEqual(['chat', 'tts'])
})
test('backend greys out unavailable filters', async ({ page }) => {
// Select llama-cpp backend via dropdown
await page.locator('button', { hasText: 'All Backends' }).click()
const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
await dropdown.locator('text=llama-cpp').click()
// Wait for filter state to update
const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
const sttBtn = page.locator('.filter-btn', { hasText: 'STT' })
const imageBtn = page.locator('.filter-btn', { hasText: 'Image' })
// TTS, STT, Image should be disabled for llama-cpp
await expect(ttsBtn).toBeDisabled()
await expect(sttBtn).toBeDisabled()
await expect(imageBtn).toBeDisabled()
// Chat, Embeddings, Vision should remain enabled
const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
const embBtn = page.locator('.filter-btn', { hasText: 'Embeddings' })
const visBtn = page.locator('.filter-btn', { hasText: 'Vision' })
await expect(chatBtn).toBeEnabled()
await expect(embBtn).toBeEnabled()
await expect(visBtn).toBeEnabled()
})
test('backend clears incompatible filters', async ({ page }) => {
// Select TTS filter first
const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
await ttsBtn.click()
await expect(ttsBtn).toHaveClass(/active/)
// Now select llama-cpp backend (which doesn't support TTS)
await page.locator('button', { hasText: 'All Backends' }).click()
const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
await dropdown.locator('text=llama-cpp').click()
// TTS should be auto-removed from selection
await expect(ttsBtn).not.toHaveClass(/active/)
})
})

View File

@@ -20,6 +20,7 @@
"vision": "Vision",
"tts": "TTS",
"stt": "STT",
"diarization": "Diarisierung",
"embedding": "Embedding",
"rerank": "Rerank",
"allBackends": "Alle Backends",

View File

@@ -14,14 +14,20 @@
},
"filters": {
"all": "All",
"llm": "LLM",
"llm": "Chat",
"image": "Image",
"video": "Video",
"multimodal": "Multimodal",
"vision": "Vision",
"tts": "TTS",
"stt": "STT",
"embedding": "Embedding",
"diarization": "Diarization",
"soundGen": "Sound",
"audioTransform": "Audio FX",
"embedding": "Embeddings",
"rerank": "Rerank",
"detection": "Detection",
"vad": "VAD",
"allBackends": "All Backends",
"searchBackends": "Search backends..."
},

View File

@@ -20,6 +20,7 @@
"vision": "Visión",
"tts": "TTS",
"stt": "STT",
"diarization": "Diarización",
"embedding": "Embedding",
"rerank": "Rerank",
"allBackends": "Todos los backends",

View File

@@ -20,6 +20,7 @@
"vision": "Visione",
"tts": "TTS",
"stt": "STT",
"diarization": "Diarizzazione",
"embedding": "Embedding",
"rerank": "Rerank",
"allBackends": "Tutti i backend",

View File

@@ -20,6 +20,7 @@
"vision": "视觉",
"tts": "TTS",
"stt": "STT",
"diarization": "说话人分离",
"embedding": "嵌入",
"rerank": "重排",
"allBackends": "所有后端",

View File

@@ -296,11 +296,11 @@ export default function Backends() {
const FILTERS = [
{ key: '', label: 'All', icon: 'fa-layer-group' },
{ key: 'llm', label: 'LLM', icon: 'fa-brain' },
{ key: 'chat', label: 'Chat', icon: 'fa-brain' },
{ key: 'image', label: 'Image', icon: 'fa-image' },
{ key: 'video', label: 'Video', icon: 'fa-video' },
{ key: 'tts', label: 'TTS', icon: 'fa-microphone' },
{ key: 'stt', label: 'STT', icon: 'fa-headphones' },
{ key: 'transcript', label: 'STT', icon: 'fa-headphones' },
{ key: 'vision', label: 'Vision', icon: 'fa-eye' },
]

View File

@@ -11,16 +11,26 @@ import GalleryLoader from '../components/GalleryLoader'
import React from 'react'
const CONTEXT_SIZES = [8192, 16384, 32768, 65536, 131072, 262144]
const CONTEXT_LABELS = ['8K', '16K', '32K', '64K', '128K', '256K']
const FILTERS = [
{ key: '', labelKey: 'filters.all', icon: 'fa-layer-group' },
{ key: 'llm', labelKey: 'filters.llm', icon: 'fa-brain' },
{ key: 'sd', labelKey: 'filters.image', icon: 'fa-image' },
{ key: 'chat', labelKey: 'filters.llm', icon: 'fa-brain' },
{ key: 'image', labelKey: 'filters.image', icon: 'fa-image' },
{ key: 'video', labelKey: 'filters.video', icon: 'fa-video' },
{ key: 'multimodal', labelKey: 'filters.multimodal', icon: 'fa-shapes' },
{ key: 'vision', labelKey: 'filters.vision', icon: 'fa-eye' },
{ key: 'tts', labelKey: 'filters.tts', icon: 'fa-microphone' },
{ key: 'stt', labelKey: 'filters.stt', icon: 'fa-headphones' },
{ key: 'embedding', labelKey: 'filters.embedding', icon: 'fa-vector-square' },
{ key: 'reranker', labelKey: 'filters.rerank', icon: 'fa-sort' },
{ key: 'transcript', labelKey: 'filters.stt', icon: 'fa-headphones' },
{ key: 'diarization', labelKey: 'filters.diarization', icon: 'fa-users' },
{ key: 'sound_generation', labelKey: 'filters.soundGen', icon: 'fa-music' },
{ key: 'audio_transform', labelKey: 'filters.audioTransform', icon: 'fa-sliders' },
{ key: 'embeddings', labelKey: 'filters.embedding', icon: 'fa-vector-square' },
{ key: 'rerank', labelKey: 'filters.rerank', icon: 'fa-sort' },
{ key: 'detection', labelKey: 'filters.detection', icon: 'fa-bullseye' },
{ key: 'vad', labelKey: 'filters.vad', icon: 'fa-wave-square' },
]
export default function Models() {
@@ -34,7 +44,7 @@ export default function Models() {
const [page, setPage] = useState(1)
const [totalPages, setTotalPages] = useState(1)
const [search, setSearch] = useState('')
const [filter, setFilter] = useState('')
const [filters, setFilters] = useState([])
const [sort, setSort] = useState('')
const [order, setOrder] = useState('asc')
const [installing, setInstalling] = useState(new Map())
@@ -43,6 +53,9 @@ export default function Models() {
const [stats, setStats] = useState({ total: 0, installed: 0, repositories: 0 })
const [backendFilter, setBackendFilter] = useState('')
const [allBackends, setAllBackends] = useState([])
const [backendUsecases, setBackendUsecases] = useState({})
const [estimates, setEstimates] = useState({})
const [contextSize, setContextSize] = useState(CONTEXT_SIZES[0])
const [confirmDialog, setConfirmDialog] = useState(null)
// Total GPU memory for "fits" check
@@ -52,14 +65,14 @@ export default function Models() {
try {
setLoading(true)
const searchVal = params.search !== undefined ? params.search : search
const filterVal = params.filter !== undefined ? params.filter : filter
const filtersVal = params.filters !== undefined ? params.filters : filters
const sortVal = params.sort !== undefined ? params.sort : sort
const backendVal = params.backendFilter !== undefined ? params.backendFilter : backendFilter
const queryParams = {
page: params.page || page,
items: 9,
}
if (filterVal) queryParams.tag = filterVal
if (filtersVal.length > 0) queryParams.tag = filtersVal.join(',')
if (searchVal) queryParams.term = searchVal
if (backendVal) queryParams.backend = backendVal
if (sortVal) {
@@ -79,11 +92,27 @@ export default function Models() {
} finally {
setLoading(false)
}
}, [page, search, filter, sort, order, backendFilter, addToast, t])
}, [page, search, filters, sort, order, backendFilter, addToast, t])
useEffect(() => {
fetchModels()
}, [page, filter, sort, order, backendFilter])
}, [page, filters, sort, order, backendFilter])
// Fetch backend→usecase mapping once on mount
useEffect(() => {
modelsApi.backendUsecases().then(setBackendUsecases).catch(() => {})
}, [])
// When backend changes, remove selected filters that aren't available
useEffect(() => {
if (backendFilter && backendUsecases[backendFilter]) {
setFilters(prev => {
const possible = backendUsecases[backendFilter]
const filtered = prev.filter(k => k === 'multimodal' || possible.includes(k))
return filtered.length !== prev.length ? filtered : prev
})
}
}, [backendFilter, backendUsecases])
// Re-fetch when operations change (install/delete completion)
useEffect(() => {
@@ -95,11 +124,42 @@ export default function Models() {
fetchModels({ search: value, page: 1 })
})
// Fetch VRAM/size estimates asynchronously for visible models.
useEffect(() => {
if (models.length === 0) return
let cancelled = false
models.forEach(model => {
const id = model.name || model.id
if (estimates[id]) return
modelsApi.estimate(id, CONTEXT_SIZES).then(est => {
if (cancelled) return
if (est && (est.sizeBytes || est.estimates)) {
setEstimates(prev => ({ ...prev, [id]: est }))
}
}).catch(() => {})
})
return () => { cancelled = true }
}, [models])
const handleSearch = (value) => {
setSearch(value)
debouncedFetch(value)
}
const toggleFilter = (key) => {
if (key === '') { setFilters([]); setPage(1); return }
setFilters(prev =>
prev.includes(key) ? prev.filter(k => k !== key) : [...prev, key]
)
setPage(1)
}
const isFilterAvailable = (key) => {
if (!backendFilter || key === '' || key === 'multimodal') return true
const possible = backendUsecases[backendFilter]
return !possible || possible.includes(key)
}
const handleSort = (col) => {
if (sort === col) {
setOrder(o => o === 'asc' ? 'desc' : 'asc')
@@ -221,16 +281,23 @@ export default function Models() {
{/* Filter buttons */}
<div className="filter-bar">
{FILTERS.map(f => (
<button
key={f.key}
className={`filter-btn ${filter === f.key ? 'active' : ''}`}
onClick={() => { setFilter(f.key); setPage(1) }}
>
<i className={`fas ${f.icon}`} style={{ marginRight: 4 }} />
{t(f.labelKey)}
</button>
))}
{FILTERS.map(f => {
const isAll = f.key === ''
const active = isAll ? filters.length === 0 : filters.includes(f.key)
const available = isFilterAvailable(f.key)
return (
<button
key={f.key}
className={`filter-btn ${active ? 'active' : ''}`}
disabled={!available}
style={!available ? { opacity: 0.4, cursor: 'not-allowed' } : undefined}
onClick={() => toggleFilter(f.key)}
>
<i className={`fas ${f.icon}`} style={{ marginRight: 4 }} />
{t(f.labelKey)}
</button>
)
})}
{allBackends.length > 0 && (
<SearchableSelect
value={backendFilter}
@@ -244,6 +311,25 @@ export default function Models() {
)}
</div>
{/* Context size slider for VRAM estimates */}
<div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)', marginBottom: 'var(--spacing-md)', fontSize: '0.8125rem' }}>
<label style={{ color: 'var(--color-text-muted)', whiteSpace: 'nowrap' }}>
<i className="fas fa-memory" style={{ marginRight: 4 }} />
Context:
</label>
<input
type="range"
min={0}
max={CONTEXT_SIZES.length - 1}
value={CONTEXT_SIZES.indexOf(contextSize)}
onChange={(e) => setContextSize(CONTEXT_SIZES[e.target.value])}
style={{ width: 140, accentColor: 'var(--color-primary)' }}
/>
<span style={{ fontWeight: 600, minWidth: '3em' }}>
{CONTEXT_LABELS[CONTEXT_SIZES.indexOf(contextSize)]}
</span>
</div>
{/* Table */}
{loading ? (
<GalleryLoader />
@@ -252,12 +338,12 @@ export default function Models() {
<div className="empty-state-icon"><i className="fas fa-search" /></div>
<h2 className="empty-state-title">{t('empty.title')}</h2>
<p className="empty-state-text">
{search || filter || backendFilter ? t('empty.withFilters') : t('empty.noFilters')}
{search || filters.length > 0 || backendFilter ? t('empty.withFilters') : t('empty.noFilters')}
</p>
{(search || filter || backendFilter) && (
{(search || filters.length > 0 || backendFilter) && (
<button
className="btn btn-secondary btn-sm"
onClick={() => { handleSearch(''); setFilter(''); setBackendFilter(''); setPage(1) }}
onClick={() => { handleSearch(''); setFilters([]); setBackendFilter(''); setPage(1) }}
>
<i className="fas fa-times" /> {t('search.clearFilters')}
</button>
@@ -286,9 +372,14 @@ export default function Models() {
<tbody>
{models.map((model, idx) => {
const name = model.name || model.id
const estData = estimates[name]
const sizeDisplay = estData?.sizeDisplay
const ctxEst = estData?.estimates?.[String(contextSize)]
const vramDisplay = ctxEst?.vramDisplay
const vramBytes = ctxEst?.vramBytes
const installing = isInstalling(name)
const progress = getOperationProgress(name)
const fit = fitsGpu(model.estimated_vram_bytes)
const fit = fitsGpu(vramBytes)
const isExpanded = expandedRow === idx
return (
@@ -355,15 +446,15 @@ export default function Models() {
{/* Size / VRAM */}
<td>
<div style={{ display: 'flex', flexDirection: 'column', gap: '2px' }}>
{(model.estimated_size_display || model.estimated_vram_display) ? (
{(sizeDisplay || vramDisplay) ? (
<>
<span style={{ fontSize: '0.75rem', color: 'var(--color-text-secondary)' }}>
{model.estimated_size_display && model.estimated_size_display !== '0 B' && (
<span>{t('table.size', { size: model.estimated_size_display })}</span>
{sizeDisplay && sizeDisplay !== '0 B' && (
<span>{t('table.size', { size: sizeDisplay })}</span>
)}
{model.estimated_size_display && model.estimated_size_display !== '0 B' && model.estimated_vram_display && model.estimated_vram_display !== '0 B' && ' · '}
{model.estimated_vram_display && model.estimated_vram_display !== '0 B' && (
<span>{t('table.vram', { vram: model.estimated_vram_display })}</span>
{sizeDisplay && sizeDisplay !== '0 B' && vramDisplay && vramDisplay !== '0 B' && ' · '}
{vramDisplay && vramDisplay !== '0 B' && (
<span>{t('table.vram', { vram: vramDisplay })}</span>
)}
</span>
{fit !== null && (
@@ -437,7 +528,7 @@ export default function Models() {
{isExpanded && (
<tr>
<td colSpan="8" style={{ padding: 0 }}>
<ModelDetail model={model} fit={fit} expandedFiles={expandedFiles} setExpandedFiles={setExpandedFiles} t={t} />
<ModelDetail model={model} fit={fit} sizeDisplay={sizeDisplay} vramDisplay={vramDisplay} expandedFiles={expandedFiles} setExpandedFiles={setExpandedFiles} t={t} />
</td>
</tr>
)}
@@ -490,7 +581,7 @@ function DetailRow({ label, children }) {
)
}
function ModelDetail({ model, fit, expandedFiles, setExpandedFiles, t }) {
function ModelDetail({ model, fit, sizeDisplay, vramDisplay, expandedFiles, setExpandedFiles, t }) {
const files = model.additionalFiles || model.files || []
return (
<div style={{ padding: 'var(--spacing-md) var(--spacing-lg)', background: 'var(--color-bg-primary)', borderTop: '1px solid var(--color-border-subtle)' }}>
@@ -516,12 +607,12 @@ function ModelDetail({ model, fit, expandedFiles, setExpandedFiles, t }) {
)}
</DetailRow>
<DetailRow label={t('detail.size')}>
{model.estimated_size_display && model.estimated_size_display !== '0 B' ? model.estimated_size_display : null}
{sizeDisplay && sizeDisplay !== '0 B' ? sizeDisplay : null}
</DetailRow>
<DetailRow label={t('detail.vram')}>
{model.estimated_vram_display && model.estimated_vram_display !== '0 B' ? (
{vramDisplay && vramDisplay !== '0 B' ? (
<span style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)' }}>
{model.estimated_vram_display}
{vramDisplay}
{fit !== null && (
<span style={{ fontSize: '0.75rem', color: fit ? 'var(--color-success)' : 'var(--color-error)' }}>
<i className="fas fa-microchip" /> {fit ? t('detail.fitsGpu') : t('detail.mayNotFitGpu')}

View File

@@ -86,6 +86,10 @@ export const modelsApi = {
listCapabilities: () => fetchJSON(API_CONFIG.endpoints.modelsCapabilities),
install: (id) => postJSON(API_CONFIG.endpoints.installModel(id), {}),
delete: (id) => postJSON(API_CONFIG.endpoints.deleteModel(id), {}),
estimate: (id, contexts) => fetchJSON(
buildUrl(API_CONFIG.endpoints.modelEstimate(id),
contexts?.length ? { contexts: contexts.join(',') } : {})
),
getConfig: (id) => postJSON(API_CONFIG.endpoints.modelConfig(id), {}),
getConfigJson: (name) => fetchJSON(API_CONFIG.endpoints.modelConfigJson(name)),
getJob: (uid) => fetchJSON(API_CONFIG.endpoints.modelJob(uid)),
@@ -116,6 +120,7 @@ export const modelsApi = {
method: 'PATCH',
body: JSON.stringify(patch),
}),
backendUsecases: () => fetchJSON('/api/backends/usecases'),
}
// Backends API

View File

@@ -9,6 +9,7 @@ export const API_CONFIG = {
models: '/api/models',
installModel: (id) => `/api/models/install/${id}`,
deleteModel: (id) => `/api/models/delete/${id}`,
modelEstimate: (id) => `/api/models/estimate/${id}`,
modelConfig: (id) => `/api/models/config/${id}`,
modelConfigJson: (name) => `/api/models/config-json/${name}`,
configMetadata: '/api/models/config-metadata',

View File

@@ -9,11 +9,9 @@ import (
"math"
"net/http"
"net/url"
"path"
"slices"
"strconv"
"strings"
"sync"
"time"
"github.com/google/uuid"
@@ -37,8 +35,81 @@ const (
licenseSortFieldName = "license"
statusSortFieldName = "status"
ascSortOrder = "asc"
multimodalFilterKey = "multimodal"
)
// usecaseFilters maps UI filter keys to ModelConfigUsecase flags for
// capability-based gallery filtering.
var usecaseFilters = map[string]config.ModelConfigUsecase{
config.UsecaseChat: config.FLAG_CHAT,
config.UsecaseImage: config.FLAG_IMAGE,
config.UsecaseVideo: config.FLAG_VIDEO,
config.UsecaseVision: config.FLAG_VISION,
config.UsecaseTTS: config.FLAG_TTS,
config.UsecaseTranscript: config.FLAG_TRANSCRIPT,
config.UsecaseSoundGeneration: config.FLAG_SOUND_GENERATION,
config.UsecaseEmbeddings: config.FLAG_EMBEDDINGS,
config.UsecaseRerank: config.FLAG_RERANK,
config.UsecaseDetection: config.FLAG_DETECTION,
config.UsecaseVAD: config.FLAG_VAD,
config.UsecaseAudioTransform: config.FLAG_AUDIO_TRANSFORM,
config.UsecaseDiarization: config.FLAG_DIARIZATION,
}
// extractHFRepo tries to find a HuggingFace repo ID from model overrides or URLs.
func extractHFRepo(overrides map[string]any, urls []string) string {
if overrides != nil {
if params, ok := overrides["parameters"].(map[string]any); ok {
if modelRef, ok := params["model"].(string); ok {
if repoID, ok := vram.ExtractHFRepoID(modelRef); ok {
return repoID
}
}
}
}
for _, u := range urls {
if repoID, ok := vram.ExtractHFRepoID(u); ok {
return repoID
}
}
return ""
}
// buildEstimateInput creates a vram.ModelEstimateInput from gallery model metadata.
func buildEstimateInput(m *gallery.GalleryModel) vram.ModelEstimateInput {
var input vram.ModelEstimateInput
input.Size = m.Size
if hfRepoID := extractHFRepo(m.Overrides, m.URLs); hfRepoID != "" {
input.HFRepo = hfRepoID
}
for _, f := range m.AdditionalFiles {
if vram.IsWeightFile(f.URI) {
input.Files = append(input.Files, vram.FileInput{URI: f.URI, Size: 0})
}
}
return input
}
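// Illustrative sketch (hypothetical entry, not part of this change): if
// overrides carries parameters.model = "org/model-GGUF" (a short-form HF repo
// ID), input.HFRepo becomes "org/model-GGUF"; otherwise the first URL that
// vram.ExtractHFRepoID recognizes is used. Weight entries in AdditionalFiles
// become FileInputs with Size 0 so sizes are resolved lazily by the cached
// resolvers.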
// parseContextSizes parses a comma-separated list of context sizes from a query param.
// Returns a default of [8192] if the param is empty or unparseable.
func parseContextSizes(raw string) []uint32 {
if raw == "" {
return []uint32{8192}
}
var sizes []uint32
for _, s := range strings.Split(raw, ",") {
s = strings.TrimSpace(s)
if v, err := strconv.ParseUint(s, 10, 32); err == nil && v > 0 {
sizes = append(sizes, uint32(v))
}
}
if len(sizes) == 0 {
return []uint32{8192}
}
return sizes
}
// getDirectorySize calculates the total size of files in a directory
// metaParentOf returns the name of the auto-resolving (meta) backend that
// declares `name` as one of its hardware-specific variants in its
@@ -260,7 +331,7 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
items = "9"
}
models, err := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.SystemState)
models, err := gallery.AvailableGalleryModelsCached(appConfig.Galleries, appConfig.SystemState)
if err != nil {
xlog.Error("could not list models from galleries", "error", err)
return c.JSON(http.StatusInternalServerError, map[string]any{
@@ -294,8 +365,30 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
}
slices.Sort(backendNames)
// Filter by usecase tags (comma-separated for multi-select).
if tag != "" {
models = gallery.GalleryElements[*gallery.GalleryModel](models).FilterByTag(tag)
var combinedFlag config.ModelConfigUsecase
hasMultimodal := false
var plainTags []string
for _, t := range strings.Split(tag, ",") {
t = strings.TrimSpace(t)
if t == multimodalFilterKey {
hasMultimodal = true
} else if flag, ok := usecaseFilters[t]; ok {
combinedFlag |= flag
} else if t != "" {
plainTags = append(plainTags, t)
}
}
if hasMultimodal {
models = gallery.FilterGalleryModelsByMultimodal(models)
}
if combinedFlag != config.FLAG_ANY {
models = gallery.FilterGalleryModelsByUsecase(models, combinedFlag)
}
for _, pt := range plainTags {
models = gallery.GalleryElements[*gallery.GalleryModel](models).FilterByTag(pt)
}
}
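// Illustrative decomposition (hypothetical query): ?tag=chat,vision,multimodal,gguf
// sets hasMultimodal, folds FLAG_CHAT|FLAG_VISION into combinedFlag, and keeps
// "gguf" as a plain tag — the three filters above then apply in sequence.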
if term != "" {
models = gallery.GalleryElements[*gallery.GalleryModel](models).Search(term)
@@ -355,41 +448,6 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
modelsJSON := make([]map[string]any, 0, len(models))
seenIDs := make(map[string]bool)
weightExts := map[string]bool{".gguf": true, ".safetensors": true, ".bin": true, ".pt": true}
extractHFRepo := func(overrides map[string]any, urls []string) string {
// Try overrides.parameters.model first
if overrides != nil {
if params, ok := overrides["parameters"].(map[string]any); ok {
if modelRef, ok := params["model"].(string); ok {
if repoID, ok := vram.ExtractHFRepoID(modelRef); ok {
return repoID
}
}
}
}
// Fall back to the first HuggingFace URL in the metadata urls list
for _, u := range urls {
if repoID, ok := vram.ExtractHFRepoID(u); ok {
return repoID
}
}
return ""
}
hasWeightFiles := func(files []gallery.File) bool {
for _, f := range files {
ext := strings.ToLower(path.Ext(path.Base(f.URI)))
if weightExts[ext] {
return true
}
}
return false
}
const hfEstimateTimeout = 10 * time.Second
const estimateConcurrency = 3
sem := make(chan struct{}, estimateConcurrency)
var wg sync.WaitGroup
for _, m := range models {
modelID := m.ID()
@@ -431,63 +489,9 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
"backend": m.Backend,
}
// Build EstimateModel input from available metadata
var estimateInput vram.ModelEstimateInput
estimateInput.Options = vram.EstimateOptions{ContextLength: 8192}
estimateInput.Size = m.Size
if hfRepoID := extractHFRepo(m.Overrides, m.URLs); hfRepoID != "" {
estimateInput.HFRepo = hfRepoID
}
if hasWeightFiles(m.AdditionalFiles) {
files := make([]gallery.File, len(m.AdditionalFiles))
copy(files, m.AdditionalFiles)
for _, f := range files {
ext := strings.ToLower(path.Ext(path.Base(f.URI)))
if weightExts[ext] {
estimateInput.Files = append(estimateInput.Files, vram.FileInput{URI: f.URI, Size: 0})
}
}
}
// Run estimation (async for file-based and HF repo, sync for size string only)
needsAsync := len(estimateInput.Files) > 0 || estimateInput.HFRepo != ""
if needsAsync {
input := estimateInput
wg.Go(func() {
sem <- struct{}{}
defer func() { <-sem }()
ctx, cancel := context.WithTimeout(context.Background(), hfEstimateTimeout)
defer cancel()
result, err := vram.EstimateModel(ctx, input)
if err == nil {
if result.SizeBytes > 0 {
obj["estimated_size_bytes"] = result.SizeBytes
obj["estimated_size_display"] = result.SizeDisplay
}
if result.VRAMBytes > 0 {
obj["estimated_vram_bytes"] = result.VRAMBytes
obj["estimated_vram_display"] = result.VRAMDisplay
}
}
})
} else if estimateInput.Size != "" {
result, _ := vram.EstimateModel(context.Background(), estimateInput)
if result.SizeBytes > 0 {
obj["estimated_size_bytes"] = result.SizeBytes
obj["estimated_size_display"] = result.SizeDisplay
}
if result.VRAMBytes > 0 {
obj["estimated_vram_bytes"] = result.VRAMBytes
obj["estimated_vram_display"] = result.VRAMDisplay
}
}
modelsJSON = append(modelsJSON, obj)
}
wg.Wait()
prevPage := pageNum - 1
nextPage := pageNum + 1
if prevPage < 1 {
@@ -639,6 +643,65 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
})
})
// Returns a mapping of backend names to the usecase filter keys they support.
// Used by the gallery frontend to grey out usecase filter buttons when a
// backend is selected.
app.GET("/api/backends/usecases", func(c echo.Context) error {
result := make(map[string][]string, len(config.BackendCapabilities))
for name, cap := range config.BackendCapabilities {
var keys []string
for _, uc := range cap.PossibleUsecases {
if _, ok := usecaseFilters[uc]; ok {
keys = append(keys, uc)
}
}
slices.Sort(keys)
result[name] = keys
}
return c.JSON(200, result)
}, adminMiddleware)
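// Illustrative response shape (hypothetical backend list; keys are sorted):
//
//	{"llama-cpp": ["chat", "embeddings", "rerank", "vision"],
//	 "whisper": ["transcript"]}
//
// Only filter keys known to usecaseFilters appear, so the frontend can match
// them directly against its filter buttons.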
// Returns VRAM/size estimates for a single gallery model at multiple
// context sizes. The frontend calls this per-model so the gallery page
// can load instantly and fill in estimates asynchronously.
// Query params:
// contexts - comma-separated context sizes (default: 8192)
app.GET("/api/models/estimate/:id", func(c echo.Context) error {
modelID, err := url.QueryUnescape(c.Param("id"))
if err != nil {
return c.JSON(http.StatusBadRequest, map[string]any{"error": "invalid model ID"})
}
contextSizes := parseContextSizes(c.QueryParam("contexts"))
// Look up the model from the gallery to build the estimate input.
models, err := gallery.AvailableGalleryModelsCached(appConfig.Galleries, appConfig.SystemState)
if err != nil {
return c.JSON(http.StatusInternalServerError, map[string]any{"error": err.Error()})
}
model := gallery.FindGalleryElement(models, modelID)
if model == nil {
return c.JSON(http.StatusNotFound, map[string]any{"error": "model not found"})
}
input := buildEstimateInput(model)
if len(input.Files) == 0 && input.HFRepo == "" && input.Size == "" {
return c.JSON(200, vram.MultiContextEstimate{})
}
ctx, cancel := context.WithTimeout(c.Request().Context(), 10*time.Second)
defer cancel()
result, err := vram.EstimateModelMultiContext(ctx, input, contextSizes)
if err != nil {
xlog.Debug("model estimate failed", "model", modelID, "error", err)
return c.JSON(200, vram.MultiContextEstimate{})
}
return c.JSON(200, result)
}, adminMiddleware)
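// Illustrative exchange (hypothetical model ID and numbers):
//
//	GET /api/models/estimate/localai%40example-model?contexts=8192,32768
//	=> {"sizeBytes":4000000000,"sizeDisplay":"4.0 GB",
//	    "estimates":{"8192":{...},"32768":{...}},"modelMaxContext":131072}
//
// A missing model returns 404, while estimation failures deliberately return
// an empty MultiContextEstimate with status 200 so the gallery UI shows no
// estimate instead of an error.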
app.POST("/api/models/install/:id", func(c echo.Context) error {
galleryID := c.Param("id")
// URL decode the gallery ID (e.g., "localai%40model" -> "localai@model")
@@ -742,7 +805,7 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
}
xlog.Debug("API job submitted to get config", "galleryID", galleryID)
models, err := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.SystemState)
models, err := gallery.AvailableGalleryModelsCached(appConfig.Galleries, appConfig.SystemState)
if err != nil {
return c.JSON(http.StatusInternalServerError, map[string]any{
"error": err.Error(),
View File
@@ -43,17 +43,16 @@ func EstimateVRAM(ctx context.Context, req VRAMRequest, cl *config.ModelConfigLo
modelsPath := sysState.Model.ModelsPath
var files []vram.FileInput
var firstGGUF string
seen := make(map[string]bool)
for _, f := range cfg.DownloadFiles {
addWeightFile(string(f.URI), modelsPath, &files, &firstGGUF, seen)
addWeightFile(string(f.URI), modelsPath, &files, seen)
}
if cfg.Model != "" {
addWeightFile(cfg.Model, modelsPath, &files, &firstGGUF, seen)
addWeightFile(cfg.Model, modelsPath, &files, seen)
}
if cfg.MMProj != "" {
addWeightFile(cfg.MMProj, modelsPath, &files, &firstGGUF, seen)
addWeightFile(cfg.MMProj, modelsPath, &files, seen)
}
if len(files) == 0 {
@@ -64,39 +63,46 @@ func EstimateVRAM(ctx context.Context, req VRAMRequest, cl *config.ModelConfigLo
}
contextDefaulted := false
opts := vram.EstimateOptions{
ContextLength: req.ContextSize,
GPULayers: req.GPULayers,
KVQuantBits: req.KVQuantBits,
}
if opts.ContextLength == 0 {
ctxLen := req.ContextSize
if ctxLen == 0 {
if cfg.ContextSize != nil {
opts.ContextLength = uint32(*cfg.ContextSize)
ctxLen = uint32(*cfg.ContextSize)
} else {
opts.ContextLength = 8192
ctxLen = 8192
contextDefaulted = true
}
}
opts := vram.EstimateOptions{
GPULayers: req.GPULayers,
KVQuantBits: req.KVQuantBits,
}
subCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
result, err := vram.Estimate(subCtx, files, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
multi, err := vram.EstimateMultiContext(subCtx, files, []uint32{ctxLen}, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
if err != nil {
return nil, fmt.Errorf("vram estimate: %w", err)
}
resp := &VRAMResponse{EstimateResult: result}
at := multi.Estimates[fmt.Sprint(ctxLen)]
resp := &VRAMResponse{
EstimateResult: vram.EstimateResult{
SizeBytes: multi.SizeBytes,
SizeDisplay: multi.SizeDisplay,
ContextLength: at.ContextLength,
VRAMBytes: at.VRAMBytes,
VRAMDisplay: at.VRAMDisplay,
},
ModelMaxContext: multi.ModelMaxContext,
}
if contextDefaulted && firstGGUF != "" {
ggufMeta, err := vram.DefaultCachedGGUFReader().ReadMetadata(subCtx, firstGGUF)
if err == nil && ggufMeta != nil && ggufMeta.MaximumContextLength > 0 {
resp.ModelMaxContext = ggufMeta.MaximumContextLength
resp.ContextNote = fmt.Sprintf(
"Estimate used default context_size=8192. The model's trained maximum context is %d; VRAM usage will be higher at larger context sizes.",
ggufMeta.MaximumContextLength,
)
}
if contextDefaulted && multi.ModelMaxContext > 0 {
resp.ContextNote = fmt.Sprintf(
"Estimate used default context_size=8192. The model's trained maximum context is %d; VRAM usage will be higher at larger context sizes.",
multi.ModelMaxContext,
)
}
return resp, nil
}
@@ -111,8 +117,8 @@ func resolveModelURI(uri, modelsPath string) string {
return "file://" + filepath.Join(modelsPath, uri)
}
// addWeightFile appends a resolved weight file to files and tracks the first GGUF.
func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, firstGGUF *string, seen map[string]bool) {
// addWeightFile appends a resolved weight file to files.
func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, seen map[string]bool) {
if !vram.IsWeightFile(uri) {
return
}
@@ -122,7 +128,4 @@ func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, firstGGUF *s
}
seen[resolved] = true
*files = append(*files, vram.FileInput{URI: resolved, Size: 0})
if *firstGGUF == "" && vram.IsGGUF(uri) {
*firstGGUF = resolved
}
}
View File
@@ -628,10 +628,14 @@ func (r *SmartRouter) estimateModelVRAM(ctx context.Context, opts *pb.ModelOptio
estCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
ctxSize := uint32(opts.ContextSize)
if ctxSize == 0 {
ctxSize = 8192
}
input := vram.ModelEstimateInput{
Options: vram.EstimateOptions{
ContextLength: uint32(opts.ContextSize),
GPULayers: int(opts.NGPULayers),
GPULayers: int(opts.NGPULayers),
},
}
@@ -649,28 +653,15 @@ func (r *SmartRouter) estimateModelVRAM(ctx context.Context, opts *pb.ModelOptio
}
}
// If model file exists, get its size as fallback
if opts.ModelFile != "" && len(input.Files) == 0 {
if info, err := os.Stat(opts.ModelFile); err == nil {
return vram.EstimateFromSize(uint64(info.Size())).VRAMBytes
}
}
if len(input.Files) == 0 && input.HFRepo == "" && input.Size == "" {
return 0
}
result, err := vram.EstimateModel(estCtx, input)
if err != nil || result.VRAMBytes == 0 {
// Last resort: try model file size
if opts.ModelFile != "" {
if info, statErr := os.Stat(opts.ModelFile); statErr == nil {
return vram.EstimateFromSize(uint64(info.Size())).VRAMBytes
}
}
result, err := vram.EstimateModelMultiContext(estCtx, input, []uint32{ctxSize})
if err != nil {
return 0
}
return result.VRAMBytes
return result.VRAMForContext(ctxSize)
}
// installBackendOnNode sends a NATS backend.install request-reply to the node.
View File
@@ -1,42 +1,42 @@
---
name: "chatml"
config_file: |
backend: "llama-cpp"
mmap: true
template:
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|endoftext|>'
backend: llama-cpp
context_size: 4096
f16: true
known_usecases:
- chat
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
name: chatml
View File
@@ -1,24 +1,24 @@
---
name: "deepseek-r1"
config_file: |
backend: "llama-cpp"
context_size: 131072
mmap: true
f16: true
stopwords:
- <begin▁of▁sentence>
- <end▁of▁sentence>
- <User>
- <Assistant>
template:
chat_message: |
{{if eq .RoleName "system" -}}{{.Content }}
{{ end -}}
{{if eq .RoleName "user" -}}<User>{{.Content}}
{{end -}}
{{if eq .RoleName "assistant" -}}<Assistant>{{.Content}}<end▁of▁sentence>{{end}}
completion: |
{{.Input}}
chat: |
{{.Input -}}<Assistant>
backend: llama-cpp
context_size: 131072
f16: true
known_usecases:
- chat
mmap: true
stopwords:
- <begin▁of▁sentence>
- <end▁of▁sentence>
- <User>
- <Assistant>
template:
chat: |
{{.Input -}}<Assistant>
chat_message: |
{{if eq .RoleName "system" -}}{{.Content }}
{{ end -}}
{{if eq .RoleName "user" -}}<User>{{.Content}}
{{end -}}
{{if eq .RoleName "assistant" -}}<Assistant>{{.Content}}<end▁of▁sentence>{{end}}
completion: |
{{.Input}}
name: deepseek-r1
View File
@@ -1,41 +1,42 @@
---
name: "gemma"
config_file: |
backend: "llama-cpp"
mmap: true
context_size: 8192
template:
chat_message: |-
<start_of_turn>{{if eq .RoleName "assistant" }}model{{else}}{{ .RoleName }}{{end}}
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<end_of_turn>
chat: |
{{.Input }}
<start_of_turn>model
completion: |
{{.Input}}
function: |
<start_of_turn>system
You have access to functions. If you decide to invoke any of the function(s),
you MUST put it in the format of
{"name": function name, "parameters": dictionary of argument name and its value}
backend: llama-cpp
context_size: 8192
known_usecases:
- chat
- completion
mmap: true
stopwords:
- <|im_end|>
- <end_of_turn>
- <start_of_turn>
template:
chat: |
{{.Input }}
<start_of_turn>model
chat_message: |-
<start_of_turn>{{if eq .RoleName "assistant" }}model{{else}}{{ .RoleName }}{{end}}
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<end_of_turn>
completion: |
{{.Input}}
function: |
<start_of_turn>system
You have access to functions. If you decide to invoke any of the function(s),
you MUST put it in the format of
{"name": function name, "parameters": dictionary of argument name and its value}
You SHOULD NOT include any other text in the response if you call a function
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
<end_of_turn>
{{.Input -}}
<start_of_turn>model
stopwords:
- '<|im_end|>'
- '<end_of_turn>'
- '<start_of_turn>'
You SHOULD NOT include any other text in the response if you call a function
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
<end_of_turn>
{{.Input -}}
<start_of_turn>model
name: gemma
View File
@@ -1,48 +1,49 @@
---
name: "granite-3.2"
config_file: |
backend: "llama-cpp"
mmap: true
template:
chat_message: |
<|start_of_role|>{{ .RoleName }}<|end_of_role|>
{{ if .FunctionCall -}}
<tool_call>
{{ else if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}
<|end_of_text|>
function: |
<|start_of_role|>system<|end_of_role|>
You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
backend: llama-cpp
context_size: 8192
f16: true
known_usecases:
- chat
- completion
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|end_of_text|>
template:
chat: |
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
chat_message: |
<|start_of_role|>{{ .RoleName }}<|end_of_role|>
{{ if .FunctionCall -}}
<tool_call>
{{ else if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}
<|end_of_text|>
completion: |
{{.Input}}
function: |
<|start_of_role|>system<|end_of_role|>
You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
chat: |
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|end_of_text|>'
Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
name: granite-3.2
View File
@@ -1,69 +1,69 @@
---
name: "harmony"
config_file: |
mmap: true
backend: "llama-cpp"
template:
chat_message: |-
<|start|>{{ if .FunctionCall -}}functions.{{ .FunctionCall.Name }} to=assistant{{ else if eq .RoleName "assistant"}}assistant<|channel|>final<|message|>{{else}}{{ .RoleName }}{{end}}<|message|>
{{- if .Content -}}
{{- .Content -}}
{{- end -}}
{{- if .FunctionCall -}}
{{- toJson .FunctionCall -}}
{{- end -}}<|end|>
function: |-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: {{ now | date "Mon Jan 2 15:04:05 MST 2006" }}
backend: llama-cpp
context_size: 8192
f16: true
known_usecases:
- chat
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
- <|return|>
template:
chat: |-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: {{ now | date "Mon Jan 2 15:04:05 MST 2006" }}
Reasoning: {{if eq .ReasoningEffort ""}}medium{{else}}{{.ReasoningEffort}}{{end}}
Reasoning: {{if eq .ReasoningEffort ""}}medium{{else}}{{.ReasoningEffort}}{{end}}
# {{with .Metadata}}{{ if ne .system_prompt "" }}{{ .system_prompt }}{{ end }}{{else}}You are a friendly and helpful assistant.{{ end }}<|end|>{{- .Input -}}<|start|>assistant
# {{with .Metadata}}{{ if ne .system_prompt "" }}{{ .system_prompt }}{{ end }}{{else}}You are a friendly and helpful assistant.{{ end }}<|end|>{{- .Input -}}<|start|>assistant
chat_message: |-
<|start|>{{ if .FunctionCall -}}functions.{{ .FunctionCall.Name }} to=assistant{{ else if eq .RoleName "assistant"}}assistant<|channel|>final<|message|>{{else}}{{ .RoleName }}{{end}}<|message|>
{{- if .Content -}}
{{- .Content -}}
{{- end -}}
{{- if .FunctionCall -}}
{{- toJson .FunctionCall -}}
{{- end -}}<|end|>
completion: |
{{.Input}}
function: |-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: {{ now | date "Mon Jan 2 15:04:05 MST 2006" }}
# Tools
Reasoning: {{if eq .ReasoningEffort ""}}medium{{else}}{{.ReasoningEffort}}{{end}}
## functions
# {{with .Metadata}}{{ if ne .system_prompt "" }}{{ .system_prompt }}{{ end }}{{else}}You are a friendly and helpful assistant.{{ end }}<|end|>{{- .Input -}}<|start|>assistant
namespace functions {
{{- range .Functions}}
{{if .Description }}
// {{ .Description }}
{{- end }}
{{- if and .Parameters.Properties (gt (len .Parameters.Properties) 0) }}
type {{ .Name }} = (_: {
{{- range $name, $prop := .Parameters.Properties }}
{{- if $prop.Description }}
// {{ $prop.Description }}
{{- end }}
{{ $name }}: {{ if gt (len $prop.Type) 1 }}{{ range $i, $t := $prop.Type }}{{ if $i }} | {{ end }}{{ $t }}{{ end }}{{ else }}{{ index $prop.Type 0 }}{{ end }},
{{- end }}
}) => any;
{{- else }}
type {{ .Function.Name }} = () => any;
{{- end }}
{{- end }}{{/* end of range .Functions */}}
} // namespace functions
# Tools
# Instructions
## functions
<|end|>{{.Input -}}<|start|>assistant
chat: |-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: {{ now | date "Mon Jan 2 15:04:05 MST 2006" }}
namespace functions {
{{- range .Functions}}
{{if .Description }}
// {{ .Description }}
{{- end }}
{{- if and .Parameters.Properties (gt (len .Parameters.Properties) 0) }}
type {{ .Name }} = (_: {
{{- range $name, $prop := .Parameters.Properties }}
{{- if $prop.Description }}
// {{ $prop.Description }}
{{- end }}
{{ $name }}: {{ if gt (len $prop.Type) 1 }}{{ range $i, $t := $prop.Type }}{{ if $i }} | {{ end }}{{ $t }}{{ end }}{{ else }}{{ index $prop.Type 0 }}{{ end }},
{{- end }}
}) => any;
{{- else }}
type {{ .Function.Name }} = () => any;
{{- end }}
{{- end }}{{/* end of range .Functions */}}
} // namespace functions
Reasoning: {{if eq .ReasoningEffort ""}}medium{{else}}{{.ReasoningEffort}}{{end}}
# Instructions
# {{with .Metadata}}{{ if ne .system_prompt "" }}{{ .system_prompt }}{{ end }}{{else}}You are a friendly and helpful assistant.{{ end }}<|end|>{{- .Input -}}<|start|>assistant
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|endoftext|>'
- '<|return|>'
<|end|>{{.Input -}}<|start|>assistant
name: harmony
View File

File diff suppressed because it is too large.

View File
@@ -1,46 +1,46 @@
---
name: "lfm"
config_file: |
backend: "llama-cpp"
mmap: true
template:
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
<|tool_call_start|>
{{ else if eq .RoleName "tool" -}}
<|tool_response_start|>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
<|tool_response_end|>
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.
List of tools: <|tool_list_start|>[
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
]<|tool_list_end|>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|endoftext|>'
backend: llama-cpp
context_size: 4096
f16: true
known_usecases:
- chat
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
<|tool_call_start|>
{{ else if eq .RoleName "tool" -}}
<|tool_response_start|>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
<|tool_response_end|>
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.
List of tools: <|tool_list_start|>[
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
]<|tool_list_end|>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
name: lfm
View File
@@ -1,19 +1,20 @@
---
name: "moondream2"
config_file: |
backend: "llama-cpp"
backend: llama-cpp
context_size: 2046
roles:
user: "\nQuestion: "
system: "\nSystem: "
assistant: "\nAnswer: "
stopwords:
- "Question:"
- "<|endoftext|>"
f16: true
known_usecases:
- chat
roles:
assistant: "\nAnswer: "
system: "\nSystem: "
user: "\nQuestion: "
stopwords:
- 'Question:'
- <|endoftext|>
template:
completion: |
Complete the following sentence: {{.Input}}
chat: "{{.Input}}\nAnswer:\n"
chat: |
{{.Input}}
Answer:
completion: |
Complete the following sentence: {{.Input}}
name: moondream2
View File
@@ -1,16 +1,15 @@
---
name: nanbeige4.1
config_file: |
backend: llama-cpp
function:
grammar:
disable: true
known_usecases:
- chat
options:
- use_jinja:true
parameters:
model: llama-cpp/models/nanbeige4.1-3b-q8_0.gguf
template:
use_tokenizer_template: true
backend: llama-cpp
function:
grammar:
disable: true
known_usecases:
- chat
- completion
options:
- use_jinja:true
parameters:
model: llama-cpp/models/nanbeige4.1-3b-q8_0.gguf
template:
use_tokenizer_template: true
name: nanbeige4.1
View File
@@ -1,9 +1,9 @@
---
name: openvino
config_file: |
backend: transformers
context_size: 8192
type: OVModelForCausalLM
template:
use_tokenizer_template: true
backend: transformers
context_size: 8192
known_usecases:
- embeddings
template:
use_tokenizer_template: true
type: OVModelForCausalLM
name: openvino
View File
@@ -1,46 +1,46 @@
---
name: "qwen3"
config_file: |
parameters:
context_size: 8192
f16: true
mmap: true
backend: "llama-cpp"
template:
chat_message: |
<|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
{{ if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
<tool_call>
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}<|im_end|>
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments: {"name": <function-name>, "arguments": <json-arguments-object>}
<|im_end|>
{{.Input -}}
<|im_start|>assistant
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|endoftext|>'
backend: llama-cpp
known_usecases:
- chat
parameters:
context_size: 8192
f16: true
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
{{ if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
<tool_call>
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}<|im_end|>
completion: |
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments: {"name": <function-name>, "arguments": <json-arguments-object>}
<|im_end|>
{{.Input -}}
<|im_start|>assistant
name: qwen3
View File
@@ -1,20 +1,21 @@
---
name: smolvlm
# yamllint disable-line rule:trailing-spaces
config_file: |
backend: "llama-cpp"
mmap: true
template:
chat_message: |
{{if eq .RoleName "assistant"}}Assistant{{else if eq .RoleName "system"}}System{{else if eq .RoleName "user"}}User{{end}}: {{.Content }}<end_of_utterance>
chat: "<|im_start|>\n{{.Input -}}\nAssistant: "
completion: |
{{-.Input}}
backend: llama-cpp
f16: true
known_usecases:
- chat
- vision
mmap: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|'
- '<end_of_utterance>'
- '<|endoftext|>'
- <|im_end|>
- <dummy32000>
- </s>
- <|
- <end_of_utterance>
- <|endoftext|>
template:
chat: "<|im_start|>\n{{.Input -}}\nAssistant: "
chat_message: |
{{if eq .RoleName "assistant"}}Assistant{{else if eq .RoleName "system"}}System{{else if eq .RoleName "user"}}User{{end}}: {{.Content }}<end_of_utterance>
completion: |
{{-.Input}}
name: smolvlm
View File
@@ -3,94 +3,93 @@ package vram
import (
"context"
"sync"
"time"
)
const defaultEstimateCacheTTL = 15 * time.Minute
// galleryGenFunc returns the current gallery generation counter.
// When set, cache entries are invalidated when the generation changes.
// When nil (e.g., in tests or non-gallery contexts), entries never expire.
var galleryGenFunc func() uint64
// SetGalleryGenerationFunc wires the gallery generation counter into the
// VRAM caches. Call this once at application startup.
func SetGalleryGenerationFunc(fn func() uint64) {
galleryGenFunc = fn
}
func currentGeneration() uint64 {
if galleryGenFunc != nil {
return galleryGenFunc()
}
return 0
}
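// A minimal wiring sketch (assumed counter; the real one lives in the gallery
// package): any function returning a value that changes whenever gallery data
// refreshes works, e.g. an atomic counter bumped by the refresh path:
//
//	var galleryGen atomic.Uint64 // incremented on every gallery refresh
//	vram.SetGalleryGenerationFunc(galleryGen.Load)
//
// After a bump, every cached entry misses once and is refreshed against the
// new gallery data; between bumps, entries live indefinitely.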
type sizeCacheEntry struct {
size int64
err error
until time.Time
size int64
err error
generation uint64
}
type cachedSizeResolver struct {
underlying SizeResolver
ttl time.Duration
mu sync.Mutex
cache map[string]sizeCacheEntry
}
func (c *cachedSizeResolver) ContentLength(ctx context.Context, uri string) (int64, error) {
gen := currentGeneration()
c.mu.Lock()
e, ok := c.cache[uri]
c.mu.Unlock()
if ok && time.Now().Before(e.until) {
if ok && e.generation == gen {
return e.size, e.err
}
size, err := c.underlying.ContentLength(ctx, uri)
c.mu.Lock()
if c.cache == nil {
c.cache = make(map[string]sizeCacheEntry)
}
c.cache[uri] = sizeCacheEntry{size: size, err: err, until: time.Now().Add(c.ttl)}
c.cache[uri] = sizeCacheEntry{size: size, err: err, generation: gen}
c.mu.Unlock()
return size, err
}
type ggufCacheEntry struct {
meta *GGUFMeta
err error
until time.Time
meta *GGUFMeta
err error
generation uint64
}
type cachedGGUFReader struct {
underlying GGUFMetadataReader
ttl time.Duration
mu sync.Mutex
cache map[string]ggufCacheEntry
}
func (c *cachedGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
gen := currentGeneration()
c.mu.Lock()
e, ok := c.cache[uri]
c.mu.Unlock()
if ok && time.Now().Before(e.until) {
if ok && e.generation == gen {
return e.meta, e.err
}
meta, err := c.underlying.ReadMetadata(ctx, uri)
c.mu.Lock()
if c.cache == nil {
c.cache = make(map[string]ggufCacheEntry)
}
c.cache[uri] = ggufCacheEntry{meta: meta, err: err, until: time.Now().Add(c.ttl)}
c.cache[uri] = ggufCacheEntry{meta: meta, err: err, generation: gen}
c.mu.Unlock()
return meta, err
}
// CachedSizeResolver returns a SizeResolver that caches ContentLength results by URI for the given TTL.
func CachedSizeResolver(underlying SizeResolver, ttl time.Duration) SizeResolver {
return &cachedSizeResolver{underlying: underlying, ttl: ttl, cache: make(map[string]sizeCacheEntry)}
}
// CachedGGUFReader returns a GGUFMetadataReader that caches ReadMetadata results by URI for the given TTL.
func CachedGGUFReader(underlying GGUFMetadataReader, ttl time.Duration) GGUFMetadataReader {
return &cachedGGUFReader{underlying: underlying, ttl: ttl, cache: make(map[string]ggufCacheEntry)}
}
// DefaultCachedSizeResolver returns a cached SizeResolver using the default implementation and default TTL (15 min).
// A single shared cache is used so repeated HEAD requests for the same URI are avoided across requests.
// DefaultCachedSizeResolver returns a cached SizeResolver using the default implementation.
// Entries are invalidated when the gallery generation changes.
func DefaultCachedSizeResolver() SizeResolver {
return defaultCachedSizeResolver
}
// DefaultCachedGGUFReader returns a cached GGUFMetadataReader using the default implementation and default TTL (15 min).
// A single shared cache is used so repeated GGUF metadata fetches for the same URI are avoided across requests.
// DefaultCachedGGUFReader returns a cached GGUFMetadataReader using the default implementation.
// Entries are invalidated when the gallery generation changes.
func DefaultCachedGGUFReader() GGUFMetadataReader {
return defaultCachedGGUFReader
}
var (
defaultCachedSizeResolver = CachedSizeResolver(defaultSizeResolver{}, defaultEstimateCacheTTL)
defaultCachedGGUFReader = CachedGGUFReader(defaultGGUFReader{}, defaultEstimateCacheTTL)
defaultCachedSizeResolver = &cachedSizeResolver{underlying: defaultSizeResolver{}, cache: make(map[string]sizeCacheEntry)}
defaultCachedGGUFReader = &cachedGGUFReader{underlying: defaultGGUFReader{}, cache: make(map[string]ggufCacheEntry)}
)
View File
@@ -23,17 +23,19 @@ func IsGGUF(nameOrURI string) bool {
return strings.ToLower(path.Ext(path.Base(nameOrURI))) == ".gguf"
}
func Estimate(ctx context.Context, files []FileInput, opts EstimateOptions, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) (EstimateResult, error) {
if opts.ContextLength == 0 {
opts.ContextLength = 8192
}
if opts.KVQuantBits == 0 {
opts.KVQuantBits = 16
}
// modelProfile captures the "fixed" properties of a model after I/O.
// Everything except context length is constant for a given model.
type modelProfile struct {
sizeBytes uint64 // total weight file size
ggufSize uint64 // GGUF file size (subset of sizeBytes)
meta *GGUFMeta // nil if no GGUF metadata available
}
var sizeBytes uint64
var ggufSize uint64
// resolveProfile does all I/O: iterates files, fetches sizes and GGUF metadata.
func resolveProfile(ctx context.Context, files []FileInput, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) modelProfile {
var p modelProfile
var firstGGUFURI string
for i := range files {
f := &files[i]
if !IsWeightFile(f.URI) {
@@ -47,23 +49,32 @@ func Estimate(ctx context.Context, files []FileInput, opts EstimateOptions, size
continue
}
}
sizeBytes += uint64(sz)
p.sizeBytes += uint64(sz)
if IsGGUF(f.URI) {
ggufSize += uint64(sz)
p.ggufSize += uint64(sz)
if firstGGUFURI == "" {
firstGGUFURI = f.URI
}
}
}
sizeDisplay := FormatBytes(sizeBytes)
if p.ggufSize > 0 && ggufReader != nil && firstGGUFURI != "" {
p.meta, _ = ggufReader.ReadMetadata(ctx, firstGGUFURI)
}
var vramBytes uint64
if ggufSize > 0 {
var meta *GGUFMeta
if ggufReader != nil && firstGGUFURI != "" {
meta, _ = ggufReader.ReadMetadata(ctx, firstGGUFURI)
}
return p
}
// computeVRAM is pure arithmetic — no I/O. Returns VRAM bytes for a given
// model profile and context length.
func computeVRAM(p modelProfile, ctxLen uint32, opts EstimateOptions) uint64 {
kvQuantBits := opts.KVQuantBits
if kvQuantBits == 0 {
kvQuantBits = 16
}
if p.ggufSize > 0 {
meta := p.meta
if meta != nil && (meta.BlockCount > 0 || meta.EmbeddingLength > 0) {
nLayers := meta.BlockCount
if nLayers == 0 {
@@ -84,36 +95,29 @@ func Estimate(ctx context.Context, files []FileInput, opts EstimateOptions, size
if gpuLayers <= 0 {
gpuLayers = int(nLayers)
}
ctxLen := opts.ContextLength
bKV := uint32(opts.KVQuantBits / 8)
bKV := uint32(kvQuantBits / 8)
if bKV == 0 {
bKV = 4
}
M_model := ggufSize
M_KV := uint64(bKV) * uint64(dModel) * uint64(nLayers) * uint64(ctxLen)
if headCountKV > 0 && meta.HeadCount > 0 {
M_KV = uint64(bKV) * uint64(dModel) * uint64(headCountKV) * uint64(ctxLen)
}
M_model := p.ggufSize
M_KV := uint64(bKV) * uint64(dModel) * uint64(headCountKV) * uint64(ctxLen)
P := M_model * 2
M_overhead := uint64(0.02*float64(P) + 0.15*1e9)
vramBytes = M_model + M_KV + M_overhead
vramBytes := M_model + M_KV + M_overhead
if nLayers > 0 && gpuLayers < int(nLayers) {
layerRatio := float64(gpuLayers) / float64(nLayers)
vramBytes = uint64(layerRatio*float64(M_model)) + M_KV + M_overhead
}
} else {
vramBytes = sizeOnlyVRAM(ggufSize, opts.ContextLength)
return vramBytes
}
} else if sizeBytes > 0 {
vramBytes = sizeOnlyVRAM(sizeBytes, opts.ContextLength)
return sizeOnlyVRAM(p.ggufSize, ctxLen)
}
return EstimateResult{
SizeBytes: sizeBytes,
SizeDisplay: sizeDisplay,
VRAMBytes: vramBytes,
VRAMDisplay: FormatBytes(vramBytes),
}, nil
if p.sizeBytes > 0 {
return sizeOnlyVRAM(p.sizeBytes, ctxLen)
}
return 0
}
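// Worked example of the formula above (illustrative numbers): an 8 GB GGUF
// with dModel=4096, headCountKV=8, full offload, and the default 16-bit KV
// cache (bKV=2) at ctxLen=8192:
//
//	M_KV       = 2 * 4096 * 8 * 8192     ≈ 0.54 GB
//	M_overhead = 0.02*(2*8e9) + 0.15e9   ≈ 0.47 GB
//	VRAM       = 8e9 + M_KV + M_overhead ≈ 9.0 GB
//
// With gpuLayers=16 of nLayers=32, only half of M_model counts: ≈ 5.0 GB.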
func sizeOnlyVRAM(sizeOnDisk uint64, ctxLen uint32) uint64 {
@@ -125,6 +129,45 @@ func sizeOnlyVRAM(sizeOnDisk uint64, ctxLen uint32) uint64 {
return vram
}
// buildEstimates computes VRAMAt entries for each context size from a profile.
func buildEstimates(p modelProfile, contextSizes []uint32, opts EstimateOptions) map[string]VRAMAt {
m := make(map[string]VRAMAt, len(contextSizes))
for _, ctxLen := range contextSizes {
vramBytes := computeVRAM(p, ctxLen, opts)
m[fmt.Sprint(ctxLen)] = VRAMAt{
ContextLength: ctxLen,
VRAMBytes: vramBytes,
VRAMDisplay: FormatBytes(vramBytes),
}
}
return m
}
// EstimateMultiContext estimates model size and VRAM at multiple context sizes.
// It performs I/O once (resolveProfile) then computes VRAM for each context size.
func EstimateMultiContext(ctx context.Context, files []FileInput, contextSizes []uint32,
opts EstimateOptions, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) (MultiContextEstimate, error) {
if len(contextSizes) == 0 {
contextSizes = []uint32{8192}
}
p := resolveProfile(ctx, files, sizeResolver, ggufReader)
result := MultiContextEstimate{
SizeBytes: p.sizeBytes,
SizeDisplay: FormatBytes(p.sizeBytes),
Estimates: buildEstimates(p, contextSizes, opts),
}
if p.meta != nil && p.meta.MaximumContextLength > 0 {
result.ModelMaxContext = p.meta.MaximumContextLength
}
return result, nil
}
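// Usage sketch (hypothetical URI):
//
//	files := []FileInput{{URI: "http://host/model.gguf"}}
//	est, _ := EstimateMultiContext(ctx, files, []uint32{8192, 32768},
//		EstimateOptions{}, DefaultCachedSizeResolver(), DefaultCachedGGUFReader())
//	_ = est.VRAMForContext(32768) // per-context lookup; I/O happened only once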
// ParseSizeString parses a human-readable size string (e.g. "500MB", "14.5 GB", "2tb")
// into bytes. Supports B, KB, MB, GB, TB, PB (case-insensitive, space optional).
// Uses SI units (1 KB = 1000 B).
@@ -136,7 +179,6 @@ func ParseSizeString(s string) (uint64, error) {
s = strings.ToUpper(s)
// Find where the numeric part ends
i := 0
for i < len(s) && (s[i] == '.' || (s[i] >= '0' && s[i] <= '9')) {
i++
@@ -177,17 +219,6 @@ func ParseSizeString(s string) (uint64, error) {
return uint64(num * float64(multiplier)), nil
}
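// For example, under the SI rule above, ParseSizeString("14.5GB") returns
// 14_500_000_000, "500 MB" returns 500_000_000, and "2tb" returns
// 2_000_000_000_000.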
// EstimateFromSize builds an EstimateResult from a raw byte count.
func EstimateFromSize(sizeBytes uint64) EstimateResult {
vramBytes := sizeOnlyVRAM(sizeBytes, 8192)
return EstimateResult{
SizeBytes: sizeBytes,
SizeDisplay: FormatBytes(sizeBytes),
VRAMBytes: vramBytes,
VRAMDisplay: FormatBytes(vramBytes),
}
}
func FormatBytes(n uint64) string {
const unit = 1000
if n < unit {
@@ -216,24 +247,29 @@ func DefaultGGUFReader() GGUFMetadataReader {
}
// ModelEstimateInput describes the inputs for a unified VRAM/size estimation.
// The estimator cascades through available data: files → size string → HF repo → zero.
// The estimator cascades through available data: files -> size string -> HF repo -> zero.
type ModelEstimateInput struct {
Files []FileInput // weight files with optional pre-known sizes
Size string // gallery hardcoded size (e.g. "14.5GB")
HFRepo string // HF repo ID or URL
Options EstimateOptions // context length, GPU layers, KV quant bits
Options EstimateOptions // GPU layers, KV quant bits
}
// EstimateModel provides a unified VRAM estimation entry point.
// EstimateModelMultiContext provides a unified VRAM estimation entry point
// that returns estimates at multiple context sizes.
// It tries (in order):
// 1. Direct file-based estimation (GGUF metadata or file size heuristic)
// 2. ParseSizeString from Size field
// 3. EstimateFromHFRepo
// 3. HuggingFace repo file listing
// 4. Zero result
func EstimateModel(ctx context.Context, input ModelEstimateInput) (EstimateResult, error) {
func EstimateModelMultiContext(ctx context.Context, input ModelEstimateInput, contextSizes []uint32) (MultiContextEstimate, error) {
if len(contextSizes) == 0 {
contextSizes = []uint32{8192}
}
// 1. Try direct file estimation
if len(input.Files) > 0 {
result, err := Estimate(ctx, input.Files, input.Options, DefaultCachedSizeResolver(), DefaultCachedGGUFReader())
result, err := EstimateMultiContext(ctx, input.Files, contextSizes, input.Options, DefaultCachedSizeResolver(), DefaultCachedGGUFReader())
if err != nil {
xlog.Debug("VRAM estimation from files failed", "error", err)
}
@@ -247,7 +283,11 @@ func EstimateModel(ctx context.Context, input ModelEstimateInput) (EstimateResul
if sizeBytes, err := ParseSizeString(input.Size); err != nil {
xlog.Debug("VRAM estimation from size string failed", "error", err, "size", input.Size)
} else if sizeBytes > 0 {
return EstimateFromSize(sizeBytes), nil
return MultiContextEstimate{
SizeBytes: sizeBytes,
SizeDisplay: FormatBytes(sizeBytes),
Estimates: buildEstimates(modelProfile{sizeBytes: sizeBytes}, contextSizes, EstimateOptions{}),
}, nil
}
}
@@ -257,15 +297,19 @@ func EstimateModel(ctx context.Context, input ModelEstimateInput) (EstimateResul
hfRepo = repoID
}
if hfRepo != "" {
result, err := EstimateFromHFRepo(ctx, hfRepo)
totalBytes, err := hfRepoWeightSize(ctx, hfRepo)
if err != nil {
xlog.Debug("VRAM estimation from HF repo failed", "error", err, "repo", hfRepo)
}
if err == nil && result.SizeBytes > 0 {
return result, nil
if err == nil && totalBytes > 0 {
return MultiContextEstimate{
SizeBytes: totalBytes,
SizeDisplay: FormatBytes(totalBytes),
Estimates: buildEstimates(modelProfile{sizeBytes: totalBytes}, contextSizes, EstimateOptions{}),
}, nil
}
}
// 4. No estimation possible
return EstimateResult{}, nil
return MultiContextEstimate{}, nil
}
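// Cascade sketch (hypothetical inputs): with only Size: "14.5GB" set, step 1
// is skipped, step 2 parses 14.5e9 bytes, and size-only estimates are built
// for every requested context; with only HFRepo: "org/model", step 3 sums the
// repo's weight files instead. Note that Options only influences step 1 —
// steps 2 and 3 build a bare modelProfile with default EstimateOptions.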
View File
@@ -23,26 +23,25 @@ func (f fakeGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta
return f[uri], nil
}
var _ = Describe("Estimate", func() {
var _ = Describe("EstimateMultiContext", func() {
ctx := context.Background()
defaultCtx := []uint32{8192}
Describe("empty or non-GGUF inputs", func() {
It("returns zero size and vram for nil files", func() {
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, nil, opts, nil, nil)
res, err := EstimateMultiContext(ctx, nil, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(0)))
Expect(res.VRAMBytes).To(Equal(uint64(0)))
Expect(res.Estimates["8192"].VRAMBytes).To(Equal(uint64(0)))
Expect(res.SizeDisplay).To(Equal("0 B"))
})
It("counts only .gguf files and ignores other extensions", func() {
It("counts only weight files and ignores other extensions", func() {
files := []FileInput{
{URI: "http://a/model.gguf", Size: 1_000_000_000},
{URI: "http://a/readme.txt", Size: 100},
}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(1_000_000_000)))
})
@@ -52,8 +51,7 @@ var _ = Describe("Estimate", func() {
{URI: "http://hf.co/model/model.safetensors", Size: 2_000_000_000},
{URI: "http://hf.co/model/model2.safetensors", Size: 3_000_000_000},
}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(5_000_000_000)))
})
@@ -62,24 +60,22 @@ var _ = Describe("Estimate", func() {
Describe("GGUF size and resolver", func() {
It("uses size resolver when file size is not set", func() {
sizes := fakeSizeResolver{"http://example.com/model.gguf": 1_500_000_000}
opts := EstimateOptions{ContextLength: 8192}
files := []FileInput{{URI: "http://example.com/model.gguf"}}
res, err := Estimate(ctx, files, opts, sizes, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, sizes, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(1_500_000_000)))
Expect(res.VRAMBytes).To(BeNumerically(">=", res.SizeBytes))
Expect(res.Estimates["8192"].VRAMBytes).To(BeNumerically(">=", res.SizeBytes))
Expect(res.SizeDisplay).To(Equal("1.5 GB"))
})
It("uses size-only VRAM formula when metadata is missing and size is large", func() {
sizes := fakeSizeResolver{"http://a/model.gguf": 10_000_000_000}
opts := EstimateOptions{ContextLength: 8192}
files := []FileInput{{URI: "http://a/model.gguf"}}
res, err := Estimate(ctx, files, opts, sizes, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, sizes, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.VRAMBytes).To(BeNumerically(">", 10_000_000_000))
Expect(res.Estimates["8192"].VRAMBytes).To(BeNumerically(">", 10_000_000_000))
})
It("sums size for multiple GGUF shards", func() {
@@ -87,18 +83,16 @@ var _ = Describe("Estimate", func() {
{URI: "http://a/shard1.gguf", Size: 10_000_000_000},
{URI: "http://a/shard2.gguf", Size: 5_000_000_000},
}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(15_000_000_000)))
})
It("formats size display correctly", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 2_500_000_000}}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeDisplay).To(Equal("2.5 GB"))
})
@@ -108,24 +102,94 @@ var _ = Describe("Estimate", func() {
It("uses metadata for VRAM when reader returns meta and partial offload", func() {
meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096}
reader := fakeGGUFReader{"http://a/model.gguf": meta}
opts := EstimateOptions{ContextLength: 8192, GPULayers: 20}
opts := EstimateOptions{GPULayers: 20}
files := []FileInput{{URI: "http://a/model.gguf", Size: 8_000_000_000}}
res, err := Estimate(ctx, files, opts, nil, reader)
res, err := EstimateMultiContext(ctx, files, defaultCtx, opts, nil, reader)
Expect(err).ToNot(HaveOccurred())
Expect(res.VRAMBytes).To(BeNumerically(">", 0))
Expect(res.Estimates["8192"].VRAMBytes).To(BeNumerically(">", 0))
})
It("uses metadata head counts for KV and yields vram > size", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 15_000_000_000}}
meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096, HeadCount: 32, HeadCountKV: 8}
reader := fakeGGUFReader{"http://a/model.gguf": meta}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, reader)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, reader)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(15_000_000_000)))
Expect(res.VRAMBytes).To(BeNumerically(">", res.SizeBytes))
Expect(res.Estimates["8192"].VRAMBytes).To(BeNumerically(">", res.SizeBytes))
})
It("populates ModelMaxContext from GGUF metadata", func() {
meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096, MaximumContextLength: 131072}
reader := fakeGGUFReader{"http://a/model.gguf": meta}
files := []FileInput{{URI: "http://a/model.gguf", Size: 8_000_000_000}}
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, reader)
Expect(err).ToNot(HaveOccurred())
Expect(res.ModelMaxContext).To(Equal(uint64(131072)))
})
})
Describe("multi-context behavior", func() {
It("returns estimates for all requested context sizes", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 4_000_000_000}}
sizes := []uint32{8192, 32768, 131072}
res, err := EstimateMultiContext(ctx, files, sizes, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.Estimates).To(HaveLen(3))
Expect(res.Estimates).To(HaveKey("8192"))
Expect(res.Estimates).To(HaveKey("32768"))
Expect(res.Estimates).To(HaveKey("131072"))
})
It("VRAM increases monotonically with context size", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 4_000_000_000}}
meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096, HeadCount: 32, HeadCountKV: 8}
reader := fakeGGUFReader{"http://a/model.gguf": meta}
sizes := []uint32{8192, 16384, 32768, 65536, 131072, 262144}
res, err := EstimateMultiContext(ctx, files, sizes, EstimateOptions{}, nil, reader)
Expect(err).ToNot(HaveOccurred())
prev := uint64(0)
for _, sz := range sizes {
v := res.VRAMForContext(sz)
Expect(v).To(BeNumerically(">", prev), "VRAM should increase at context %d", sz)
prev = v
}
})
It("size is constant across context sizes", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 4_000_000_000}}
sizes := []uint32{8192, 32768}
res, err := EstimateMultiContext(ctx, files, sizes, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(4_000_000_000)))
})
It("defaults to [8192] when contextSizes is empty", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 4_000_000_000}}
res, err := EstimateMultiContext(ctx, files, nil, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.Estimates).To(HaveLen(1))
Expect(res.Estimates).To(HaveKey("8192"))
})
})
Describe("VRAMForContext helper", func() {
It("returns 0 for missing context size", func() {
res := MultiContextEstimate{
Estimates: map[string]VRAMAt{
"8192": {VRAMBytes: 5000},
},
}
Expect(res.VRAMForContext(99999)).To(Equal(uint64(0)))
Expect(res.VRAMForContext(8192)).To(Equal(uint64(5000)))
})
})
})
View File
@@ -4,7 +4,6 @@ import (
"context"
"strings"
"sync"
"time"
hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
)
@@ -15,13 +14,11 @@ var (
)
type hfSizeCacheEntry struct {
result EstimateResult
err error
expiresAt time.Time
totalBytes uint64
err error
generation uint64
}
const hfSizeCacheTTL = 15 * time.Minute
// ExtractHFRepoID extracts a HuggingFace repo ID from a string.
// It handles both short form ("org/model") and full URL form
// ("https://huggingface.co/org/model", "huggingface.co/org/model").
@@ -62,30 +59,31 @@ func ExtractHFRepoID(s string) (string, bool) {
return "", false
}
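// For example (hypothetical repo ID), both "https://huggingface.co/org/model"
// and the short form "org/model" yield ("org/model", true); a string with no
// recognizable repo shape yields ("", false).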
// EstimateFromHFRepo estimates model size by querying the HuggingFace API for file listings.
// Results are cached for 15 minutes.
func EstimateFromHFRepo(ctx context.Context, repoID string) (EstimateResult, error) {
// hfRepoWeightSize returns the total weight file size for a HuggingFace repo.
// Results are cached and invalidated when the gallery generation changes.
func hfRepoWeightSize(ctx context.Context, repoID string) (uint64, error) {
gen := currentGeneration()
hfSizeCacheMu.Lock()
if entry, ok := hfSizeCacheData[repoID]; ok && time.Now().Before(entry.expiresAt) {
if entry, ok := hfSizeCacheData[repoID]; ok && entry.generation == gen {
hfSizeCacheMu.Unlock()
return entry.result, entry.err
return entry.totalBytes, entry.err
}
hfSizeCacheMu.Unlock()
result, err := estimateFromHFRepoUncached(ctx, repoID)
totalBytes, err := hfRepoWeightSizeUncached(ctx, repoID)
hfSizeCacheMu.Lock()
hfSizeCacheData[repoID] = hfSizeCacheEntry{
result: result,
err: err,
expiresAt: time.Now().Add(hfSizeCacheTTL),
totalBytes: totalBytes,
err: err,
generation: gen,
}
hfSizeCacheMu.Unlock()
return result, err
return totalBytes, err
}
func estimateFromHFRepoUncached(ctx context.Context, repoID string) (EstimateResult, error) {
func hfRepoWeightSizeUncached(ctx context.Context, repoID string) (uint64, error) {
client := hfapi.NewClient()
type listResult struct {
@@ -100,17 +98,17 @@ func estimateFromHFRepoUncached(ctx context.Context, repoID string) (EstimateRes
select {
case <-ctx.Done():
return EstimateResult{}, ctx.Err()
return 0, ctx.Err()
case res := <-ch:
if res.err != nil {
return EstimateResult{}, res.err
return 0, res.err
}
return estimateFromFileInfos(res.files), nil
return sumWeightFileBytes(res.files), nil
}
}
func estimateFromFileInfos(files []hfapi.FileInfo) EstimateResult {
var totalSize int64
func sumWeightFileBytes(files []hfapi.FileInfo) uint64 {
var total int64
for _, f := range files {
if f.Type != "file" {
continue
@@ -128,20 +126,10 @@ func estimateFromFileInfos(files []hfapi.FileInfo) EstimateResult {
if f.LFS != nil && f.LFS.Size > 0 {
size = f.LFS.Size
}
totalSize += size
total += size
}
if totalSize <= 0 {
return EstimateResult{}
}
sizeBytes := uint64(totalSize)
vramBytes := sizeOnlyVRAM(sizeBytes, 8192)
return EstimateResult{
SizeBytes: sizeBytes,
SizeDisplay: FormatBytes(sizeBytes),
VRAMBytes: vramBytes,
VRAMDisplay: FormatBytes(vramBytes),
if total < 0 {
return 0
}
return uint64(total)
}
View File
@@ -1,6 +1,9 @@
package vram
import "context"
import (
"context"
"fmt"
)
// FileInput represents a single model file for estimation (URI and optional pre-known size).
type FileInput struct {
@@ -28,16 +31,45 @@ type GGUFMetadataReader interface {
}
// EstimateOptions configures VRAM/size estimation.
// GPULayers and KVQuantBits apply uniformly across all context sizes.
type EstimateOptions struct {
ContextLength uint32
GPULayers int
KVQuantBits int
GPULayers int
KVQuantBits int
}
// EstimateResult holds estimated download size and VRAM with display strings.
type EstimateResult struct {
SizeBytes uint64 `json:"sizeBytes"` // total model weight size in bytes
SizeDisplay string `json:"sizeDisplay"` // human-readable size (e.g. "4.2 GB")
VRAMBytes uint64 `json:"vramBytes"` // estimated VRAM usage in bytes
VRAMDisplay string `json:"vramDisplay"` // human-readable VRAM (e.g. "6.1 GB")
// VRAMAt holds the VRAM estimate at a specific context size.
type VRAMAt struct {
ContextLength uint32 `json:"contextLength"`
VRAMBytes uint64 `json:"vramBytes"`
VRAMDisplay string `json:"vramDisplay"`
}
// EstimateResult is a flat single-context view of an estimate, suitable for
// the REST /api/models/vram-estimate response and the MCP vram_estimate tool.
// It is the legacy shape the LLM and HTTP clients expect (size_bytes /
// size_display / vram_bytes / vram_display).
type EstimateResult struct {
SizeBytes uint64 `json:"size_bytes"`
SizeDisplay string `json:"size_display"`
ContextLength uint32 `json:"context_length,omitempty"`
VRAMBytes uint64 `json:"vram_bytes"`
VRAMDisplay string `json:"vram_display"`
}
// MultiContextEstimate holds VRAM estimates for one or more context sizes,
// computed from a single metadata fetch.
type MultiContextEstimate struct {
SizeBytes uint64 `json:"sizeBytes"`
SizeDisplay string `json:"sizeDisplay"`
Estimates map[string]VRAMAt `json:"estimates"` // keys: context size as string
ModelMaxContext uint64 `json:"modelMaxContext,omitempty"` // from GGUF metadata
}
// VRAMForContext is a convenience method that returns the VRAMBytes for a
// specific context size, or 0 if not present.
func (m MultiContextEstimate) VRAMForContext(ctxLen uint32) uint64 {
if e, ok := m.Estimates[fmt.Sprint(ctxLen)]; ok {
return e.VRAMBytes
}
return 0
}
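// Illustrative JSON shape (assumed values):
//
//	{"sizeBytes": 4000000000, "sizeDisplay": "4.0 GB",
//	 "estimates": {"8192": {"contextLength": 8192, "vramBytes": 5200000000,
//	                        "vramDisplay": "5.2 GB"}},
//	 "modelMaxContext": 131072}
//
// Callers wanting a single number should prefer VRAMForContext over indexing
// Estimates with fmt.Sprint themselves.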