feat(gallery): Speed up load times and clean gallery entries (#9211)

* feat: Rework VRAM estimation and use known_usecases in gallery

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Assisted-by: Claude:claude-opus-4-7 [Claude Code]

* chore(gallery): regenerate gallery index and add known_usecases to model entries

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
Richard Palethorpe
2026-05-06 13:51:38 +01:00
committed by GitHub
parent 6d56bf98fe
commit 969005b2a1
47 changed files with 17089 additions and 5345 deletions

View File

@@ -17,6 +17,7 @@ import (
"github.com/mudler/LocalAI/core/services/jobs"
"github.com/mudler/LocalAI/core/services/nodes"
"github.com/mudler/LocalAI/core/services/storage"
"github.com/mudler/LocalAI/pkg/vram"
coreStartup "github.com/mudler/LocalAI/core/startup"
"github.com/mudler/LocalAI/internal"
@@ -251,6 +252,10 @@ func New(opts ...config.AppOption) (*Application, error) {
go uc.Run(options.Context)
}
// Wire gallery generation counter into VRAM caches so they invalidate
// when gallery data refreshes instead of using a fixed TTL.
vram.SetGalleryGenerationFunc(gallery.GalleryGeneration)
if options.ConfigFile != "" {
if err := application.ModelConfigLoader().LoadMultipleModelConfigsSingleFile(options.ConfigFile, configLoaderOpts...); err != nil {
xlog.Error("error loading config file", "error", err)

View File

@@ -0,0 +1,480 @@
package config
import (
"slices"
"strings"
)
// Usecase name constants — the canonical string values used in gallery entries,
// model configs (known_usecases), and UsecaseInfoMap keys.
const (
UsecaseChat = "chat"
UsecaseCompletion = "completion"
UsecaseEdit = "edit"
UsecaseVision = "vision"
UsecaseEmbeddings = "embeddings"
UsecaseTokenize = "tokenize"
UsecaseImage = "image"
UsecaseVideo = "video"
UsecaseTranscript = "transcript"
UsecaseTTS = "tts"
UsecaseSoundGeneration = "sound_generation"
UsecaseRerank = "rerank"
UsecaseDetection = "detection"
UsecaseVAD = "vad"
UsecaseAudioTransform = "audio_transform"
UsecaseDiarization = "diarization"
)
// GRPCMethod identifies a Backend service RPC from backend.proto.
type GRPCMethod string
const (
MethodPredict GRPCMethod = "Predict"
MethodPredictStream GRPCMethod = "PredictStream"
MethodEmbedding GRPCMethod = "Embedding"
MethodGenerateImage GRPCMethod = "GenerateImage"
MethodGenerateVideo GRPCMethod = "GenerateVideo"
MethodAudioTranscription GRPCMethod = "AudioTranscription"
MethodTTS GRPCMethod = "TTS"
MethodTTSStream GRPCMethod = "TTSStream"
MethodSoundGeneration GRPCMethod = "SoundGeneration"
MethodTokenizeString GRPCMethod = "TokenizeString"
MethodDetect GRPCMethod = "Detect"
MethodRerank GRPCMethod = "Rerank"
MethodVAD GRPCMethod = "VAD"
MethodAudioTransform GRPCMethod = "AudioTransform"
MethodDiarize GRPCMethod = "Diarize"
)
// UsecaseInfo describes a single known_usecase value and how it maps
// to the gRPC backend API.
type UsecaseInfo struct {
// Flag is the ModelConfigUsecase bitmask value.
Flag ModelConfigUsecase
// GRPCMethod is the primary Backend service RPC this usecase maps to.
GRPCMethod GRPCMethod
// IsModifier is true when this usecase doesn't map to its own gRPC RPC
// but modifies how another RPC behaves (e.g., vision uses Predict with images).
IsModifier bool
// DependsOn names the usecase(s) this modifier requires (e.g., "chat").
DependsOn string
// Description is a human/LLM-readable explanation of what this usecase means.
Description string
}
// UsecaseInfoMap maps each known_usecase string to its gRPC and semantic info.
var UsecaseInfoMap = map[string]UsecaseInfo{
UsecaseChat: {
Flag: FLAG_CHAT,
GRPCMethod: MethodPredict,
Description: "Conversational/instruction-following via the Predict RPC with chat templates.",
},
UsecaseCompletion: {
Flag: FLAG_COMPLETION,
GRPCMethod: MethodPredict,
Description: "Text completion via the Predict RPC with a completion template.",
},
UsecaseEdit: {
Flag: FLAG_EDIT,
GRPCMethod: MethodPredict,
Description: "Text editing via the Predict RPC with an edit template.",
},
UsecaseVision: {
Flag: FLAG_VISION,
GRPCMethod: MethodPredict,
IsModifier: true,
DependsOn: UsecaseChat,
Description: "The model accepts images alongside text in the Predict RPC. For llama-cpp this requires an mmproj file.",
},
UsecaseEmbeddings: {
Flag: FLAG_EMBEDDINGS,
GRPCMethod: MethodEmbedding,
Description: "Vector embedding generation via the Embedding RPC.",
},
UsecaseTokenize: {
Flag: FLAG_TOKENIZE,
GRPCMethod: MethodTokenizeString,
Description: "Tokenization via the TokenizeString RPC without running inference.",
},
UsecaseImage: {
Flag: FLAG_IMAGE,
GRPCMethod: MethodGenerateImage,
Description: "Image generation via the GenerateImage RPC (Stable Diffusion, Flux, etc.).",
},
UsecaseVideo: {
Flag: FLAG_VIDEO,
GRPCMethod: MethodGenerateVideo,
Description: "Video generation via the GenerateVideo RPC.",
},
UsecaseTranscript: {
Flag: FLAG_TRANSCRIPT,
GRPCMethod: MethodAudioTranscription,
Description: "Speech-to-text via the AudioTranscription RPC.",
},
UsecaseTTS: {
Flag: FLAG_TTS,
GRPCMethod: MethodTTS,
Description: "Text-to-speech via the TTS RPC.",
},
UsecaseSoundGeneration: {
Flag: FLAG_SOUND_GENERATION,
GRPCMethod: MethodSoundGeneration,
Description: "Music/sound generation via the SoundGeneration RPC (not speech).",
},
UsecaseRerank: {
Flag: FLAG_RERANK,
GRPCMethod: MethodRerank,
Description: "Document reranking via the Rerank RPC.",
},
UsecaseDetection: {
Flag: FLAG_DETECTION,
GRPCMethod: MethodDetect,
Description: "Object detection via the Detect RPC with bounding boxes.",
},
UsecaseVAD: {
Flag: FLAG_VAD,
GRPCMethod: MethodVAD,
Description: "Voice activity detection via the VAD RPC.",
},
UsecaseAudioTransform: {
Flag: FLAG_AUDIO_TRANSFORM,
GRPCMethod: MethodAudioTransform,
Description: "Audio-in / audio-out transformations (echo cancellation, noise suppression, dereverberation, voice conversion) via the AudioTransform RPC.",
},
UsecaseDiarization: {
Flag: FLAG_DIARIZATION,
GRPCMethod: MethodDiarize,
Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
},
}
// BackendCapability describes which gRPC methods and usecases a backend supports.
// Derived from reviewing actual implementations in backend/go/ and backend/python/.
type BackendCapability struct {
// GRPCMethods lists the Backend service RPCs this backend implements.
GRPCMethods []GRPCMethod
// PossibleUsecases lists all usecase strings this backend can support.
PossibleUsecases []string
// DefaultUsecases lists the conservative safe defaults.
DefaultUsecases []string
// AcceptsImages indicates multimodal image input in Predict.
AcceptsImages bool
// AcceptsVideos indicates multimodal video input in Predict.
AcceptsVideos bool
// AcceptsAudios indicates multimodal audio input in Predict.
AcceptsAudios bool
// Description is a human-readable summary of the backend.
Description string
}
// BackendCapabilities maps each backend name (as used in model configs and gallery
// entries) to its verified capabilities. This is the single source of truth for
// what each backend supports.
//
// Backend names use hyphens (e.g., "llama-cpp") matching the gallery convention.
// Use NormalizeBackendName() for names with dots (e.g., "llama.cpp").
var BackendCapabilities = map[string]BackendCapability{
// --- LLM / text generation backends ---
"llama-cpp": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding, MethodTokenizeString},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEdit, UsecaseEmbeddings, UsecaseTokenize, UsecaseVision},
DefaultUsecases: []string{UsecaseChat},
AcceptsImages: true, // requires mmproj
Description: "llama.cpp GGUF models — LLM inference with optional vision via mmproj",
},
"vllm": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings, UsecaseVision},
DefaultUsecases: []string{UsecaseChat},
AcceptsImages: true,
AcceptsVideos: true,
Description: "vLLM engine — high-throughput LLM serving with optional multimodal",
},
"vllm-omni": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodGenerateImage, MethodGenerateVideo, MethodTTS},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseImage, UsecaseVideo, UsecaseTTS, UsecaseVision},
DefaultUsecases: []string{UsecaseChat},
AcceptsImages: true,
AcceptsVideos: true,
AcceptsAudios: true,
Description: "vLLM omni-modal — supports text, image, video generation and TTS",
},
"transformers": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding, MethodTTS, MethodSoundGeneration},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings, UsecaseTTS, UsecaseSoundGeneration},
DefaultUsecases: []string{UsecaseChat},
Description: "HuggingFace transformers — general-purpose Python inference",
},
"mlx": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings},
DefaultUsecases: []string{UsecaseChat},
Description: "Apple MLX framework — optimized for Apple Silicon",
},
"mlx-distributed": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings},
DefaultUsecases: []string{UsecaseChat},
Description: "MLX distributed inference across multiple Apple Silicon devices",
},
"mlx-vlm": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings, UsecaseVision},
DefaultUsecases: []string{UsecaseChat, UsecaseVision},
AcceptsImages: true,
AcceptsAudios: true,
Description: "MLX vision-language models with multimodal input",
},
"mlx-audio": {
GRPCMethods: []GRPCMethod{MethodPredict, MethodTTS},
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseTTS},
DefaultUsecases: []string{UsecaseChat},
Description: "MLX audio models — text generation and TTS",
},
// --- Image/video generation backends ---
"diffusers": {
GRPCMethods: []GRPCMethod{MethodGenerateImage, MethodGenerateVideo},
PossibleUsecases: []string{UsecaseImage, UsecaseVideo},
DefaultUsecases: []string{UsecaseImage},
Description: "HuggingFace diffusers — Stable Diffusion, Flux, video generation",
},
"stablediffusion": {
GRPCMethods: []GRPCMethod{MethodGenerateImage},
PossibleUsecases: []string{UsecaseImage},
DefaultUsecases: []string{UsecaseImage},
Description: "Stable Diffusion native backend",
},
"stablediffusion-ggml": {
GRPCMethods: []GRPCMethod{MethodGenerateImage},
PossibleUsecases: []string{UsecaseImage},
DefaultUsecases: []string{UsecaseImage},
Description: "Stable Diffusion via GGML quantized models",
},
// --- Speech-to-text backends ---
"whisper": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription, MethodVAD},
PossibleUsecases: []string{UsecaseTranscript, UsecaseVAD},
DefaultUsecases: []string{UsecaseTranscript},
Description: "OpenAI Whisper — speech recognition and voice activity detection",
},
"faster-whisper": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "CTranslate2-accelerated Whisper for faster transcription",
},
"whisperx": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "WhisperX — Whisper with word-level timestamps and speaker diarization",
},
"moonshine": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "Moonshine speech recognition",
},
"nemo": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "NVIDIA NeMo speech recognition",
},
"qwen-asr": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "Qwen automatic speech recognition",
},
"voxtral": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
PossibleUsecases: []string{UsecaseTranscript},
DefaultUsecases: []string{UsecaseTranscript},
Description: "Voxtral speech recognition",
},
"vibevoice": {
GRPCMethods: []GRPCMethod{MethodAudioTranscription, MethodTTS},
PossibleUsecases: []string{UsecaseTranscript, UsecaseTTS},
DefaultUsecases: []string{UsecaseTranscript, UsecaseTTS},
Description: "VibeVoice — bidirectional speech (transcription and synthesis)",
},
// --- TTS backends ---
"piper": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Piper — fast neural TTS optimized for Raspberry Pi",
},
"kokoro": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Kokoro TTS",
},
"coqui": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Coqui TTS — multi-speaker neural synthesis",
},
"kitten-tts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Kitten TTS",
},
"outetts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "OuteTTS",
},
"pocket-tts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Pocket TTS — lightweight text-to-speech",
},
"qwen-tts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Qwen TTS",
},
"faster-qwen3-tts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Faster Qwen3 TTS — accelerated Qwen TTS",
},
"fish-speech": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Fish Speech TTS",
},
"neutts": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "NeuTTS — neural text-to-speech",
},
"chatterbox": {
GRPCMethods: []GRPCMethod{MethodTTS},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "Chatterbox TTS",
},
"voxcpm": {
GRPCMethods: []GRPCMethod{MethodTTS, MethodTTSStream},
PossibleUsecases: []string{UsecaseTTS},
DefaultUsecases: []string{UsecaseTTS},
Description: "VoxCPM TTS with streaming support",
},
// --- Sound generation backends ---
"ace-step": {
GRPCMethods: []GRPCMethod{MethodTTS, MethodSoundGeneration},
PossibleUsecases: []string{UsecaseTTS, UsecaseSoundGeneration},
DefaultUsecases: []string{UsecaseSoundGeneration},
Description: "ACE-Step — music and sound generation",
},
"acestep-cpp": {
GRPCMethods: []GRPCMethod{MethodSoundGeneration},
PossibleUsecases: []string{UsecaseSoundGeneration},
DefaultUsecases: []string{UsecaseSoundGeneration},
Description: "ACE-Step C++ — native sound generation",
},
"transformers-musicgen": {
GRPCMethods: []GRPCMethod{MethodTTS, MethodSoundGeneration},
PossibleUsecases: []string{UsecaseTTS, UsecaseSoundGeneration},
DefaultUsecases: []string{UsecaseSoundGeneration},
Description: "Meta MusicGen via transformers — music generation from text",
},
// --- Audio transform backends ---
"localvqe": {
GRPCMethods: []GRPCMethod{MethodAudioTransform},
PossibleUsecases: []string{UsecaseAudioTransform},
DefaultUsecases: []string{UsecaseAudioTransform},
Description: "LocalVQE — joint AEC, noise suppression, and dereverberation for 16 kHz mono speech",
},
// --- Utility backends ---
"rerankers": {
GRPCMethods: []GRPCMethod{MethodRerank},
PossibleUsecases: []string{UsecaseRerank},
DefaultUsecases: []string{UsecaseRerank},
Description: "Cross-encoder reranking models",
},
"rfdetr": {
GRPCMethods: []GRPCMethod{MethodDetect},
PossibleUsecases: []string{UsecaseDetection},
DefaultUsecases: []string{UsecaseDetection},
Description: "RF-DETR object detection",
},
"silero-vad": {
GRPCMethods: []GRPCMethod{MethodVAD},
PossibleUsecases: []string{UsecaseVAD},
DefaultUsecases: []string{UsecaseVAD},
Description: "Silero VAD — voice activity detection",
},
}
// NormalizeBackendName converts backend names to the canonical hyphenated form
// used in gallery entries (e.g., "llama.cpp" → "llama-cpp").
func NormalizeBackendName(backend string) string {
return strings.ReplaceAll(backend, ".", "-")
}
// GetBackendCapability returns the capability info for a backend, or nil if unknown.
// Handles backend name normalization.
func GetBackendCapability(backend string) *BackendCapability {
if cap, ok := BackendCapabilities[NormalizeBackendName(backend)]; ok {
return &cap
}
return nil
}
// PossibleUsecasesForBackend returns all usecases a backend can support.
// Returns nil if the backend is unknown.
func PossibleUsecasesForBackend(backend string) []string {
if cap := GetBackendCapability(backend); cap != nil {
return cap.PossibleUsecases
}
return nil
}
// DefaultUsecasesForBackendCap returns the conservative default usecases for a backend.
// Returns nil if the backend is unknown.
func DefaultUsecasesForBackendCap(backend string) []string {
if cap := GetBackendCapability(backend); cap != nil {
return cap.DefaultUsecases
}
return nil
}
// IsValidUsecaseForBackend checks whether a usecase is in a backend's possible set.
// Returns true for unknown backends (permissive fallback).
func IsValidUsecaseForBackend(backend, usecase string) bool {
cap := GetBackendCapability(backend)
if cap == nil {
return true // unknown backend — don't restrict
}
return slices.Contains(cap.PossibleUsecases, usecase)
}
// AllBackendNames returns a sorted list of all known backend names.
func AllBackendNames() []string {
names := make([]string, 0, len(BackendCapabilities))
for name := range BackendCapabilities {
names = append(names, name)
}
slices.Sort(names)
return names
}
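
A hedged usage sketch of the helpers above (hypothetical main, not part of this diff; assumes the core/config import path):

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/core/config"
)

func main() {
	// Dotted names resolve through NormalizeBackendName.
	if capInfo := config.GetBackendCapability("llama.cpp"); capInfo != nil {
		fmt.Println(capInfo.DefaultUsecases) // [chat]
	}
	// Declared usecases validate; out-of-set ones do not.
	fmt.Println(config.IsValidUsecaseForBackend("piper", config.UsecaseTTS))  // true
	fmt.Println(config.IsValidUsecaseForBackend("piper", config.UsecaseChat)) // false
	// Unknown backends are deliberately permissive.
	fmt.Println(config.IsValidUsecaseForBackend("some-new-backend", "anything")) // true
}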

View File

@@ -0,0 +1,95 @@
package config
import (
"slices"
"strings"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
var _ = Describe("BackendCapabilities", func() {
It("every backend declares possible/default usecases and gRPC methods", func() {
for name, cap := range BackendCapabilities {
Expect(cap.PossibleUsecases).NotTo(BeEmpty(), "backend %q has no possible usecases", name)
Expect(cap.DefaultUsecases).NotTo(BeEmpty(), "backend %q has no default usecases", name)
Expect(cap.GRPCMethods).NotTo(BeEmpty(), "backend %q has no gRPC methods", name)
}
})
It("default usecases are a subset of possible usecases", func() {
for name, cap := range BackendCapabilities {
for _, d := range cap.DefaultUsecases {
Expect(cap.PossibleUsecases).To(ContainElement(d), "backend %q: default %q not in possible %v", name, d, cap.PossibleUsecases)
}
}
})
It("every backend's possible usecases map to a known FLAG_*", func() {
allFlags := GetAllModelConfigUsecases()
for name, cap := range BackendCapabilities {
for _, u := range cap.PossibleUsecases {
info, ok := UsecaseInfoMap[u]
Expect(ok).To(BeTrue(), "backend %q: usecase %q not in UsecaseInfoMap", name, u)
flagName := "FLAG_" + strings.ToUpper(u)
if _, ok := allFlags[flagName]; ok {
continue
}
// Some usecase names don't transform exactly to FLAG_<UPPER>; fall back to flag value lookup.
found := false
for _, flag := range allFlags {
if flag == info.Flag {
found = true
break
}
}
Expect(found).To(BeTrue(), "backend %q: usecase %q flag %d not in GetAllModelConfigUsecases", name, u, info.Flag)
}
}
})
It("every UsecaseInfoMap entry has a non-zero flag and a gRPC method", func() {
for name, info := range UsecaseInfoMap {
Expect(info.Flag).NotTo(Equal(FLAG_ANY), "usecase %q has FLAG_ANY (zero) — should have a real flag", name)
Expect(info.GRPCMethod).NotTo(BeEmpty(), "usecase %q has no gRPC method", name)
}
})
})
var _ = Describe("GetBackendCapability", func() {
It("returns the capability for a known backend", func() {
cap := GetBackendCapability("llama-cpp")
Expect(cap).NotTo(BeNil())
Expect(cap.PossibleUsecases).To(ContainElement("chat"))
})
It("normalizes hyphenated names so llama.cpp resolves to llama-cpp", func() {
Expect(GetBackendCapability("llama.cpp")).NotTo(BeNil())
})
It("returns nil for unknown backends", func() {
Expect(GetBackendCapability("nonexistent")).To(BeNil())
})
})
var _ = Describe("IsValidUsecaseForBackend", func() {
It("accepts a backend's declared usecases", func() {
Expect(IsValidUsecaseForBackend("piper", "tts")).To(BeTrue())
})
It("rejects usecases outside a backend's possible set", func() {
Expect(IsValidUsecaseForBackend("piper", "chat")).To(BeFalse())
})
It("is permissive for unknown backends", func() {
Expect(IsValidUsecaseForBackend("unknown", "anything")).To(BeTrue())
})
})
var _ = Describe("AllBackendNames", func() {
It("returns 30+ backends in sorted order", func() {
names := AllBackendNames()
Expect(len(names)).To(BeNumerically(">=", 30))
Expect(slices.IsSorted(names)).To(BeTrue())
})
})

View File

@@ -630,16 +630,45 @@ const (
FLAG_TOKENIZE ModelConfigUsecase = 0b001000000000
FLAG_VAD ModelConfigUsecase = 0b010000000000
FLAG_VIDEO ModelConfigUsecase = 0b100000000000
FLAG_DETECTION ModelConfigUsecase = 0b1000000000000
FLAG_FACE_RECOGNITION ModelConfigUsecase = 0b10000000000000
FLAG_SPEAKER_RECOGNITION ModelConfigUsecase = 0b100000000000000
FLAG_AUDIO_TRANSFORM ModelConfigUsecase = 0b1000000000000000
FLAG_DIARIZATION ModelConfigUsecase = 0b10000000000000000
FLAG_DETECTION ModelConfigUsecase = 0b1000000000000
FLAG_VISION ModelConfigUsecase = 0b10000000000000
FLAG_FACE_RECOGNITION ModelConfigUsecase = 0b100000000000000
FLAG_SPEAKER_RECOGNITION ModelConfigUsecase = 0b1000000000000000
FLAG_AUDIO_TRANSFORM ModelConfigUsecase = 0b10000000000000000
FLAG_DIARIZATION ModelConfigUsecase = 0b100000000000000000
// Common Subsets
FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
)
// ModalityGroups defines groups of usecases that belong to the same modality.
// Flags within the same group are NOT orthogonal (e.g., chat and completion are
// both text/language). A model is multimodal when its usecases span 2+ groups.
var ModalityGroups = []ModelConfigUsecase{
FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT, // text/language
FLAG_VISION | FLAG_DETECTION, // visual understanding
FLAG_TRANSCRIPT, // speech input
FLAG_TTS | FLAG_SOUND_GENERATION, // audio output
FLAG_AUDIO_TRANSFORM, // audio in/out transforms
FLAG_IMAGE | FLAG_VIDEO, // visual generation
}
// IsMultimodal returns true if the given usecases span two or more orthogonal
// modality groups. For example chat+vision is multimodal, but chat+completion
// is not (both belong to the text/language group).
func IsMultimodal(usecases ModelConfigUsecase) bool {
groupCount := 0
for _, group := range ModalityGroups {
if usecases&group != 0 {
groupCount++
if groupCount >= 2 {
return true
}
}
}
return false
}
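
An illustrative example test for the group logic (hypothetical, would live in a _test.go file in this package alongside an fmt import):

func ExampleIsMultimodal() {
	// chat and completion share the text/language group: one group, not multimodal.
	fmt.Println(IsMultimodal(FLAG_CHAT | FLAG_COMPLETION))
	// chat plus vision spans text/language and visual understanding: multimodal.
	fmt.Println(IsMultimodal(FLAG_CHAT | FLAG_VISION))
	// Output:
	// false
	// true
}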
func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
return map[string]ModelConfigUsecase{
// Note: FLAG_ANY is intentionally excluded from this map
@@ -657,7 +686,8 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
"FLAG_VAD": FLAG_VAD,
"FLAG_LLM": FLAG_LLM,
"FLAG_VIDEO": FLAG_VIDEO,
"FLAG_DETECTION": FLAG_DETECTION,
"FLAG_DETECTION": FLAG_DETECTION,
"FLAG_VISION": FLAG_VISION,
"FLAG_FACE_RECOGNITION": FLAG_FACE_RECOGNITION,
"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
"FLAG_AUDIO_TRANSFORM": FLAG_AUDIO_TRANSFORM,

View File

@@ -7,6 +7,8 @@ import (
"path/filepath"
"slices"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/lithammer/fuzzysearch/fuzzy"
@@ -92,6 +94,34 @@ func (gm GalleryElements[T]) Search(term string) GalleryElements[T] {
return filteredModels
}
// FilterGalleryModelsByUsecase returns models whose known_usecases include all
// the bits set in usecase. For example, passing FLAG_CHAT matches any model
// with the chat usecase; passing FLAG_CHAT|FLAG_VISION matches only models
// that have both.
func FilterGalleryModelsByUsecase(models GalleryElements[*GalleryModel], usecase config.ModelConfigUsecase) GalleryElements[*GalleryModel] {
var filtered GalleryElements[*GalleryModel]
for _, m := range models {
u := m.GetKnownUsecases()
if u != nil && (*u&usecase) == usecase {
filtered = append(filtered, m)
}
}
return filtered
}
// FilterGalleryModelsByMultimodal returns models whose known_usecases span two
// or more orthogonal modality groups (e.g. chat+vision, tts+transcript).
func FilterGalleryModelsByMultimodal(models GalleryElements[*GalleryModel]) GalleryElements[*GalleryModel] {
var filtered GalleryElements[*GalleryModel]
for _, m := range models {
u := m.GetKnownUsecases()
if u != nil && config.IsMultimodal(*u) {
filtered = append(filtered, m)
}
}
return filtered
}
func (gm GalleryElements[T]) FilterByTag(tag string) GalleryElements[T] {
var filtered GalleryElements[T]
for _, m := range gm {
@@ -267,6 +297,77 @@ func AvailableGalleryModels(galleries []config.Gallery, systemState *system.Syst
return models, nil
}
var (
availableModelsMu sync.RWMutex
availableModelsCache GalleryElements[*GalleryModel]
refreshing atomic.Bool
galleryGeneration atomic.Uint64
)
// GalleryGeneration returns a counter that increments each time the gallery
// model list is refreshed from upstream. VRAM estimation caches use this to
// invalidate entries when the gallery data changes.
func GalleryGeneration() uint64 { return galleryGeneration.Load() }
// AvailableGalleryModelsCached returns gallery models from an in-memory cache.
// Local-only fields (installed status) are refreshed on every call. A background
// goroutine is triggered to re-fetch the full model list (including network
// calls) so subsequent requests pick up changes without blocking the caller.
// The first call with an empty cache blocks until the initial load completes.
func AvailableGalleryModelsCached(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryModel], error) {
availableModelsMu.RLock()
cached := availableModelsCache
availableModelsMu.RUnlock()
if cached != nil {
// Refresh installed status under write lock to avoid races with
// concurrent readers and the background refresh goroutine.
availableModelsMu.Lock()
for _, m := range cached {
_, err := os.Stat(filepath.Join(systemState.Model.ModelsPath, fmt.Sprintf("%s.yaml", m.GetName())))
m.SetInstalled(err == nil)
}
availableModelsMu.Unlock()
// Trigger a background refresh if one is not already running.
triggerGalleryRefresh(galleries, systemState)
return cached, nil
}
// No cache yet — must do a blocking load.
models, err := AvailableGalleryModels(galleries, systemState)
if err != nil {
return nil, err
}
availableModelsMu.Lock()
availableModelsCache = models
galleryGeneration.Add(1)
availableModelsMu.Unlock()
return models, nil
}
// triggerGalleryRefresh starts a background goroutine that refreshes the
// gallery model cache. Only one refresh runs at a time; concurrent calls
// are no-ops.
func triggerGalleryRefresh(galleries []config.Gallery, systemState *system.SystemState) {
if !refreshing.CompareAndSwap(false, true) {
return
}
go func() {
defer refreshing.Store(false)
models, err := AvailableGalleryModels(galleries, systemState)
if err != nil {
xlog.Error("background gallery refresh failed", "error", err)
return
}
availableModelsMu.Lock()
availableModelsCache = models
galleryGeneration.Add(1)
availableModelsMu.Unlock()
}()
}
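
A caller-side sketch (refreshAwareHandler is a hypothetical in-package function, shown only to illustrate the intended access pattern):

func refreshAwareHandler(galleries []config.Gallery, systemState *system.SystemState) error {
	// First call with a cold cache blocks on the full fetch; later calls
	// return immediately and schedule at most one background refresh.
	models, err := AvailableGalleryModelsCached(galleries, systemState)
	if err != nil {
		return err
	}
	// Key any derived data on the generation counter and rebuild it lazily
	// when the counter advances after a background refresh completes.
	_ = GalleryGeneration()
	_ = models
	return nil
}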
// List available backends
func AvailableBackends(galleries []config.Gallery, systemState *system.SystemState) (GalleryElements[*GalleryBackend], error) {
return availableBackendsWithFilter(galleries, systemState, true)

View File

@@ -581,4 +581,42 @@ var _ = Describe("Gallery", func() {
Expect(mergedParams["model"]).To(Equal("nanbeige4.1-3b-q4_k_m.gguf"))
})
})
Describe("GetKnownUsecases", func() {
It("uses explicit known_usecases from overrides when present", func() {
m := &GalleryModel{
Metadata: Metadata{Backend: "stablediffusion-ggml"},
Overrides: map[string]any{
"known_usecases": []any{"chat"},
},
}
u := m.GetKnownUsecases()
Expect(u).NotTo(BeNil())
// Override wins over the backend's image default.
Expect(*u & config.FLAG_CHAT).To(Equal(config.FLAG_CHAT))
Expect(*u & config.FLAG_IMAGE).To(Equal(config.ModelConfigUsecase(0)))
})
It("falls back to backend defaults when no override is set", func() {
m := &GalleryModel{Metadata: Metadata{Backend: "stablediffusion-ggml"}}
u := m.GetKnownUsecases()
Expect(u).NotTo(BeNil())
Expect(*u & config.FLAG_IMAGE).To(Equal(config.FLAG_IMAGE))
})
It("returns nil when neither overrides nor a known backend provide usecases", func() {
m := &GalleryModel{}
Expect(m.GetKnownUsecases()).To(BeNil())
})
It("filters models without explicit known_usecases via backend defaults", func() {
models := GalleryElements[*GalleryModel]{
&GalleryModel{Metadata: Metadata{Name: "sd-model", Backend: "stablediffusion-ggml"}},
&GalleryModel{Metadata: Metadata{Name: "whisper-model", Backend: "whisper"}},
}
filtered := FilterGalleryModelsByUsecase(models, config.FLAG_IMAGE)
Expect(filtered).To(HaveLen(1))
Expect(filtered[0].Name).To(Equal("sd-model"))
})
})
})

View File

@@ -97,7 +97,7 @@ func (i *DiffuserImporter) Import(details Details) (gallery.ModelConfig, error)
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"image"},
KnownUsecaseStrings: []string{config.UsecaseImage},
Backend: backend,
PredictionOptions: schema.PredictionOptions{
BasicModelRequest: schema.BasicModelRequest{

View File

@@ -135,7 +135,7 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Options: []string{"use_jinja:true"},
Backend: backend,
TemplateConfig: config.TemplateConfig{

View File

@@ -45,7 +45,7 @@ func ImportLocalPath(dirPath, name string) (*config.ModelConfig, error) {
cfg := &config.ModelConfig{
Name: name,
Backend: "llama-cpp",
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Options: []string{"use_jinja:true"},
}
cfg.Model = relPath(ggufFile)
@@ -104,7 +104,7 @@ func ImportLocalPath(dirPath, name string) (*config.ModelConfig, error) {
cfg := &config.ModelConfig{
Name: name,
Backend: "transformers",
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
}
cfg.Model = baseModel
cfg.TemplateConfig.UseTokenizerTemplate = true
@@ -120,7 +120,7 @@ func ImportLocalPath(dirPath, name string) (*config.ModelConfig, error) {
cfg := &config.ModelConfig{
Name: name,
Backend: "transformers",
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
}
cfg.Model = baseModel
cfg.TemplateConfig.UseTokenizerTemplate = true
@@ -135,7 +135,7 @@ func ImportLocalPath(dirPath, name string) (*config.ModelConfig, error) {
cfg := &config.ModelConfig{
Name: name,
Backend: "transformers",
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
}
cfg.Model = relPath(dirPath)
cfg.TemplateConfig.UseTokenizerTemplate = true

View File

@@ -73,7 +73,7 @@ func (i *MLXImporter) Import(details Details) (gallery.ModelConfig, error) {
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Backend: backend,
PredictionOptions: schema.PredictionOptions{
BasicModelRequest: schema.BasicModelRequest{

View File

@@ -87,7 +87,7 @@ func (i *TransformersImporter) Import(details Details) (gallery.ModelConfig, err
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Backend: backend,
PredictionOptions: schema.PredictionOptions{
BasicModelRequest: schema.BasicModelRequest{

View File

@@ -77,7 +77,7 @@ func (i *VLLMImporter) Import(details Details) (gallery.ModelConfig, error) {
modelConfig := config.ModelConfig{
Name: name,
Description: description,
KnownUsecaseStrings: []string{"chat"},
KnownUsecaseStrings: []string{config.UsecaseChat},
Backend: backend,
PredictionOptions: schema.PredictionOptions{
BasicModelRequest: schema.BasicModelRequest{

View File

@@ -52,3 +52,39 @@ func (m *GalleryModel) GetTags() []string {
func (m *GalleryModel) GetDescription() string {
return m.Description
}
// GetKnownUsecases returns the usecase flags declared by the gallery entry,
// falling back to the resolved backend's default usecases when the entry has
// none of its own. Returns nil only when neither source provides any.
//
// Why the fallback: many gallery entries omit known_usecases because their
// backend has only one sensible mode (e.g. stablediffusion-ggml is always
// image generation). Without this fallback such models silently disappear
// from usecase-based filtering in the UI.
func (m *GalleryModel) GetKnownUsecases() *config.ModelConfigUsecase {
if strs := overrideUsecaseStrings(m.Overrides); len(strs) > 0 {
return config.GetUsecasesFromYAML(strs)
}
if defaults := config.DefaultUsecasesForBackendCap(m.Backend); len(defaults) > 0 {
return config.GetUsecasesFromYAML(defaults)
}
return nil
}
func overrideUsecaseStrings(overrides map[string]any) []string {
raw, ok := overrides["known_usecases"]
if !ok {
return nil
}
list, ok := raw.([]any)
if !ok {
return nil
}
strs := make([]string, 0, len(list))
for _, v := range list {
if s, ok := v.(string); ok {
strs = append(strs, s)
}
}
return strs
}
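
An illustrative example of the fallback (hypothetical example test in this package, mirroring the specs above; needs fmt and the config import):

func ExampleGalleryModel_GetKnownUsecases() {
	// No known_usecases override, but whisper's DefaultUsecases is [transcript].
	m := &GalleryModel{Metadata: Metadata{Backend: "whisper"}}
	u := m.GetKnownUsecases()
	fmt.Println(u != nil && *u&config.FLAG_TRANSCRIPT != 0)
	// Output:
	// true
}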

View File

@@ -116,13 +116,13 @@ func AutocompleteEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, a
capability := strings.TrimPrefix(provider, "models:")
var filterFn config.ModelConfigFilterFn
switch capability {
case "chat":
case config.UsecaseChat:
filterFn = config.BuildUsecaseFilterFn(config.FLAG_CHAT)
case "tts":
case config.UsecaseTTS:
filterFn = config.BuildUsecaseFilterFn(config.FLAG_TTS)
case "vad":
case config.UsecaseVAD:
filterFn = config.BuildUsecaseFilterFn(config.FLAG_VAD)
case "transcript":
case config.UsecaseTranscript:
filterFn = config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)
default:
filterFn = config.NoFilterFn

View File

@@ -77,18 +77,17 @@ func ImportModelURIEndpoint(cl *config.ModelConfigLoader, appConfig *config.Appl
}
estCtx, cancel := context.WithTimeout(c.Request().Context(), 5*time.Second)
defer cancel()
result, err := vram.EstimateModel(estCtx, vram.ModelEstimateInput{
Files: files,
Options: vram.EstimateOptions{ContextLength: 8192},
})
result, err := vram.EstimateModelMultiContext(estCtx, vram.ModelEstimateInput{
Files: files,
}, []uint32{8192})
if err == nil {
if result.SizeBytes > 0 {
resp.EstimatedSizeBytes = result.SizeBytes
resp.EstimatedSizeDisplay = result.SizeDisplay
}
if result.VRAMBytes > 0 {
resp.EstimatedVRAMBytes = result.VRAMBytes
resp.EstimatedVRAMDisplay = result.VRAMDisplay
if v := result.VRAMForContext(8192); v > 0 {
resp.EstimatedVRAMBytes = v
resp.EstimatedVRAMDisplay = vram.FormatBytes(v)
}
}
}

View File

@@ -9,10 +9,9 @@ import (
)
// VRAMEstimateEndpoint returns a handler that estimates VRAM usage for an
// installed model configuration. For uninstalled models (gallery URLs), use
// the gallery-level estimates in /api/models instead.
// installed model configuration at multiple context sizes.
// @Summary Estimate VRAM usage for a model
// @Description Estimates VRAM based on model weight files, context size, and GPU layers
// @Description Estimates VRAM based on model weight files at multiple context sizes
// @Tags config
// @Accept json
// @Produce json

View File

@@ -121,13 +121,13 @@ var _ = Describe("VRAM Estimate Endpoint", func() {
Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
// The response should have non-zero size and vram estimates.
// JSON numbers unmarshal as float64.
sizeBytes, ok := resp["sizeBytes"].(float64)
Expect(ok).To(BeTrue(), "sizeBytes should be a number, got: %v (response: %s)", resp["sizeBytes"], rec.Body.String())
sizeBytes, ok := resp["size_bytes"].(float64)
Expect(ok).To(BeTrue(), "size_bytes should be a number, got: %v (response: %s)", resp["size_bytes"], rec.Body.String())
Expect(sizeBytes).To(BeNumerically(">", 0))
vramBytes, ok := resp["vramBytes"].(float64)
Expect(ok).To(BeTrue(), "vramBytes should be a number")
vramBytes, ok := resp["vram_bytes"].(float64)
Expect(ok).To(BeTrue(), "vram_bytes should be a number")
Expect(vramBytes).To(BeNumerically(">", 0))
Expect(resp["sizeDisplay"]).NotTo(BeEmpty())
Expect(resp["vramDisplay"]).NotTo(BeEmpty())
Expect(resp["size_display"]).NotTo(BeEmpty())
Expect(resp["vram_display"]).NotTo(BeEmpty())
})
})

View File

@@ -2,13 +2,13 @@ import { test, expect } from '@playwright/test'
const MOCK_MODELS_RESPONSE = {
models: [
{ name: 'llama-model', description: 'A llama model', backend: 'llama-cpp', installed: false, tags: ['llm'] },
{ name: 'whisper-model', description: 'A whisper model', backend: 'whisper', installed: true, tags: ['stt'] },
{ name: 'llama-model', description: 'A llama model', backend: 'llama-cpp', installed: false, tags: ['chat'] },
{ name: 'whisper-model', description: 'A whisper model', backend: 'whisper', installed: true, tags: ['transcript'] },
{ name: 'stablediffusion-model', description: 'An image model', backend: 'stablediffusion', installed: false, tags: ['sd'] },
{ name: 'unknown-model', description: 'No backend', backend: '', installed: false, tags: [] },
],
allBackends: ['llama-cpp', 'stablediffusion', 'whisper'],
allTags: ['llm', 'sd', 'stt'],
allTags: ['chat', 'sd', 'transcript'],
availableModels: 4,
installedModels: 1,
totalPages: 1,
@@ -78,3 +78,121 @@ test.describe('Models Gallery - Backend Features', () => {
await expect(detail.locator('text=llama-cpp')).toBeVisible()
})
})
const BACKEND_USECASES_MOCK = {
'llama-cpp': ['chat', 'embeddings', 'vision'],
'whisper': ['transcript'],
'stablediffusion': ['image'],
}
test.describe('Models Gallery - Multi-select Filters', () => {
test.beforeEach(async ({ page }) => {
await page.route('**/api/models*', (route) => {
route.fulfill({
contentType: 'application/json',
body: JSON.stringify(MOCK_MODELS_RESPONSE),
})
})
await page.route('**/api/backends/usecases', (route) => {
route.fulfill({
contentType: 'application/json',
body: JSON.stringify(BACKEND_USECASES_MOCK),
})
})
await page.goto('/app/models')
await expect(page.locator('th', { hasText: 'Backend' })).toBeVisible({ timeout: 10_000 })
})
test('multi-select toggle: click Chat, TTS, then Chat again', async ({ page }) => {
const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
await chatBtn.click()
await expect(chatBtn).toHaveClass(/active/)
await ttsBtn.click()
await expect(chatBtn).toHaveClass(/active/)
await expect(ttsBtn).toHaveClass(/active/)
// Click Chat again to deselect it
await chatBtn.click()
await expect(chatBtn).not.toHaveClass(/active/)
await expect(ttsBtn).toHaveClass(/active/)
})
test('"All" clears selection', async ({ page }) => {
const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
const allBtn = page.locator('.filter-btn', { hasText: 'All' })
await chatBtn.click()
await expect(chatBtn).toHaveClass(/active/)
await allBtn.click()
await expect(allBtn).toHaveClass(/active/)
await expect(chatBtn).not.toHaveClass(/active/)
})
test('query param sent correctly with multiple filters', async ({ page }) => {
const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
// Click Chat and wait for its request to settle
await chatBtn.click()
await page.waitForResponse(resp => resp.url().includes('/api/models'))
// Now click TTS and capture the resulting request
const [request] = await Promise.all([
page.waitForRequest(req => {
if (!req.url().includes('/api/models')) return false
const u = new URL(req.url())
const tag = u.searchParams.get('tag')
return tag && tag.split(',').length >= 2
}),
ttsBtn.click(),
])
const url = new URL(request.url())
const tags = url.searchParams.get('tag').split(',').sort()
expect(tags).toEqual(['chat', 'tts'])
})
test('backend greys out unavailable filters', async ({ page }) => {
// Select llama-cpp backend via dropdown
await page.locator('button', { hasText: 'All Backends' }).click()
const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
await dropdown.locator('text=llama-cpp').click()
// Wait for filter state to update
const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
const sttBtn = page.locator('.filter-btn', { hasText: 'STT' })
const imageBtn = page.locator('.filter-btn', { hasText: 'Image' })
// TTS, STT, Image should be disabled for llama-cpp
await expect(ttsBtn).toBeDisabled()
await expect(sttBtn).toBeDisabled()
await expect(imageBtn).toBeDisabled()
// Chat, Embeddings, Vision should remain enabled
const chatBtn = page.locator('.filter-btn', { hasText: 'Chat' })
const embBtn = page.locator('.filter-btn', { hasText: 'Embeddings' })
const visBtn = page.locator('.filter-btn', { hasText: 'Vision' })
await expect(chatBtn).toBeEnabled()
await expect(embBtn).toBeEnabled()
await expect(visBtn).toBeEnabled()
})
test('backend clears incompatible filters', async ({ page }) => {
// Select TTS filter first
const ttsBtn = page.locator('.filter-btn', { hasText: 'TTS' })
await ttsBtn.click()
await expect(ttsBtn).toHaveClass(/active/)
// Now select llama-cpp backend (which doesn't support TTS)
await page.locator('button', { hasText: 'All Backends' }).click()
const dropdown = page.locator('input[placeholder="Search backends..."]').locator('..').locator('..')
await dropdown.locator('text=llama-cpp').click()
// TTS should be auto-removed from selection
await expect(ttsBtn).not.toHaveClass(/active/)
})
})

View File

@@ -20,6 +20,7 @@
"vision": "Vision",
"tts": "TTS",
"stt": "STT",
"diarization": "Diarisierung",
"embedding": "Embedding",
"rerank": "Rerank",
"allBackends": "Alle Backends",

View File

@@ -14,14 +14,20 @@
},
"filters": {
"all": "All",
"llm": "LLM",
"llm": "Chat",
"image": "Image",
"video": "Video",
"multimodal": "Multimodal",
"vision": "Vision",
"tts": "TTS",
"stt": "STT",
"embedding": "Embedding",
"diarization": "Diarization",
"soundGen": "Sound",
"audioTransform": "Audio FX",
"embedding": "Embeddings",
"rerank": "Rerank",
"detection": "Detection",
"vad": "VAD",
"allBackends": "All Backends",
"searchBackends": "Search backends..."
},

View File

@@ -20,6 +20,7 @@
"vision": "Visión",
"tts": "TTS",
"stt": "STT",
"diarization": "Diarización",
"embedding": "Embedding",
"rerank": "Rerank",
"allBackends": "Todos los backends",

View File

@@ -20,6 +20,7 @@
"vision": "Visione",
"tts": "TTS",
"stt": "STT",
"diarization": "Diarizzazione",
"embedding": "Embedding",
"rerank": "Rerank",
"allBackends": "Tutti i backend",

View File

@@ -20,6 +20,7 @@
"vision": "视觉",
"tts": "TTS",
"stt": "STT",
"diarization": "说话人分离",
"embedding": "嵌入",
"rerank": "重排",
"allBackends": "所有后端",

View File

@@ -296,11 +296,11 @@ export default function Backends() {
const FILTERS = [
{ key: '', label: 'All', icon: 'fa-layer-group' },
{ key: 'llm', label: 'LLM', icon: 'fa-brain' },
{ key: 'chat', label: 'Chat', icon: 'fa-brain' },
{ key: 'image', label: 'Image', icon: 'fa-image' },
{ key: 'video', label: 'Video', icon: 'fa-video' },
{ key: 'tts', label: 'TTS', icon: 'fa-microphone' },
{ key: 'stt', label: 'STT', icon: 'fa-headphones' },
{ key: 'transcript', label: 'STT', icon: 'fa-headphones' },
{ key: 'vision', label: 'Vision', icon: 'fa-eye' },
]

View File

@@ -11,16 +11,26 @@ import GalleryLoader from '../components/GalleryLoader'
import React from 'react'
const CONTEXT_SIZES = [8192, 16384, 32768, 65536, 131072, 262144]
const CONTEXT_LABELS = ['8K', '16K', '32K', '64K', '128K', '256K']
const FILTERS = [
{ key: '', labelKey: 'filters.all', icon: 'fa-layer-group' },
{ key: 'llm', labelKey: 'filters.llm', icon: 'fa-brain' },
{ key: 'sd', labelKey: 'filters.image', icon: 'fa-image' },
{ key: 'chat', labelKey: 'filters.llm', icon: 'fa-brain' },
{ key: 'image', labelKey: 'filters.image', icon: 'fa-image' },
{ key: 'video', labelKey: 'filters.video', icon: 'fa-video' },
{ key: 'multimodal', labelKey: 'filters.multimodal', icon: 'fa-shapes' },
{ key: 'vision', labelKey: 'filters.vision', icon: 'fa-eye' },
{ key: 'tts', labelKey: 'filters.tts', icon: 'fa-microphone' },
{ key: 'stt', labelKey: 'filters.stt', icon: 'fa-headphones' },
{ key: 'embedding', labelKey: 'filters.embedding', icon: 'fa-vector-square' },
{ key: 'reranker', labelKey: 'filters.rerank', icon: 'fa-sort' },
{ key: 'transcript', labelKey: 'filters.stt', icon: 'fa-headphones' },
{ key: 'diarization', labelKey: 'filters.diarization', icon: 'fa-users' },
{ key: 'sound_generation', labelKey: 'filters.soundGen', icon: 'fa-music' },
{ key: 'audio_transform', labelKey: 'filters.audioTransform', icon: 'fa-sliders' },
{ key: 'embeddings', labelKey: 'filters.embedding', icon: 'fa-vector-square' },
{ key: 'rerank', labelKey: 'filters.rerank', icon: 'fa-sort' },
{ key: 'detection', labelKey: 'filters.detection', icon: 'fa-bullseye' },
{ key: 'vad', labelKey: 'filters.vad', icon: 'fa-wave-square' },
]
export default function Models() {
@@ -34,7 +44,7 @@ export default function Models() {
const [page, setPage] = useState(1)
const [totalPages, setTotalPages] = useState(1)
const [search, setSearch] = useState('')
const [filter, setFilter] = useState('')
const [filters, setFilters] = useState([])
const [sort, setSort] = useState('')
const [order, setOrder] = useState('asc')
const [installing, setInstalling] = useState(new Map())
@@ -43,6 +53,9 @@ export default function Models() {
const [stats, setStats] = useState({ total: 0, installed: 0, repositories: 0 })
const [backendFilter, setBackendFilter] = useState('')
const [allBackends, setAllBackends] = useState([])
const [backendUsecases, setBackendUsecases] = useState({})
const [estimates, setEstimates] = useState({})
const [contextSize, setContextSize] = useState(CONTEXT_SIZES[0])
const [confirmDialog, setConfirmDialog] = useState(null)
// Total GPU memory for "fits" check
@@ -52,14 +65,14 @@ export default function Models() {
try {
setLoading(true)
const searchVal = params.search !== undefined ? params.search : search
const filterVal = params.filter !== undefined ? params.filter : filter
const filtersVal = params.filters !== undefined ? params.filters : filters
const sortVal = params.sort !== undefined ? params.sort : sort
const backendVal = params.backendFilter !== undefined ? params.backendFilter : backendFilter
const queryParams = {
page: params.page || page,
items: 9,
}
if (filterVal) queryParams.tag = filterVal
if (filtersVal.length > 0) queryParams.tag = filtersVal.join(',')
if (searchVal) queryParams.term = searchVal
if (backendVal) queryParams.backend = backendVal
if (sortVal) {
@@ -79,11 +92,27 @@ export default function Models() {
} finally {
setLoading(false)
}
}, [page, search, filter, sort, order, backendFilter, addToast, t])
}, [page, search, filters, sort, order, backendFilter, addToast, t])
useEffect(() => {
fetchModels()
}, [page, filter, sort, order, backendFilter])
}, [page, filters, sort, order, backendFilter])
// Fetch backend→usecase mapping once on mount
useEffect(() => {
modelsApi.backendUsecases().then(setBackendUsecases).catch(() => {})
}, [])
// When backend changes, remove selected filters that aren't available
useEffect(() => {
if (backendFilter && backendUsecases[backendFilter]) {
setFilters(prev => {
const possible = backendUsecases[backendFilter]
const filtered = prev.filter(k => k === 'multimodal' || possible.includes(k))
return filtered.length !== prev.length ? filtered : prev
})
}
}, [backendFilter, backendUsecases])
// Re-fetch when operations change (install/delete completion)
useEffect(() => {
@@ -95,11 +124,42 @@ export default function Models() {
fetchModels({ search: value, page: 1 })
})
// Fetch VRAM/size estimates asynchronously for visible models.
useEffect(() => {
if (models.length === 0) return
let cancelled = false
models.forEach(model => {
const id = model.name || model.id
if (estimates[id]) return
modelsApi.estimate(id, CONTEXT_SIZES).then(est => {
if (cancelled) return
if (est && (est.sizeBytes || est.estimates)) {
setEstimates(prev => ({ ...prev, [id]: est }))
}
}).catch(() => {})
})
return () => { cancelled = true }
}, [models])
const handleSearch = (value) => {
setSearch(value)
debouncedFetch(value)
}
const toggleFilter = (key) => {
if (key === '') { setFilters([]); setPage(1); return }
setFilters(prev =>
prev.includes(key) ? prev.filter(k => k !== key) : [...prev, key]
)
setPage(1)
}
const isFilterAvailable = (key) => {
if (!backendFilter || key === '' || key === 'multimodal') return true
const possible = backendUsecases[backendFilter]
return !possible || possible.includes(key)
}
const handleSort = (col) => {
if (sort === col) {
setOrder(o => o === 'asc' ? 'desc' : 'asc')
@@ -221,16 +281,23 @@ export default function Models() {
{/* Filter buttons */}
<div className="filter-bar">
{FILTERS.map(f => (
<button
key={f.key}
className={`filter-btn ${filter === f.key ? 'active' : ''}`}
onClick={() => { setFilter(f.key); setPage(1) }}
>
<i className={`fas ${f.icon}`} style={{ marginRight: 4 }} />
{t(f.labelKey)}
</button>
))}
{FILTERS.map(f => {
const isAll = f.key === ''
const active = isAll ? filters.length === 0 : filters.includes(f.key)
const available = isFilterAvailable(f.key)
return (
<button
key={f.key}
className={`filter-btn ${active ? 'active' : ''}`}
disabled={!available}
style={!available ? { opacity: 0.4, cursor: 'not-allowed' } : undefined}
onClick={() => toggleFilter(f.key)}
>
<i className={`fas ${f.icon}`} style={{ marginRight: 4 }} />
{t(f.labelKey)}
</button>
)
})}
{allBackends.length > 0 && (
<SearchableSelect
value={backendFilter}
@@ -244,6 +311,25 @@ export default function Models() {
)}
</div>
{/* Context size slider for VRAM estimates */}
<div style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)', marginBottom: 'var(--spacing-md)', fontSize: '0.8125rem' }}>
<label style={{ color: 'var(--color-text-muted)', whiteSpace: 'nowrap' }}>
<i className="fas fa-memory" style={{ marginRight: 4 }} />
Context:
</label>
<input
type="range"
min={0}
max={CONTEXT_SIZES.length - 1}
value={CONTEXT_SIZES.indexOf(contextSize)}
onChange={(e) => setContextSize(CONTEXT_SIZES[e.target.value])}
style={{ width: 140, accentColor: 'var(--color-primary)' }}
/>
<span style={{ fontWeight: 600, minWidth: '3em' }}>
{CONTEXT_LABELS[CONTEXT_SIZES.indexOf(contextSize)]}
</span>
</div>
{/* Table */}
{loading ? (
<GalleryLoader />
@@ -252,12 +338,12 @@ export default function Models() {
<div className="empty-state-icon"><i className="fas fa-search" /></div>
<h2 className="empty-state-title">{t('empty.title')}</h2>
<p className="empty-state-text">
{search || filter || backendFilter ? t('empty.withFilters') : t('empty.noFilters')}
{search || filters.length > 0 || backendFilter ? t('empty.withFilters') : t('empty.noFilters')}
</p>
{(search || filter || backendFilter) && (
{(search || filters.length > 0 || backendFilter) && (
<button
className="btn btn-secondary btn-sm"
onClick={() => { handleSearch(''); setFilter(''); setBackendFilter(''); setPage(1) }}
onClick={() => { handleSearch(''); setFilters([]); setBackendFilter(''); setPage(1) }}
>
<i className="fas fa-times" /> {t('search.clearFilters')}
</button>
@@ -286,9 +372,14 @@ export default function Models() {
<tbody>
{models.map((model, idx) => {
const name = model.name || model.id
const estData = estimates[name]
const sizeDisplay = estData?.sizeDisplay
const ctxEst = estData?.estimates?.[String(contextSize)]
const vramDisplay = ctxEst?.vramDisplay
const vramBytes = ctxEst?.vramBytes
const installing = isInstalling(name)
const progress = getOperationProgress(name)
const fit = fitsGpu(model.estimated_vram_bytes)
const fit = fitsGpu(vramBytes)
const isExpanded = expandedRow === idx
return (
@@ -355,15 +446,15 @@ export default function Models() {
{/* Size / VRAM */}
<td>
<div style={{ display: 'flex', flexDirection: 'column', gap: '2px' }}>
{(model.estimated_size_display || model.estimated_vram_display) ? (
{(sizeDisplay || vramDisplay) ? (
<>
<span style={{ fontSize: '0.75rem', color: 'var(--color-text-secondary)' }}>
{model.estimated_size_display && model.estimated_size_display !== '0 B' && (
<span>{t('table.size', { size: model.estimated_size_display })}</span>
{sizeDisplay && sizeDisplay !== '0 B' && (
<span>{t('table.size', { size: sizeDisplay })}</span>
)}
{model.estimated_size_display && model.estimated_size_display !== '0 B' && model.estimated_vram_display && model.estimated_vram_display !== '0 B' && ' · '}
{model.estimated_vram_display && model.estimated_vram_display !== '0 B' && (
<span>{t('table.vram', { vram: model.estimated_vram_display })}</span>
{sizeDisplay && sizeDisplay !== '0 B' && vramDisplay && vramDisplay !== '0 B' && ' · '}
{vramDisplay && vramDisplay !== '0 B' && (
<span>{t('table.vram', { vram: vramDisplay })}</span>
)}
</span>
{fit !== null && (
@@ -437,7 +528,7 @@ export default function Models() {
{isExpanded && (
<tr>
<td colSpan="8" style={{ padding: 0 }}>
<ModelDetail model={model} fit={fit} expandedFiles={expandedFiles} setExpandedFiles={setExpandedFiles} t={t} />
<ModelDetail model={model} fit={fit} sizeDisplay={sizeDisplay} vramDisplay={vramDisplay} expandedFiles={expandedFiles} setExpandedFiles={setExpandedFiles} t={t} />
</td>
</tr>
)}
@@ -490,7 +581,7 @@ function DetailRow({ label, children }) {
)
}
function ModelDetail({ model, fit, expandedFiles, setExpandedFiles, t }) {
function ModelDetail({ model, fit, sizeDisplay, vramDisplay, expandedFiles, setExpandedFiles, t }) {
const files = model.additionalFiles || model.files || []
return (
<div style={{ padding: 'var(--spacing-md) var(--spacing-lg)', background: 'var(--color-bg-primary)', borderTop: '1px solid var(--color-border-subtle)' }}>
@@ -516,12 +607,12 @@ function ModelDetail({ model, fit, expandedFiles, setExpandedFiles, t }) {
)}
</DetailRow>
<DetailRow label={t('detail.size')}>
{model.estimated_size_display && model.estimated_size_display !== '0 B' ? model.estimated_size_display : null}
{sizeDisplay && sizeDisplay !== '0 B' ? sizeDisplay : null}
</DetailRow>
<DetailRow label={t('detail.vram')}>
{model.estimated_vram_display && model.estimated_vram_display !== '0 B' ? (
{vramDisplay && vramDisplay !== '0 B' ? (
<span style={{ display: 'flex', alignItems: 'center', gap: 'var(--spacing-sm)' }}>
{model.estimated_vram_display}
{vramDisplay}
{fit !== null && (
<span style={{ fontSize: '0.75rem', color: fit ? 'var(--color-success)' : 'var(--color-error)' }}>
<i className="fas fa-microchip" /> {fit ? t('detail.fitsGpu') : t('detail.mayNotFitGpu')}

View File

@@ -86,6 +86,10 @@ export const modelsApi = {
listCapabilities: () => fetchJSON(API_CONFIG.endpoints.modelsCapabilities),
install: (id) => postJSON(API_CONFIG.endpoints.installModel(id), {}),
delete: (id) => postJSON(API_CONFIG.endpoints.deleteModel(id), {}),
estimate: (id, contexts) => fetchJSON(
buildUrl(API_CONFIG.endpoints.modelEstimate(id),
contexts?.length ? { contexts: contexts.join(',') } : {})
),
getConfig: (id) => postJSON(API_CONFIG.endpoints.modelConfig(id), {}),
getConfigJson: (name) => fetchJSON(API_CONFIG.endpoints.modelConfigJson(name)),
getJob: (uid) => fetchJSON(API_CONFIG.endpoints.modelJob(uid)),
@@ -116,6 +120,7 @@ export const modelsApi = {
method: 'PATCH',
body: JSON.stringify(patch),
}),
backendUsecases: () => fetchJSON('/api/backends/usecases'),
}
// Backends API

View File

@@ -9,6 +9,7 @@ export const API_CONFIG = {
models: '/api/models',
installModel: (id) => `/api/models/install/${id}`,
deleteModel: (id) => `/api/models/delete/${id}`,
modelEstimate: (id) => `/api/models/estimate/${id}`,
modelConfig: (id) => `/api/models/config/${id}`,
modelConfigJson: (name) => `/api/models/config-json/${name}`,
configMetadata: '/api/models/config-metadata',

View File

@@ -9,11 +9,9 @@ import (
"math"
"net/http"
"net/url"
"path"
"slices"
"strconv"
"strings"
"sync"
"time"
"github.com/google/uuid"
@@ -37,8 +35,81 @@ const (
licenseSortFieldName = "license"
statusSortFieldName = "status"
ascSortOrder = "asc"
multimodalFilterKey = "multimodal"
)
// usecaseFilters maps UI filter keys to ModelConfigUsecase flags for
// capability-based gallery filtering.
var usecaseFilters = map[string]config.ModelConfigUsecase{
config.UsecaseChat: config.FLAG_CHAT,
config.UsecaseImage: config.FLAG_IMAGE,
config.UsecaseVideo: config.FLAG_VIDEO,
config.UsecaseVision: config.FLAG_VISION,
config.UsecaseTTS: config.FLAG_TTS,
config.UsecaseTranscript: config.FLAG_TRANSCRIPT,
config.UsecaseSoundGeneration: config.FLAG_SOUND_GENERATION,
config.UsecaseEmbeddings: config.FLAG_EMBEDDINGS,
config.UsecaseRerank: config.FLAG_RERANK,
config.UsecaseDetection: config.FLAG_DETECTION,
config.UsecaseVAD: config.FLAG_VAD,
config.UsecaseAudioTransform: config.FLAG_AUDIO_TRANSFORM,
config.UsecaseDiarization: config.FLAG_DIARIZATION,
}
// extractHFRepo tries to find a HuggingFace repo ID from model overrides or URLs.
func extractHFRepo(overrides map[string]any, urls []string) string {
if overrides != nil {
if params, ok := overrides["parameters"].(map[string]any); ok {
if modelRef, ok := params["model"].(string); ok {
if repoID, ok := vram.ExtractHFRepoID(modelRef); ok {
return repoID
}
}
}
}
for _, u := range urls {
if repoID, ok := vram.ExtractHFRepoID(u); ok {
return repoID
}
}
return ""
}
// buildEstimateInput creates a vram.ModelEstimateInput from gallery model metadata.
func buildEstimateInput(m *gallery.GalleryModel) vram.ModelEstimateInput {
var input vram.ModelEstimateInput
input.Size = m.Size
if hfRepoID := extractHFRepo(m.Overrides, m.URLs); hfRepoID != "" {
input.HFRepo = hfRepoID
}
for _, f := range m.AdditionalFiles {
if vram.IsWeightFile(f.URI) {
input.Files = append(input.Files, vram.FileInput{URI: f.URI, Size: 0})
}
}
return input
}
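// Illustrative sketch (hypothetical entry, not part of this change): if
// overrides carries parameters.model = "org/model-GGUF" (a short-form HF repo
// ID), input.HFRepo becomes "org/model-GGUF"; otherwise the first URL that
// vram.ExtractHFRepoID recognizes is used. Weight entries in AdditionalFiles
// become FileInputs with Size 0 so sizes are resolved lazily by the cached
// resolvers.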
// parseContextSizes parses a comma-separated list of context sizes from a query param.
// Returns a default of [8192] if the param is empty or unparseable.
func parseContextSizes(raw string) []uint32 {
if raw == "" {
return []uint32{8192}
}
var sizes []uint32
for _, s := range strings.Split(raw, ",") {
s = strings.TrimSpace(s)
if v, err := strconv.ParseUint(s, 10, 32); err == nil && v > 0 {
sizes = append(sizes, uint32(v))
}
}
if len(sizes) == 0 {
return []uint32{8192}
}
return sizes
}
// getDirectorySize calculates the total size of files in a directory
// metaParentOf returns the name of the auto-resolving (meta) backend that
// declares `name` as one of its hardware-specific variants in its
@@ -260,7 +331,7 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
items = "9"
}
models, err := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.SystemState)
models, err := gallery.AvailableGalleryModelsCached(appConfig.Galleries, appConfig.SystemState)
if err != nil {
xlog.Error("could not list models from galleries", "error", err)
return c.JSON(http.StatusInternalServerError, map[string]any{
@@ -294,8 +365,30 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
}
slices.Sort(backendNames)
// Filter by usecase tags (comma-separated for multi-select).
if tag != "" {
models = gallery.GalleryElements[*gallery.GalleryModel](models).FilterByTag(tag)
var combinedFlag config.ModelConfigUsecase
hasMultimodal := false
var plainTags []string
for _, t := range strings.Split(tag, ",") {
t = strings.TrimSpace(t)
if t == multimodalFilterKey {
hasMultimodal = true
} else if flag, ok := usecaseFilters[t]; ok {
combinedFlag |= flag
} else if t != "" {
plainTags = append(plainTags, t)
}
}
if hasMultimodal {
models = gallery.FilterGalleryModelsByMultimodal(models)
}
if combinedFlag != config.FLAG_ANY {
models = gallery.FilterGalleryModelsByUsecase(models, combinedFlag)
}
for _, pt := range plainTags {
models = gallery.GalleryElements[*gallery.GalleryModel](models).FilterByTag(pt)
}
}
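// Illustrative decomposition (hypothetical query): ?tag=chat,vision,multimodal,gguf
// sets hasMultimodal, folds FLAG_CHAT|FLAG_VISION into combinedFlag, and keeps
// "gguf" as a plain tag — the three filters above then apply in sequence.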
if term != "" {
models = gallery.GalleryElements[*gallery.GalleryModel](models).Search(term)
@@ -355,41 +448,6 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
modelsJSON := make([]map[string]any, 0, len(models))
seenIDs := make(map[string]bool)
weightExts := map[string]bool{".gguf": true, ".safetensors": true, ".bin": true, ".pt": true}
extractHFRepo := func(overrides map[string]any, urls []string) string {
// Try overrides.parameters.model first
if overrides != nil {
if params, ok := overrides["parameters"].(map[string]any); ok {
if modelRef, ok := params["model"].(string); ok {
if repoID, ok := vram.ExtractHFRepoID(modelRef); ok {
return repoID
}
}
}
}
// Fall back to the first HuggingFace URL in the metadata urls list
for _, u := range urls {
if repoID, ok := vram.ExtractHFRepoID(u); ok {
return repoID
}
}
return ""
}
hasWeightFiles := func(files []gallery.File) bool {
for _, f := range files {
ext := strings.ToLower(path.Ext(path.Base(f.URI)))
if weightExts[ext] {
return true
}
}
return false
}
const hfEstimateTimeout = 10 * time.Second
const estimateConcurrency = 3
sem := make(chan struct{}, estimateConcurrency)
var wg sync.WaitGroup
for _, m := range models {
modelID := m.ID()
@@ -431,63 +489,9 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
"backend": m.Backend,
}
// Build EstimateModel input from available metadata
var estimateInput vram.ModelEstimateInput
estimateInput.Options = vram.EstimateOptions{ContextLength: 8192}
estimateInput.Size = m.Size
if hfRepoID := extractHFRepo(m.Overrides, m.URLs); hfRepoID != "" {
estimateInput.HFRepo = hfRepoID
}
if hasWeightFiles(m.AdditionalFiles) {
files := make([]gallery.File, len(m.AdditionalFiles))
copy(files, m.AdditionalFiles)
for _, f := range files {
ext := strings.ToLower(path.Ext(path.Base(f.URI)))
if weightExts[ext] {
estimateInput.Files = append(estimateInput.Files, vram.FileInput{URI: f.URI, Size: 0})
}
}
}
// Run estimation (async for file-based and HF repo, sync for size string only)
needsAsync := len(estimateInput.Files) > 0 || estimateInput.HFRepo != ""
if needsAsync {
input := estimateInput
wg.Go(func() {
sem <- struct{}{}
defer func() { <-sem }()
ctx, cancel := context.WithTimeout(context.Background(), hfEstimateTimeout)
defer cancel()
result, err := vram.EstimateModel(ctx, input)
if err == nil {
if result.SizeBytes > 0 {
obj["estimated_size_bytes"] = result.SizeBytes
obj["estimated_size_display"] = result.SizeDisplay
}
if result.VRAMBytes > 0 {
obj["estimated_vram_bytes"] = result.VRAMBytes
obj["estimated_vram_display"] = result.VRAMDisplay
}
}
})
} else if estimateInput.Size != "" {
result, _ := vram.EstimateModel(context.Background(), estimateInput)
if result.SizeBytes > 0 {
obj["estimated_size_bytes"] = result.SizeBytes
obj["estimated_size_display"] = result.SizeDisplay
}
if result.VRAMBytes > 0 {
obj["estimated_vram_bytes"] = result.VRAMBytes
obj["estimated_vram_display"] = result.VRAMDisplay
}
}
modelsJSON = append(modelsJSON, obj)
}
wg.Wait()
prevPage := pageNum - 1
nextPage := pageNum + 1
if prevPage < 1 {
@@ -639,6 +643,65 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
})
})
// Returns a mapping of backend names to the usecase filter keys they support.
// Used by the gallery frontend to grey out usecase filter buttons when a
// backend is selected.
app.GET("/api/backends/usecases", func(c echo.Context) error {
result := make(map[string][]string, len(config.BackendCapabilities))
for name, cap := range config.BackendCapabilities {
var keys []string
for _, uc := range cap.PossibleUsecases {
if _, ok := usecaseFilters[uc]; ok {
keys = append(keys, uc)
}
}
slices.Sort(keys)
result[name] = keys
}
return c.JSON(200, result)
}, adminMiddleware)
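// Illustrative response shape (hypothetical backend list; keys are sorted):
//
//	{"llama-cpp": ["chat", "embeddings", "rerank", "vision"],
//	 "whisper": ["transcript"]}
//
// Only filter keys known to usecaseFilters appear, so the frontend can match
// them directly against its filter buttons.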
// Returns VRAM/size estimates for a single gallery model at multiple
// context sizes. The frontend calls this per-model so the gallery page
// can load instantly and fill in estimates asynchronously.
// Query params:
// contexts - comma-separated context sizes (default: 8192)
app.GET("/api/models/estimate/:id", func(c echo.Context) error {
modelID, err := url.QueryUnescape(c.Param("id"))
if err != nil {
return c.JSON(http.StatusBadRequest, map[string]any{"error": "invalid model ID"})
}
contextSizes := parseContextSizes(c.QueryParam("contexts"))
// Look up the model from the gallery to build the estimate input.
models, err := gallery.AvailableGalleryModelsCached(appConfig.Galleries, appConfig.SystemState)
if err != nil {
return c.JSON(http.StatusInternalServerError, map[string]any{"error": err.Error()})
}
model := gallery.FindGalleryElement(models, modelID)
if model == nil {
return c.JSON(http.StatusNotFound, map[string]any{"error": "model not found"})
}
input := buildEstimateInput(model)
if len(input.Files) == 0 && input.HFRepo == "" && input.Size == "" {
return c.JSON(200, vram.MultiContextEstimate{})
}
ctx, cancel := context.WithTimeout(c.Request().Context(), 10*time.Second)
defer cancel()
result, err := vram.EstimateModelMultiContext(ctx, input, contextSizes)
if err != nil {
xlog.Debug("model estimate failed", "model", modelID, "error", err)
return c.JSON(200, vram.MultiContextEstimate{})
}
return c.JSON(200, result)
}, adminMiddleware)
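// Illustrative exchange (hypothetical model ID and numbers):
//
//	GET /api/models/estimate/localai%40example-model?contexts=8192,32768
//	=> {"sizeBytes":4000000000,"sizeDisplay":"4.0 GB",
//	    "estimates":{"8192":{...},"32768":{...}},"modelMaxContext":131072}
//
// A missing model returns 404, while estimation failures deliberately return
// an empty MultiContextEstimate with status 200 so the gallery UI shows no
// estimate instead of an error.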
app.POST("/api/models/install/:id", func(c echo.Context) error {
galleryID := c.Param("id")
// URL decode the gallery ID (e.g., "localai%40model" -> "localai@model")
@@ -742,7 +805,7 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
}
xlog.Debug("API job submitted to get config", "galleryID", galleryID)
models, err := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.SystemState)
models, err := gallery.AvailableGalleryModelsCached(appConfig.Galleries, appConfig.SystemState)
if err != nil {
return c.JSON(http.StatusInternalServerError, map[string]any{
"error": err.Error(),
View File
@@ -43,17 +43,16 @@ func EstimateVRAM(ctx context.Context, req VRAMRequest, cl *config.ModelConfigLo
modelsPath := sysState.Model.ModelsPath
var files []vram.FileInput
var firstGGUF string
seen := make(map[string]bool)
for _, f := range cfg.DownloadFiles {
addWeightFile(string(f.URI), modelsPath, &files, &firstGGUF, seen)
addWeightFile(string(f.URI), modelsPath, &files, seen)
}
if cfg.Model != "" {
addWeightFile(cfg.Model, modelsPath, &files, &firstGGUF, seen)
addWeightFile(cfg.Model, modelsPath, &files, seen)
}
if cfg.MMProj != "" {
addWeightFile(cfg.MMProj, modelsPath, &files, &firstGGUF, seen)
addWeightFile(cfg.MMProj, modelsPath, &files, seen)
}
if len(files) == 0 {
@@ -64,39 +63,46 @@ func EstimateVRAM(ctx context.Context, req VRAMRequest, cl *config.ModelConfigLo
}
contextDefaulted := false
opts := vram.EstimateOptions{
ContextLength: req.ContextSize,
GPULayers: req.GPULayers,
KVQuantBits: req.KVQuantBits,
}
if opts.ContextLength == 0 {
ctxLen := req.ContextSize
if ctxLen == 0 {
if cfg.ContextSize != nil {
opts.ContextLength = uint32(*cfg.ContextSize)
ctxLen = uint32(*cfg.ContextSize)
} else {
opts.ContextLength = 8192
ctxLen = 8192
contextDefaulted = true
}
}
opts := vram.EstimateOptions{
GPULayers: req.GPULayers,
KVQuantBits: req.KVQuantBits,
}
subCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
result, err := vram.Estimate(subCtx, files, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
multi, err := vram.EstimateMultiContext(subCtx, files, []uint32{ctxLen}, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
if err != nil {
return nil, fmt.Errorf("vram estimate: %w", err)
}
resp := &VRAMResponse{EstimateResult: result}
at := multi.Estimates[fmt.Sprint(ctxLen)]
resp := &VRAMResponse{
EstimateResult: vram.EstimateResult{
SizeBytes: multi.SizeBytes,
SizeDisplay: multi.SizeDisplay,
ContextLength: at.ContextLength,
VRAMBytes: at.VRAMBytes,
VRAMDisplay: at.VRAMDisplay,
},
ModelMaxContext: multi.ModelMaxContext,
}
if contextDefaulted && firstGGUF != "" {
ggufMeta, err := vram.DefaultCachedGGUFReader().ReadMetadata(subCtx, firstGGUF)
if err == nil && ggufMeta != nil && ggufMeta.MaximumContextLength > 0 {
resp.ModelMaxContext = ggufMeta.MaximumContextLength
resp.ContextNote = fmt.Sprintf(
"Estimate used default context_size=8192. The model's trained maximum context is %d; VRAM usage will be higher at larger context sizes.",
ggufMeta.MaximumContextLength,
)
}
if contextDefaulted && multi.ModelMaxContext > 0 {
resp.ContextNote = fmt.Sprintf(
"Estimate used default context_size=8192. The model's trained maximum context is %d; VRAM usage will be higher at larger context sizes.",
multi.ModelMaxContext,
)
}
return resp, nil
}
@@ -111,8 +117,8 @@ func resolveModelURI(uri, modelsPath string) string {
return "file://" + filepath.Join(modelsPath, uri)
}
// addWeightFile appends a resolved weight file to files and tracks the first GGUF.
func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, firstGGUF *string, seen map[string]bool) {
// addWeightFile appends a resolved weight file to files.
func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, seen map[string]bool) {
if !vram.IsWeightFile(uri) {
return
}
@@ -122,7 +128,4 @@ func addWeightFile(uri, modelsPath string, files *[]vram.FileInput, firstGGUF *s
}
seen[resolved] = true
*files = append(*files, vram.FileInput{URI: resolved, Size: 0})
if *firstGGUF == "" && vram.IsGGUF(uri) {
*firstGGUF = resolved
}
}
View File
@@ -628,10 +628,14 @@ func (r *SmartRouter) estimateModelVRAM(ctx context.Context, opts *pb.ModelOptio
estCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
defer cancel()
ctxSize := uint32(opts.ContextSize)
if ctxSize == 0 {
ctxSize = 8192
}
input := vram.ModelEstimateInput{
Options: vram.EstimateOptions{
ContextLength: uint32(opts.ContextSize),
GPULayers: int(opts.NGPULayers),
GPULayers: int(opts.NGPULayers),
},
}
@@ -649,28 +653,15 @@ func (r *SmartRouter) estimateModelVRAM(ctx context.Context, opts *pb.ModelOptio
}
}
// If model file exists, get its size as fallback
if opts.ModelFile != "" && len(input.Files) == 0 {
if info, err := os.Stat(opts.ModelFile); err == nil {
return vram.EstimateFromSize(uint64(info.Size())).VRAMBytes
}
}
if len(input.Files) == 0 && input.HFRepo == "" && input.Size == "" {
return 0
}
result, err := vram.EstimateModel(estCtx, input)
if err != nil || result.VRAMBytes == 0 {
// Last resort: try model file size
if opts.ModelFile != "" {
if info, statErr := os.Stat(opts.ModelFile); statErr == nil {
return vram.EstimateFromSize(uint64(info.Size())).VRAMBytes
}
}
result, err := vram.EstimateModelMultiContext(estCtx, input, []uint32{ctxSize})
if err != nil {
return 0
}
return result.VRAMBytes
return result.VRAMForContext(ctxSize)
}
// installBackendOnNode sends a NATS backend.install request-reply to the node.
View File
@@ -1,42 +1,42 @@
---
name: "chatml"
config_file: |
backend: "llama-cpp"
mmap: true
template:
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|endoftext|>'
backend: llama-cpp
context_size: 4096
f16: true
known_usecases:
- chat
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
name: chatml
View File
@@ -1,24 +1,24 @@
---
name: "deepseek-r1"
config_file: |
backend: "llama-cpp"
context_size: 131072
mmap: true
f16: true
stopwords:
- <begin▁of▁sentence>
- <end▁of▁sentence>
- <User>
- <Assistant>
template:
chat_message: |
{{if eq .RoleName "system" -}}{{.Content }}
{{ end -}}
{{if eq .RoleName "user" -}}<User>{{.Content}}
{{end -}}
{{if eq .RoleName "assistant" -}}<Assistant>{{.Content}}<end▁of▁sentence>{{end}}
completion: |
{{.Input}}
chat: |
{{.Input -}}<Assistant>
backend: llama-cpp
context_size: 131072
f16: true
known_usecases:
- chat
mmap: true
stopwords:
- <begin▁of▁sentence>
- <end▁of▁sentence>
- <User>
- <Assistant>
template:
chat: |
{{.Input -}}<Assistant>
chat_message: |
{{if eq .RoleName "system" -}}{{.Content }}
{{ end -}}
{{if eq .RoleName "user" -}}<User>{{.Content}}
{{end -}}
{{if eq .RoleName "assistant" -}}<Assistant>{{.Content}}<end▁of▁sentence>{{end}}
completion: |
{{.Input}}
name: deepseek-r1
View File
@@ -1,41 +1,42 @@
---
name: "gemma"
config_file: |
backend: "llama-cpp"
mmap: true
context_size: 8192
template:
chat_message: |-
<start_of_turn>{{if eq .RoleName "assistant" }}model{{else}}{{ .RoleName }}{{end}}
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<end_of_turn>
chat: |
{{.Input }}
<start_of_turn>model
completion: |
{{.Input}}
function: |
<start_of_turn>system
You have access to functions. If you decide to invoke any of the function(s),
you MUST put it in the format of
{"name": function name, "parameters": dictionary of argument name and its value}
backend: llama-cpp
context_size: 8192
known_usecases:
- chat
- completion
mmap: true
stopwords:
- <|im_end|>
- <end_of_turn>
- <start_of_turn>
template:
chat: |
{{.Input }}
<start_of_turn>model
chat_message: |-
<start_of_turn>{{if eq .RoleName "assistant" }}model{{else}}{{ .RoleName }}{{end}}
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<end_of_turn>
completion: |
{{.Input}}
function: |
<start_of_turn>system
You have access to functions. If you decide to invoke any of the function(s),
you MUST put it in the format of
{"name": function name, "parameters": dictionary of argument name and its value}
You SHOULD NOT include any other text in the response if you call a function
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
<end_of_turn>
{{.Input -}}
<start_of_turn>model
stopwords:
- '<|im_end|>'
- '<end_of_turn>'
- '<start_of_turn>'
You SHOULD NOT include any other text in the response if you call a function
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
<end_of_turn>
{{.Input -}}
<start_of_turn>model
name: gemma
View File
@@ -1,48 +1,49 @@
---
name: "granite-3.2"
config_file: |
backend: "llama-cpp"
mmap: true
template:
chat_message: |
<|start_of_role|>{{ .RoleName }}<|end_of_role|>
{{ if .FunctionCall -}}
<tool_call>
{{ else if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}
<|end_of_text|>
function: |
<|start_of_role|>system<|end_of_role|>
You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
backend: llama-cpp
context_size: 8192
f16: true
known_usecases:
- chat
- completion
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|end_of_text|>
template:
chat: |
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
chat_message: |
<|start_of_role|>{{ .RoleName }}<|end_of_role|>
{{ if .FunctionCall -}}
<tool_call>
{{ else if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}
<|end_of_text|>
completion: |
{{.Input}}
function: |
<|start_of_role|>system<|end_of_role|>
You are a helpful AI assistant with access to the following tools. When a tool is required to answer the user's query, respond with <|tool_call|> followed by a JSON list of tools used. If a tool does not exist in the provided list of tools, notify the user that you do not have the ability to fulfill the request.
Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
chat: |
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|end_of_text|>'
Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
{{.Input -}}
<|start_of_role|>assistant<|end_of_role|>
name: granite-3.2
View File
@@ -1,69 +1,69 @@
---
name: "harmony"
config_file: |
mmap: true
backend: "llama-cpp"
template:
chat_message: |-
<|start|>{{ if .FunctionCall -}}functions.{{ .FunctionCall.Name }} to=assistant{{ else if eq .RoleName "assistant"}}assistant<|channel|>final<|message|>{{else}}{{ .RoleName }}{{end}}<|message|>
{{- if .Content -}}
{{- .Content -}}
{{- end -}}
{{- if .FunctionCall -}}
{{- toJson .FunctionCall -}}
{{- end -}}<|end|>
function: |-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: {{ now | date "Mon Jan 2 15:04:05 MST 2006" }}
backend: llama-cpp
context_size: 8192
f16: true
known_usecases:
- chat
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
- <|return|>
template:
chat: |-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: {{ now | date "Mon Jan 2 15:04:05 MST 2006" }}
Reasoning: {{if eq .ReasoningEffort ""}}medium{{else}}{{.ReasoningEffort}}{{end}}
Reasoning: {{if eq .ReasoningEffort ""}}medium{{else}}{{.ReasoningEffort}}{{end}}
# {{with .Metadata}}{{ if ne .system_prompt "" }}{{ .system_prompt }}{{ end }}{{else}}You are a friendly and helpful assistant.{{ end }}<|end|>{{- .Input -}}<|start|>assistant
# {{with .Metadata}}{{ if ne .system_prompt "" }}{{ .system_prompt }}{{ end }}{{else}}You are a friendly and helpful assistant.{{ end }}<|end|>{{- .Input -}}<|start|>assistant
chat_message: |-
<|start|>{{ if .FunctionCall -}}functions.{{ .FunctionCall.Name }} to=assistant{{ else if eq .RoleName "assistant"}}assistant<|channel|>final<|message|>{{else}}{{ .RoleName }}{{end}}<|message|>
{{- if .Content -}}
{{- .Content -}}
{{- end -}}
{{- if .FunctionCall -}}
{{- toJson .FunctionCall -}}
{{- end -}}<|end|>
completion: |
{{.Input}}
function: |-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: {{ now | date "Mon Jan 2 15:04:05 MST 2006" }}
# Tools
Reasoning: {{if eq .ReasoningEffort ""}}medium{{else}}{{.ReasoningEffort}}{{end}}
## functions
# {{with .Metadata}}{{ if ne .system_prompt "" }}{{ .system_prompt }}{{ end }}{{else}}You are a friendly and helpful assistant.{{ end }}<|end|>{{- .Input -}}<|start|>assistant
namespace functions {
{{- range .Functions}}
{{if .Description }}
// {{ .Description }}
{{- end }}
{{- if and .Parameters.Properties (gt (len .Parameters.Properties) 0) }}
type {{ .Name }} = (_: {
{{- range $name, $prop := .Parameters.Properties }}
{{- if $prop.Description }}
// {{ $prop.Description }}
{{- end }}
{{ $name }}: {{ if gt (len $prop.Type) 1 }}{{ range $i, $t := $prop.Type }}{{ if $i }} | {{ end }}{{ $t }}{{ end }}{{ else }}{{ index $prop.Type 0 }}{{ end }},
{{- end }}
}) => any;
{{- else }}
type {{ .Function.Name }} = () => any;
{{- end }}
{{- end }}{{/* end of range .Functions */}}
} // namespace functions
# Tools
# Instructions
## functions
<|end|>{{.Input -}}<|start|>assistant
chat: |-
<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: {{ now | date "Mon Jan 2 15:04:05 MST 2006" }}
namespace functions {
{{- range .Functions}}
{{if .Description }}
// {{ .Description }}
{{- end }}
{{- if and .Parameters.Properties (gt (len .Parameters.Properties) 0) }}
type {{ .Name }} = (_: {
{{- range $name, $prop := .Parameters.Properties }}
{{- if $prop.Description }}
// {{ $prop.Description }}
{{- end }}
{{ $name }}: {{ if gt (len $prop.Type) 1 }}{{ range $i, $t := $prop.Type }}{{ if $i }} | {{ end }}{{ $t }}{{ end }}{{ else }}{{ index $prop.Type 0 }}{{ end }},
{{- end }}
}) => any;
{{- else }}
type {{ .Function.Name }} = () => any;
{{- end }}
{{- end }}{{/* end of range .Functions */}}
} // namespace functions
Reasoning: {{if eq .ReasoningEffort ""}}medium{{else}}{{.ReasoningEffort}}{{end}}
# Instructions
# {{with .Metadata}}{{ if ne .system_prompt "" }}{{ .system_prompt }}{{ end }}{{else}}You are a friendly and helpful assistant.{{ end }}<|end|>{{- .Input -}}<|start|>assistant
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|endoftext|>'
- '<|return|>'
<|end|>{{.Input -}}<|start|>assistant
name: harmony
View File

File diff suppressed because it is too large.

View File
@@ -1,46 +1,46 @@
---
name: "lfm"
config_file: |
backend: "llama-cpp"
mmap: true
template:
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
<|tool_call_start|>
{{ else if eq .RoleName "tool" -}}
<|tool_response_start|>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
<|tool_response_end|>
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.
List of tools: <|tool_list_start|>[
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
]<|tool_list_end|>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|endoftext|>'
backend: llama-cpp
context_size: 4096
f16: true
known_usecases:
- chat
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
<|tool_call_start|>
{{ else if eq .RoleName "tool" -}}
<|tool_response_start|>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
<|tool_response_end|>
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.
List of tools: <|tool_list_start|>[
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
]<|tool_list_end|>
<|im_end|>
{{.Input -}}
<|im_start|>assistant
name: lfm
View File
@@ -1,19 +1,20 @@
---
name: "moondream2"
config_file: |
backend: "llama-cpp"
backend: llama-cpp
context_size: 2046
roles:
user: "\nQuestion: "
system: "\nSystem: "
assistant: "\nAnswer: "
stopwords:
- "Question:"
- "<|endoftext|>"
f16: true
known_usecases:
- chat
roles:
assistant: "\nAnswer: "
system: "\nSystem: "
user: "\nQuestion: "
stopwords:
- 'Question:'
- <|endoftext|>
template:
completion: |
Complete the following sentence: {{.Input}}
chat: "{{.Input}}\nAnswer:\n"
chat: |
{{.Input}}
Answer:
completion: |
Complete the following sentence: {{.Input}}
name: moondream2
View File
@@ -1,16 +1,15 @@
---
name: nanbeige4.1
config_file: |
backend: llama-cpp
function:
grammar:
disable: true
known_usecases:
- chat
options:
- use_jinja:true
parameters:
model: llama-cpp/models/nanbeige4.1-3b-q8_0.gguf
template:
use_tokenizer_template: true
backend: llama-cpp
function:
grammar:
disable: true
known_usecases:
- chat
- completion
options:
- use_jinja:true
parameters:
model: llama-cpp/models/nanbeige4.1-3b-q8_0.gguf
template:
use_tokenizer_template: true
name: nanbeige4.1
View File
@@ -1,9 +1,9 @@
---
name: openvino
config_file: |
backend: transformers
context_size: 8192
type: OVModelForCausalLM
template:
use_tokenizer_template: true
backend: transformers
context_size: 8192
known_usecases:
- embeddings
template:
use_tokenizer_template: true
type: OVModelForCausalLM
name: openvino
View File
@@ -1,46 +1,46 @@
---
name: "qwen3"
config_file: |
parameters:
context_size: 8192
f16: true
mmap: true
backend: "llama-cpp"
template:
chat_message: |
<|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
{{ if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
<tool_call>
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}<|im_end|>
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments: {"name": <function-name>, "arguments": <json-arguments-object>}
<|im_end|>
{{.Input -}}
<|im_start|>assistant
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|endoftext|>'
backend: llama-cpp
known_usecases:
- chat
parameters:
context_size: 8192
f16: true
mmap: true
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{if eq .RoleName "tool" }}user{{else}}{{ .RoleName }}{{end}}
{{ if eq .RoleName "tool" -}}
<tool_response>
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if eq .RoleName "tool" -}}
</tool_response>
{{ end -}}
{{ if .FunctionCall -}}
<tool_call>
{{toJson .FunctionCall}}
</tool_call>
{{ end -}}<|im_end|>
completion: |
{{.Input}}
function: |
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments: {"name": <function-name>, "arguments": <json-arguments-object>}
<|im_end|>
{{.Input -}}
<|im_start|>assistant
name: qwen3
View File
@@ -1,20 +1,21 @@
---
name: smolvlm
# yamllint disable-line rule:trailing-spaces
config_file: |
backend: "llama-cpp"
mmap: true
template:
chat_message: |
{{if eq .RoleName "assistant"}}Assistant{{else if eq .RoleName "system"}}System{{else if eq .RoleName "user"}}User{{end}}: {{.Content }}<end_of_utterance>
chat: "<|im_start|>\n{{.Input -}}\nAssistant: "
completion: |
{{-.Input}}
backend: llama-cpp
f16: true
known_usecases:
- chat
- vision
mmap: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'
- '</s>'
- '<|'
- '<end_of_utterance>'
- '<|endoftext|>'
- <|im_end|>
- <dummy32000>
- </s>
- <|
- <end_of_utterance>
- <|endoftext|>
template:
chat: "<|im_start|>\n{{.Input -}}\nAssistant: "
chat_message: |
{{if eq .RoleName "assistant"}}Assistant{{else if eq .RoleName "system"}}System{{else if eq .RoleName "user"}}User{{end}}: {{.Content }}<end_of_utterance>
completion: |
{{-.Input}}
name: smolvlm
View File
@@ -3,94 +3,93 @@ package vram
import (
"context"
"sync"
"time"
)
const defaultEstimateCacheTTL = 15 * time.Minute
// galleryGenFunc returns the current gallery generation counter.
// When set, cache entries are invalidated when the generation changes.
// When nil (e.g., in tests or non-gallery contexts), entries never expire.
var galleryGenFunc func() uint64
// SetGalleryGenerationFunc wires the gallery generation counter into the
// VRAM caches. Call this once at application startup.
func SetGalleryGenerationFunc(fn func() uint64) {
galleryGenFunc = fn
}
func currentGeneration() uint64 {
if galleryGenFunc != nil {
return galleryGenFunc()
}
return 0
}
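// A minimal wiring sketch (assumed counter; the real one lives in the gallery
// package): any function returning a value that changes whenever gallery data
// refreshes works, e.g. an atomic counter bumped by the refresh path:
//
//	var galleryGen atomic.Uint64 // incremented on every gallery refresh
//	vram.SetGalleryGenerationFunc(galleryGen.Load)
//
// After a bump, every cached entry misses once and is refreshed against the
// new gallery data; between bumps, entries live indefinitely.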
type sizeCacheEntry struct {
size int64
err error
until time.Time
size int64
err error
generation uint64
}
type cachedSizeResolver struct {
underlying SizeResolver
ttl time.Duration
mu sync.Mutex
cache map[string]sizeCacheEntry
}
func (c *cachedSizeResolver) ContentLength(ctx context.Context, uri string) (int64, error) {
gen := currentGeneration()
c.mu.Lock()
e, ok := c.cache[uri]
c.mu.Unlock()
if ok && time.Now().Before(e.until) {
if ok && e.generation == gen {
return e.size, e.err
}
size, err := c.underlying.ContentLength(ctx, uri)
c.mu.Lock()
if c.cache == nil {
c.cache = make(map[string]sizeCacheEntry)
}
c.cache[uri] = sizeCacheEntry{size: size, err: err, until: time.Now().Add(c.ttl)}
c.cache[uri] = sizeCacheEntry{size: size, err: err, generation: gen}
c.mu.Unlock()
return size, err
}
type ggufCacheEntry struct {
meta *GGUFMeta
err error
until time.Time
meta *GGUFMeta
err error
generation uint64
}
type cachedGGUFReader struct {
underlying GGUFMetadataReader
ttl time.Duration
mu sync.Mutex
cache map[string]ggufCacheEntry
}
func (c *cachedGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
gen := currentGeneration()
c.mu.Lock()
e, ok := c.cache[uri]
c.mu.Unlock()
if ok && time.Now().Before(e.until) {
if ok && e.generation == gen {
return e.meta, e.err
}
meta, err := c.underlying.ReadMetadata(ctx, uri)
c.mu.Lock()
if c.cache == nil {
c.cache = make(map[string]ggufCacheEntry)
}
c.cache[uri] = ggufCacheEntry{meta: meta, err: err, until: time.Now().Add(c.ttl)}
c.cache[uri] = ggufCacheEntry{meta: meta, err: err, generation: gen}
c.mu.Unlock()
return meta, err
}
// CachedSizeResolver returns a SizeResolver that caches ContentLength results by URI for the given TTL.
func CachedSizeResolver(underlying SizeResolver, ttl time.Duration) SizeResolver {
return &cachedSizeResolver{underlying: underlying, ttl: ttl, cache: make(map[string]sizeCacheEntry)}
}
// CachedGGUFReader returns a GGUFMetadataReader that caches ReadMetadata results by URI for the given TTL.
func CachedGGUFReader(underlying GGUFMetadataReader, ttl time.Duration) GGUFMetadataReader {
return &cachedGGUFReader{underlying: underlying, ttl: ttl, cache: make(map[string]ggufCacheEntry)}
}
// DefaultCachedSizeResolver returns a cached SizeResolver using the default implementation and default TTL (15 min).
// A single shared cache is used so repeated HEAD requests for the same URI are avoided across requests.
// DefaultCachedSizeResolver returns a cached SizeResolver using the default implementation.
// Entries are invalidated when the gallery generation changes.
func DefaultCachedSizeResolver() SizeResolver {
return defaultCachedSizeResolver
}
// DefaultCachedGGUFReader returns a cached GGUFMetadataReader using the default implementation and default TTL (15 min).
// A single shared cache is used so repeated GGUF metadata fetches for the same URI are avoided across requests.
// DefaultCachedGGUFReader returns a cached GGUFMetadataReader using the default implementation.
// Entries are invalidated when the gallery generation changes.
func DefaultCachedGGUFReader() GGUFMetadataReader {
return defaultCachedGGUFReader
}
var (
defaultCachedSizeResolver = CachedSizeResolver(defaultSizeResolver{}, defaultEstimateCacheTTL)
defaultCachedGGUFReader = CachedGGUFReader(defaultGGUFReader{}, defaultEstimateCacheTTL)
defaultCachedSizeResolver = &cachedSizeResolver{underlying: defaultSizeResolver{}, cache: make(map[string]sizeCacheEntry)}
defaultCachedGGUFReader = &cachedGGUFReader{underlying: defaultGGUFReader{}, cache: make(map[string]ggufCacheEntry)}
)
View File
@@ -23,17 +23,19 @@ func IsGGUF(nameOrURI string) bool {
return strings.ToLower(path.Ext(path.Base(nameOrURI))) == ".gguf"
}
func Estimate(ctx context.Context, files []FileInput, opts EstimateOptions, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) (EstimateResult, error) {
if opts.ContextLength == 0 {
opts.ContextLength = 8192
}
if opts.KVQuantBits == 0 {
opts.KVQuantBits = 16
}
// modelProfile captures the "fixed" properties of a model after I/O.
// Everything except context length is constant for a given model.
type modelProfile struct {
sizeBytes uint64 // total weight file size
ggufSize uint64 // GGUF file size (subset of sizeBytes)
meta *GGUFMeta // nil if no GGUF metadata available
}
var sizeBytes uint64
var ggufSize uint64
// resolveProfile does all I/O: iterates files, fetches sizes and GGUF metadata.
func resolveProfile(ctx context.Context, files []FileInput, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) modelProfile {
var p modelProfile
var firstGGUFURI string
for i := range files {
f := &files[i]
if !IsWeightFile(f.URI) {
@@ -47,23 +49,32 @@ func Estimate(ctx context.Context, files []FileInput, opts EstimateOptions, size
continue
}
}
sizeBytes += uint64(sz)
p.sizeBytes += uint64(sz)
if IsGGUF(f.URI) {
ggufSize += uint64(sz)
p.ggufSize += uint64(sz)
if firstGGUFURI == "" {
firstGGUFURI = f.URI
}
}
}
sizeDisplay := FormatBytes(sizeBytes)
if p.ggufSize > 0 && ggufReader != nil && firstGGUFURI != "" {
p.meta, _ = ggufReader.ReadMetadata(ctx, firstGGUFURI)
}
var vramBytes uint64
if ggufSize > 0 {
var meta *GGUFMeta
if ggufReader != nil && firstGGUFURI != "" {
meta, _ = ggufReader.ReadMetadata(ctx, firstGGUFURI)
}
return p
}
// computeVRAM is pure arithmetic — no I/O. Returns VRAM bytes for a given
// model profile and context length.
func computeVRAM(p modelProfile, ctxLen uint32, opts EstimateOptions) uint64 {
kvQuantBits := opts.KVQuantBits
if kvQuantBits == 0 {
kvQuantBits = 16
}
if p.ggufSize > 0 {
meta := p.meta
if meta != nil && (meta.BlockCount > 0 || meta.EmbeddingLength > 0) {
nLayers := meta.BlockCount
if nLayers == 0 {
@@ -84,36 +95,29 @@ func Estimate(ctx context.Context, files []FileInput, opts EstimateOptions, size
if gpuLayers <= 0 {
gpuLayers = int(nLayers)
}
ctxLen := opts.ContextLength
bKV := uint32(opts.KVQuantBits / 8)
bKV := uint32(kvQuantBits / 8)
if bKV == 0 {
bKV = 4
}
M_model := ggufSize
M_KV := uint64(bKV) * uint64(dModel) * uint64(nLayers) * uint64(ctxLen)
if headCountKV > 0 && meta.HeadCount > 0 {
M_KV = uint64(bKV) * uint64(dModel) * uint64(headCountKV) * uint64(ctxLen)
}
M_model := p.ggufSize
M_KV := uint64(bKV) * uint64(dModel) * uint64(headCountKV) * uint64(ctxLen)
P := M_model * 2
M_overhead := uint64(0.02*float64(P) + 0.15*1e9)
vramBytes = M_model + M_KV + M_overhead
vramBytes := M_model + M_KV + M_overhead
if nLayers > 0 && gpuLayers < int(nLayers) {
layerRatio := float64(gpuLayers) / float64(nLayers)
vramBytes = uint64(layerRatio*float64(M_model)) + M_KV + M_overhead
}
} else {
vramBytes = sizeOnlyVRAM(ggufSize, opts.ContextLength)
return vramBytes
}
} else if sizeBytes > 0 {
vramBytes = sizeOnlyVRAM(sizeBytes, opts.ContextLength)
return sizeOnlyVRAM(p.ggufSize, ctxLen)
}
return EstimateResult{
SizeBytes: sizeBytes,
SizeDisplay: sizeDisplay,
VRAMBytes: vramBytes,
VRAMDisplay: FormatBytes(vramBytes),
}, nil
if p.sizeBytes > 0 {
return sizeOnlyVRAM(p.sizeBytes, ctxLen)
}
return 0
}
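// Worked example of the formula above (illustrative numbers): an 8 GB GGUF
// with dModel=4096, headCountKV=8, full offload, and the default 16-bit KV
// cache (bKV=2) at ctxLen=8192:
//
//	M_KV       = 2 * 4096 * 8 * 8192     ≈ 0.54 GB
//	M_overhead = 0.02*(2*8e9) + 0.15e9   ≈ 0.47 GB
//	VRAM       = 8e9 + M_KV + M_overhead ≈ 9.0 GB
//
// With gpuLayers=16 of nLayers=32, only half of M_model counts: ≈ 5.0 GB.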
func sizeOnlyVRAM(sizeOnDisk uint64, ctxLen uint32) uint64 {
@@ -125,6 +129,45 @@ func sizeOnlyVRAM(sizeOnDisk uint64, ctxLen uint32) uint64 {
return vram
}
// buildEstimates computes VRAMAt entries for each context size from a profile.
func buildEstimates(p modelProfile, contextSizes []uint32, opts EstimateOptions) map[string]VRAMAt {
m := make(map[string]VRAMAt, len(contextSizes))
for _, ctxLen := range contextSizes {
vramBytes := computeVRAM(p, ctxLen, opts)
m[fmt.Sprint(ctxLen)] = VRAMAt{
ContextLength: ctxLen,
VRAMBytes: vramBytes,
VRAMDisplay: FormatBytes(vramBytes),
}
}
return m
}
// EstimateMultiContext estimates model size and VRAM at multiple context sizes.
// It performs I/O once (resolveProfile) then computes VRAM for each context size.
func EstimateMultiContext(ctx context.Context, files []FileInput, contextSizes []uint32,
opts EstimateOptions, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) (MultiContextEstimate, error) {
if len(contextSizes) == 0 {
contextSizes = []uint32{8192}
}
p := resolveProfile(ctx, files, sizeResolver, ggufReader)
result := MultiContextEstimate{
SizeBytes: p.sizeBytes,
SizeDisplay: FormatBytes(p.sizeBytes),
Estimates: buildEstimates(p, contextSizes, opts),
}
if p.meta != nil && p.meta.MaximumContextLength > 0 {
result.ModelMaxContext = p.meta.MaximumContextLength
}
return result, nil
}
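// Usage sketch (hypothetical URI):
//
//	files := []FileInput{{URI: "http://host/model.gguf"}}
//	est, _ := EstimateMultiContext(ctx, files, []uint32{8192, 32768},
//		EstimateOptions{}, DefaultCachedSizeResolver(), DefaultCachedGGUFReader())
//	_ = est.VRAMForContext(32768) // per-context lookup; I/O happened only once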
// ParseSizeString parses a human-readable size string (e.g. "500MB", "14.5 GB", "2tb")
// into bytes. Supports B, KB, MB, GB, TB, PB (case-insensitive, space optional).
// Uses SI units (1 KB = 1000 B).
@@ -136,7 +179,6 @@ func ParseSizeString(s string) (uint64, error) {
s = strings.ToUpper(s)
// Find where the numeric part ends
i := 0
for i < len(s) && (s[i] == '.' || (s[i] >= '0' && s[i] <= '9')) {
i++
@@ -177,17 +219,6 @@ func ParseSizeString(s string) (uint64, error) {
return uint64(num * float64(multiplier)), nil
}
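// For example, under the SI rule above, ParseSizeString("14.5GB") returns
// 14_500_000_000, "500 MB" returns 500_000_000, and "2tb" returns
// 2_000_000_000_000.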
// EstimateFromSize builds an EstimateResult from a raw byte count.
func EstimateFromSize(sizeBytes uint64) EstimateResult {
vramBytes := sizeOnlyVRAM(sizeBytes, 8192)
return EstimateResult{
SizeBytes: sizeBytes,
SizeDisplay: FormatBytes(sizeBytes),
VRAMBytes: vramBytes,
VRAMDisplay: FormatBytes(vramBytes),
}
}
func FormatBytes(n uint64) string {
const unit = 1000
if n < unit {
@@ -216,24 +247,29 @@ func DefaultGGUFReader() GGUFMetadataReader {
}
// ModelEstimateInput describes the inputs for a unified VRAM/size estimation.
// The estimator cascades through available data: files → size string → HF repo → zero.
// The estimator cascades through available data: files -> size string -> HF repo -> zero.
type ModelEstimateInput struct {
Files []FileInput // weight files with optional pre-known sizes
Size string // gallery hardcoded size (e.g. "14.5GB")
HFRepo string // HF repo ID or URL
Options EstimateOptions // context length, GPU layers, KV quant bits
Options EstimateOptions // GPU layers, KV quant bits
}
// EstimateModel provides a unified VRAM estimation entry point.
// EstimateModelMultiContext provides a unified VRAM estimation entry point
// that returns estimates at multiple context sizes.
// It tries (in order):
// 1. Direct file-based estimation (GGUF metadata or file size heuristic)
// 2. ParseSizeString from Size field
// 3. EstimateFromHFRepo
// 3. HuggingFace repo file listing
// 4. Zero result
func EstimateModel(ctx context.Context, input ModelEstimateInput) (EstimateResult, error) {
func EstimateModelMultiContext(ctx context.Context, input ModelEstimateInput, contextSizes []uint32) (MultiContextEstimate, error) {
if len(contextSizes) == 0 {
contextSizes = []uint32{8192}
}
// 1. Try direct file estimation
if len(input.Files) > 0 {
result, err := Estimate(ctx, input.Files, input.Options, DefaultCachedSizeResolver(), DefaultCachedGGUFReader())
result, err := EstimateMultiContext(ctx, input.Files, contextSizes, input.Options, DefaultCachedSizeResolver(), DefaultCachedGGUFReader())
if err != nil {
xlog.Debug("VRAM estimation from files failed", "error", err)
}
@@ -247,7 +283,11 @@ func EstimateModel(ctx context.Context, input ModelEstimateInput) (EstimateResul
if sizeBytes, err := ParseSizeString(input.Size); err != nil {
xlog.Debug("VRAM estimation from size string failed", "error", err, "size", input.Size)
} else if sizeBytes > 0 {
return EstimateFromSize(sizeBytes), nil
return MultiContextEstimate{
SizeBytes: sizeBytes,
SizeDisplay: FormatBytes(sizeBytes),
Estimates: buildEstimates(modelProfile{sizeBytes: sizeBytes}, contextSizes, EstimateOptions{}),
}, nil
}
}
@@ -257,15 +297,19 @@ func EstimateModel(ctx context.Context, input ModelEstimateInput) (EstimateResul
hfRepo = repoID
}
if hfRepo != "" {
result, err := EstimateFromHFRepo(ctx, hfRepo)
totalBytes, err := hfRepoWeightSize(ctx, hfRepo)
if err != nil {
xlog.Debug("VRAM estimation from HF repo failed", "error", err, "repo", hfRepo)
}
if err == nil && result.SizeBytes > 0 {
return result, nil
if err == nil && totalBytes > 0 {
return MultiContextEstimate{
SizeBytes: totalBytes,
SizeDisplay: FormatBytes(totalBytes),
Estimates: buildEstimates(modelProfile{sizeBytes: totalBytes}, contextSizes, EstimateOptions{}),
}, nil
}
}
// 4. No estimation possible
return EstimateResult{}, nil
return MultiContextEstimate{}, nil
}
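// Cascade sketch (hypothetical inputs): with only Size: "14.5GB" set, step 1
// is skipped, step 2 parses 14.5e9 bytes, and size-only estimates are built
// for every requested context; with only HFRepo: "org/model", step 3 sums the
// repo's weight files instead. Note that Options only influences step 1 —
// steps 2 and 3 build a bare modelProfile with default EstimateOptions.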
View File
@@ -23,26 +23,25 @@ func (f fakeGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta
return f[uri], nil
}
var _ = Describe("Estimate", func() {
var _ = Describe("EstimateMultiContext", func() {
ctx := context.Background()
defaultCtx := []uint32{8192}
Describe("empty or non-GGUF inputs", func() {
It("returns zero size and vram for nil files", func() {
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, nil, opts, nil, nil)
res, err := EstimateMultiContext(ctx, nil, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(0)))
Expect(res.VRAMBytes).To(Equal(uint64(0)))
Expect(res.Estimates["8192"].VRAMBytes).To(Equal(uint64(0)))
Expect(res.SizeDisplay).To(Equal("0 B"))
})
It("counts only .gguf files and ignores other extensions", func() {
It("counts only weight files and ignores other extensions", func() {
files := []FileInput{
{URI: "http://a/model.gguf", Size: 1_000_000_000},
{URI: "http://a/readme.txt", Size: 100},
}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(1_000_000_000)))
})
@@ -52,8 +51,7 @@ var _ = Describe("Estimate", func() {
{URI: "http://hf.co/model/model.safetensors", Size: 2_000_000_000},
{URI: "http://hf.co/model/model2.safetensors", Size: 3_000_000_000},
}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(5_000_000_000)))
})
@@ -62,24 +60,22 @@ var _ = Describe("Estimate", func() {
Describe("GGUF size and resolver", func() {
It("uses size resolver when file size is not set", func() {
sizes := fakeSizeResolver{"http://example.com/model.gguf": 1_500_000_000}
opts := EstimateOptions{ContextLength: 8192}
files := []FileInput{{URI: "http://example.com/model.gguf"}}
res, err := Estimate(ctx, files, opts, sizes, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, sizes, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(1_500_000_000)))
Expect(res.VRAMBytes).To(BeNumerically(">=", res.SizeBytes))
Expect(res.Estimates["8192"].VRAMBytes).To(BeNumerically(">=", res.SizeBytes))
Expect(res.SizeDisplay).To(Equal("1.5 GB"))
})
It("uses size-only VRAM formula when metadata is missing and size is large", func() {
sizes := fakeSizeResolver{"http://a/model.gguf": 10_000_000_000}
opts := EstimateOptions{ContextLength: 8192}
files := []FileInput{{URI: "http://a/model.gguf"}}
res, err := Estimate(ctx, files, opts, sizes, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, sizes, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.VRAMBytes).To(BeNumerically(">", 10_000_000_000))
Expect(res.Estimates["8192"].VRAMBytes).To(BeNumerically(">", 10_000_000_000))
})
It("sums size for multiple GGUF shards", func() {
@@ -87,18 +83,16 @@ var _ = Describe("Estimate", func() {
{URI: "http://a/shard1.gguf", Size: 10_000_000_000},
{URI: "http://a/shard2.gguf", Size: 5_000_000_000},
}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(15_000_000_000)))
})
It("formats size display correctly", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 2_500_000_000}}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, nil)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeDisplay).To(Equal("2.5 GB"))
})
@@ -108,24 +102,94 @@ var _ = Describe("Estimate", func() {
It("uses metadata for VRAM when reader returns meta and partial offload", func() {
meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096}
reader := fakeGGUFReader{"http://a/model.gguf": meta}
opts := EstimateOptions{ContextLength: 8192, GPULayers: 20}
opts := EstimateOptions{GPULayers: 20}
files := []FileInput{{URI: "http://a/model.gguf", Size: 8_000_000_000}}
res, err := Estimate(ctx, files, opts, nil, reader)
res, err := EstimateMultiContext(ctx, files, defaultCtx, opts, nil, reader)
Expect(err).ToNot(HaveOccurred())
Expect(res.VRAMBytes).To(BeNumerically(">", 0))
Expect(res.Estimates["8192"].VRAMBytes).To(BeNumerically(">", 0))
})
It("uses metadata head counts for KV and yields vram > size", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 15_000_000_000}}
meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096, HeadCount: 32, HeadCountKV: 8}
reader := fakeGGUFReader{"http://a/model.gguf": meta}
opts := EstimateOptions{ContextLength: 8192}
res, err := Estimate(ctx, files, opts, nil, reader)
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, reader)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(15_000_000_000)))
Expect(res.VRAMBytes).To(BeNumerically(">", res.SizeBytes))
Expect(res.Estimates["8192"].VRAMBytes).To(BeNumerically(">", res.SizeBytes))
})
It("populates ModelMaxContext from GGUF metadata", func() {
meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096, MaximumContextLength: 131072}
reader := fakeGGUFReader{"http://a/model.gguf": meta}
files := []FileInput{{URI: "http://a/model.gguf", Size: 8_000_000_000}}
res, err := EstimateMultiContext(ctx, files, defaultCtx, EstimateOptions{}, nil, reader)
Expect(err).ToNot(HaveOccurred())
Expect(res.ModelMaxContext).To(Equal(uint64(131072)))
})
})
Describe("multi-context behavior", func() {
It("returns estimates for all requested context sizes", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 4_000_000_000}}
sizes := []uint32{8192, 32768, 131072}
res, err := EstimateMultiContext(ctx, files, sizes, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.Estimates).To(HaveLen(3))
Expect(res.Estimates).To(HaveKey("8192"))
Expect(res.Estimates).To(HaveKey("32768"))
Expect(res.Estimates).To(HaveKey("131072"))
})
It("VRAM increases monotonically with context size", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 4_000_000_000}}
meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096, HeadCount: 32, HeadCountKV: 8}
reader := fakeGGUFReader{"http://a/model.gguf": meta}
sizes := []uint32{8192, 16384, 32768, 65536, 131072, 262144}
res, err := EstimateMultiContext(ctx, files, sizes, EstimateOptions{}, nil, reader)
Expect(err).ToNot(HaveOccurred())
prev := uint64(0)
for _, sz := range sizes {
v := res.VRAMForContext(sz)
Expect(v).To(BeNumerically(">", prev), "VRAM should increase at context %d", sz)
prev = v
}
})
It("size is constant across context sizes", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 4_000_000_000}}
sizes := []uint32{8192, 32768}
res, err := EstimateMultiContext(ctx, files, sizes, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.SizeBytes).To(Equal(uint64(4_000_000_000)))
})
It("defaults to [8192] when contextSizes is empty", func() {
files := []FileInput{{URI: "http://a/model.gguf", Size: 4_000_000_000}}
res, err := EstimateMultiContext(ctx, files, nil, EstimateOptions{}, nil, nil)
Expect(err).ToNot(HaveOccurred())
Expect(res.Estimates).To(HaveLen(1))
Expect(res.Estimates).To(HaveKey("8192"))
})
})
Describe("VRAMForContext helper", func() {
It("returns 0 for missing context size", func() {
res := MultiContextEstimate{
Estimates: map[string]VRAMAt{
"8192": {VRAMBytes: 5000},
},
}
Expect(res.VRAMForContext(99999)).To(Equal(uint64(0)))
Expect(res.VRAMForContext(8192)).To(Equal(uint64(5000)))
})
})
})
View File
@@ -4,7 +4,6 @@ import (
"context"
"strings"
"sync"
"time"
hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
)
@@ -15,13 +14,11 @@ var (
)
type hfSizeCacheEntry struct {
result EstimateResult
err error
expiresAt time.Time
totalBytes uint64
err error
generation uint64
}
const hfSizeCacheTTL = 15 * time.Minute
// ExtractHFRepoID extracts a HuggingFace repo ID from a string.
// It handles both short form ("org/model") and full URL form
// ("https://huggingface.co/org/model", "huggingface.co/org/model").
@@ -62,30 +59,31 @@ func ExtractHFRepoID(s string) (string, bool) {
return "", false
}
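// For example (hypothetical repo ID), both "https://huggingface.co/org/model"
// and the short form "org/model" yield ("org/model", true); a string with no
// recognizable repo shape yields ("", false).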
// EstimateFromHFRepo estimates model size by querying the HuggingFace API for file listings.
// Results are cached for 15 minutes.
func EstimateFromHFRepo(ctx context.Context, repoID string) (EstimateResult, error) {
// hfRepoWeightSize returns the total weight file size for a HuggingFace repo.
// Results are cached and invalidated when the gallery generation changes.
func hfRepoWeightSize(ctx context.Context, repoID string) (uint64, error) {
gen := currentGeneration()
hfSizeCacheMu.Lock()
if entry, ok := hfSizeCacheData[repoID]; ok && time.Now().Before(entry.expiresAt) {
if entry, ok := hfSizeCacheData[repoID]; ok && entry.generation == gen {
hfSizeCacheMu.Unlock()
return entry.result, entry.err
return entry.totalBytes, entry.err
}
hfSizeCacheMu.Unlock()
result, err := estimateFromHFRepoUncached(ctx, repoID)
totalBytes, err := hfRepoWeightSizeUncached(ctx, repoID)
hfSizeCacheMu.Lock()
hfSizeCacheData[repoID] = hfSizeCacheEntry{
result: result,
err: err,
expiresAt: time.Now().Add(hfSizeCacheTTL),
totalBytes: totalBytes,
err: err,
generation: gen,
}
hfSizeCacheMu.Unlock()
return result, err
return totalBytes, err
}
func estimateFromHFRepoUncached(ctx context.Context, repoID string) (EstimateResult, error) {
func hfRepoWeightSizeUncached(ctx context.Context, repoID string) (uint64, error) {
client := hfapi.NewClient()
type listResult struct {
@@ -100,17 +98,17 @@ func estimateFromHFRepoUncached(ctx context.Context, repoID string) (EstimateRes
select {
case <-ctx.Done():
return EstimateResult{}, ctx.Err()
return 0, ctx.Err()
case res := <-ch:
if res.err != nil {
return EstimateResult{}, res.err
return 0, res.err
}
return estimateFromFileInfos(res.files), nil
return sumWeightFileBytes(res.files), nil
}
}
func estimateFromFileInfos(files []hfapi.FileInfo) EstimateResult {
var totalSize int64
func sumWeightFileBytes(files []hfapi.FileInfo) uint64 {
var total int64
for _, f := range files {
if f.Type != "file" {
continue
@@ -128,20 +126,10 @@ func estimateFromFileInfos(files []hfapi.FileInfo) EstimateResult {
if f.LFS != nil && f.LFS.Size > 0 {
size = f.LFS.Size
}
totalSize += size
total += size
}
if totalSize <= 0 {
return EstimateResult{}
}
sizeBytes := uint64(totalSize)
vramBytes := sizeOnlyVRAM(sizeBytes, 8192)
return EstimateResult{
SizeBytes: sizeBytes,
SizeDisplay: FormatBytes(sizeBytes),
VRAMBytes: vramBytes,
VRAMDisplay: FormatBytes(vramBytes),
if total < 0 {
return 0
}
return uint64(total)
}
View File
@@ -1,6 +1,9 @@
package vram
import "context"
import (
"context"
"fmt"
)
// FileInput represents a single model file for estimation (URI and optional pre-known size).
type FileInput struct {
@@ -28,16 +31,45 @@ type GGUFMetadataReader interface {
}
// EstimateOptions configures VRAM/size estimation.
// GPULayers and KVQuantBits apply uniformly across all context sizes.
type EstimateOptions struct {
ContextLength uint32
GPULayers int
KVQuantBits int
GPULayers int
KVQuantBits int
}
// EstimateResult holds estimated download size and VRAM with display strings.
type EstimateResult struct {
SizeBytes uint64 `json:"sizeBytes"` // total model weight size in bytes
SizeDisplay string `json:"sizeDisplay"` // human-readable size (e.g. "4.2 GB")
VRAMBytes uint64 `json:"vramBytes"` // estimated VRAM usage in bytes
VRAMDisplay string `json:"vramDisplay"` // human-readable VRAM (e.g. "6.1 GB")
// VRAMAt holds the VRAM estimate at a specific context size.
type VRAMAt struct {
ContextLength uint32 `json:"contextLength"`
VRAMBytes uint64 `json:"vramBytes"`
VRAMDisplay string `json:"vramDisplay"`
}
// EstimateResult is a flat single-context view of an estimate, suitable for
// the REST /api/models/vram-estimate response and the MCP vram_estimate tool.
// It is the legacy shape the LLM and HTTP clients expect (size_bytes /
// size_display / vram_bytes / vram_display).
type EstimateResult struct {
SizeBytes uint64 `json:"size_bytes"`
SizeDisplay string `json:"size_display"`
ContextLength uint32 `json:"context_length,omitempty"`
VRAMBytes uint64 `json:"vram_bytes"`
VRAMDisplay string `json:"vram_display"`
}
// MultiContextEstimate holds VRAM estimates for one or more context sizes,
// computed from a single metadata fetch.
type MultiContextEstimate struct {
SizeBytes uint64 `json:"sizeBytes"`
SizeDisplay string `json:"sizeDisplay"`
Estimates map[string]VRAMAt `json:"estimates"` // keys: context size as string
ModelMaxContext uint64 `json:"modelMaxContext,omitempty"` // from GGUF metadata
}
// VRAMForContext is a convenience method that returns the VRAMBytes for a
// specific context size, or 0 if not present.
func (m MultiContextEstimate) VRAMForContext(ctxLen uint32) uint64 {
if e, ok := m.Estimates[fmt.Sprint(ctxLen)]; ok {
return e.VRAMBytes
}
return 0
}
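// Illustrative JSON shape (assumed values):
//
//	{"sizeBytes": 4000000000, "sizeDisplay": "4.0 GB",
//	 "estimates": {"8192": {"contextLength": 8192, "vramBytes": 5200000000,
//	                        "vramDisplay": "5.2 GB"}},
//	 "modelMaxContext": 131072}
//
// Callers wanting a single number should prefer VRAMForContext over indexing
// Estimates with fmt.Sprint themselves.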