From 3fe175868a7ecd3caaca31a23d118d4ab32459c4 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 4 Jul 2026 22:26:22 +0000 Subject: [PATCH] feat(api): add GET /v1/models/capabilities endpoint Additive superset of /v1/models that enriches each model entry with the capabilities it supports plus its input/output modalities (text / image / audio / video). Clients that only understand /v1/models are unaffected -- they simply never call the new route. Audio and video *input* are derived from the model's multimodal limits (vLLM limit_mm_per_prompt), which no single usecase FLAG expresses. That gap is exactly why a plain capability list is insufficient and this enriched endpoint exists: an attachment router can now decide whether an image/audio/video file can go to the active model directly, or must be converted/transcribed first. Capability derivation lives in core/config as the single source of truth (ModelConfig.Capabilities / InputModalities / OutputModalities / VisionSupported / ...); the Ollama capability surface now delegates to it instead of keeping a parallel copy. Vision is gated on chat/completion capability so a MediaMarker hydrated onto a non-chat model (e.g. a pure ASR/TTS backend) no longer reports a false vision capability. Read-only listing: no new FLAG_* flag, reuses the existing `models` swagger tag, and intentionally exposes no MCP admin tool (there is nothing to manage conversationally). 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- core/config/model_capabilities.go | 197 ++++++++++++++++++ core/config/model_capabilities_test.go | 103 +++++++++ core/http/endpoints/ollama/capabilities.go | 57 +---- core/http/endpoints/openai/list.go | 89 ++++---- .../endpoints/openai/list_capabilities.go | 50 +++++ .../openai/list_capabilities_test.go | 119 +++++++++++ core/http/routes/localai.go | 30 +-- core/http/routes/openai.go | 6 + core/schema/openai.go | 24 +++ docs/content/features/api-discovery.md | 40 ++++ docs/content/whats-new.md | 1 + swagger/docs.go | 62 ++++++ swagger/swagger.json | 62 ++++++ swagger/swagger.yaml | 45 ++++ 14 files changed, 785 insertions(+), 100 deletions(-) create mode 100644 core/config/model_capabilities.go create mode 100644 core/config/model_capabilities_test.go create mode 100644 core/http/endpoints/openai/list_capabilities.go create mode 100644 core/http/endpoints/openai/list_capabilities_test.go diff --git a/core/config/model_capabilities.go b/core/config/model_capabilities.go new file mode 100644 index 000000000..51b244675 --- /dev/null +++ b/core/config/model_capabilities.go @@ -0,0 +1,197 @@ +package config + +// This file is the single source of truth for deriving a model's user-facing +// capabilities and input/output modalities from its ModelConfig. Both the +// OpenAI-compatible /v1/models/capabilities endpoint and the Ollama-compatible +// /api/tags|/api/show surface consume these, so the vocabulary stays consistent +// across clients. Keep the detection heuristics here rather than duplicating +// them per endpoint. + +// VisionSupported reports whether the model can accept image inputs. +// +// We deliberately avoid HasUsecases(FLAG_VISION): GuessUsecases has no +// FLAG_VISION branch and reports true for any chat model, so it would paint +// vision onto text-only models. 
Instead we look for explicit signals: the +// declared KnownUsecases bit, a multimodal projector, or a template/backend +// multimodal marker. +func (c *ModelConfig) VisionSupported() bool { + if c.KnownUsecases != nil && (*c.KnownUsecases&FLAG_VISION) == FLAG_VISION { + return true + } + if c.MMProj != "" { + return true + } + if c.TemplateConfig.Multimodal != "" { + return true + } + if c.MediaMarker != "" { + return true + } + return false +} + +// ToolSupported reports whether the model is wired up for tool / function +// calling. We look for any of the explicit knobs LocalAI uses to drive +// function-call extraction (regex match, response regex, grammar triggers, XML +// format) or the auto-detected tool-format markers the llama.cpp backend +// populates during model load. +func (c *ModelConfig) ToolSupported() bool { + fc := c.FunctionsConfig + if fc.ToolFormatMarkers != nil && fc.ToolFormatMarkers.FormatType != "" { + return true + } + if len(fc.JSONRegexMatch) > 0 || len(fc.ResponseRegex) > 0 { + return true + } + if fc.XMLFormatPreset != "" || fc.XMLFormat != nil { + return true + } + if len(fc.GrammarConfig.GrammarTriggers) > 0 || fc.GrammarConfig.SchemaType != "" { + return true + } + return false +} + +// ThinkingSupported reports whether the model has reasoning / thinking enabled. +// LocalAI sets DisableReasoning=false (or leaves thinking markers configured) +// when the backend probe reports that the model supports thinking. +func (c *ModelConfig) ThinkingSupported() bool { + rc := c.ReasoningConfig + if rc.DisableReasoning != nil && !*rc.DisableReasoning { + return true + } + if len(rc.ThinkingStartTokens) > 0 || len(rc.TagPairs) > 0 { + // Explicit thinking markers imply support unless explicitly disabled. + return rc.DisableReasoning == nil || !*rc.DisableReasoning + } + return false +} + +// AudioInputSupported reports whether a chat/generation model accepts audio as +// input (e.g. vLLM omni models). 
The signal is the vLLM per-prompt audio limit; +// there is no FLAG_* for "chat model that hears audio", which is exactly why a +// plain usecase list can't express it. Transcription models are handled +// separately in InputModalities via FLAG_TRANSCRIPT. +func (c *ModelConfig) AudioInputSupported() bool { + return c.LimitMMPerPrompt.LimitAudioPerPrompt > 0 +} + +// VideoInputSupported reports whether a chat/generation model accepts video as +// input. The signal is the vLLM per-prompt video limit. Note this is distinct +// from FLAG_VIDEO, which denotes video *generation* (diffusers) — an output +// modality, not an input one. +func (c *ModelConfig) VideoInputSupported() bool { + return c.LimitMMPerPrompt.LimitVideoPerPrompt > 0 +} + +// Capabilities returns the ordered list of capability strings the model +// supports, using the canonical usecase vocabulary (chat, vision, transcript, +// tts, embeddings, image, video, ...) plus the modifier capabilities "tools" +// and "thinking". Vision is resolved via VisionSupported (not HasUsecases) to +// avoid the guess-heuristic false positive. +func (c *ModelConfig) Capabilities() []string { + chat := c.HasUsecases(FLAG_CHAT) + completion := c.HasUsecases(FLAG_COMPLETION) + + var caps []string + add := func(cond bool, name string) { + if cond { + caps = append(caps, name) + } + } + + add(chat, UsecaseChat) + add(completion, UsecaseCompletion) + add(c.HasUsecases(FLAG_EDIT), UsecaseEdit) + add(c.HasUsecases(FLAG_EMBEDDINGS), UsecaseEmbeddings) + add(c.HasUsecases(FLAG_RERANK), UsecaseRerank) + // Vision is only meaningful as an image-understanding modifier on a chat/ + // completion model. Gating on (chat||completion) matches the Ollama surface + // and avoids a false positive when config defaults hydrate a MediaMarker on + // a non-chat model (e.g. a pure ASR/TTS backend). + add((chat || completion) && c.VisionSupported(), UsecaseVision) + // tools/thinking are modifiers on the chat/completion surface. 
+ add((chat || completion) && c.ToolSupported(), "tools") + add((chat || completion) && c.ThinkingSupported(), "thinking") + add(c.HasUsecases(FLAG_TRANSCRIPT), UsecaseTranscript) + add(c.HasUsecases(FLAG_TTS), UsecaseTTS) + add(c.HasUsecases(FLAG_SOUND_GENERATION), UsecaseSoundGeneration) + add(c.HasUsecases(FLAG_IMAGE), UsecaseImage) + add(c.HasUsecases(FLAG_VIDEO), UsecaseVideo) + add(c.HasUsecases(FLAG_VAD), UsecaseVAD) + add(c.HasUsecases(FLAG_DETECTION), UsecaseDetection) + add(c.HasUsecases(FLAG_DEPTH), UsecaseDepth) + add(c.HasUsecases(FLAG_AUDIO_TRANSFORM), UsecaseAudioTransform) + add(c.HasUsecases(FLAG_DIARIZATION), UsecaseDiarization) + add(c.HasUsecases(FLAG_SOUND_CLASSIFICATION), UsecaseSoundClassification) + add(c.HasUsecases(FLAG_REALTIME_AUDIO), UsecaseRealtimeAudio) + add(c.HasUsecases(FLAG_FACE_RECOGNITION), UsecaseFaceRecognition) + add(c.HasUsecases(FLAG_SPEAKER_RECOGNITION), UsecaseSpeakerRecognition) + return caps +} + +// InputModalities returns the set of modalities (text, image, audio, video) the +// model accepts as input, ordered text→image→audio→video. This is what an +// attachment router consults to decide whether an image/audio/video file can be +// handed to the active model directly. +func (c *ModelConfig) InputModalities() []string { + imageGen := c.HasUsecases(FLAG_IMAGE) + videoGen := c.HasUsecases(FLAG_VIDEO) + chatish := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION) + + textIn := chatish || c.HasUsecases(FLAG_EDIT) || + c.HasUsecases(FLAG_EMBEDDINGS) || c.HasUsecases(FLAG_RERANK) || c.HasUsecases(FLAG_TOKENIZE) || + c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) || imageGen || videoGen + + // Image input via a chat model requires vision (gated on chat, like the + // Ollama surface); detection/depth/face models consume images directly. 
+ imageIn := (chatish && c.VisionSupported()) || c.LimitMMPerPrompt.LimitImagePerPrompt > 0 || + c.HasUsecases(FLAG_DETECTION) || c.HasUsecases(FLAG_DEPTH) || c.HasUsecases(FLAG_FACE_RECOGNITION) + + audioIn := c.AudioInputSupported() || c.HasUsecases(FLAG_TRANSCRIPT) || c.HasUsecases(FLAG_AUDIO_TRANSFORM) || + c.HasUsecases(FLAG_REALTIME_AUDIO) || c.HasUsecases(FLAG_VAD) || c.HasUsecases(FLAG_DIARIZATION) || + c.HasUsecases(FLAG_SOUND_CLASSIFICATION) || c.HasUsecases(FLAG_SPEAKER_RECOGNITION) + + videoIn := c.VideoInputSupported() + + var mods []string + if textIn { + mods = append(mods, "text") + } + if imageIn { + mods = append(mods, "image") + } + if audioIn { + mods = append(mods, "audio") + } + if videoIn { + mods = append(mods, "video") + } + return mods +} + +// OutputModalities returns the set of modalities (text, image, audio, video) +// the model produces, ordered text→image→audio→video. +func (c *ModelConfig) OutputModalities() []string { + textOut := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION) || c.HasUsecases(FLAG_EDIT) || + c.HasUsecases(FLAG_TRANSCRIPT) + imageOut := c.HasUsecases(FLAG_IMAGE) + audioOut := c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) || + c.HasUsecases(FLAG_AUDIO_TRANSFORM) || c.HasUsecases(FLAG_REALTIME_AUDIO) + videoOut := c.HasUsecases(FLAG_VIDEO) + + var mods []string + if textOut { + mods = append(mods, "text") + } + if imageOut { + mods = append(mods, "image") + } + if audioOut { + mods = append(mods, "audio") + } + if videoOut { + mods = append(mods, "video") + } + return mods +} diff --git a/core/config/model_capabilities_test.go b/core/config/model_capabilities_test.go new file mode 100644 index 000000000..8aab180b2 --- /dev/null +++ b/core/config/model_capabilities_test.go @@ -0,0 +1,103 @@ +package config + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +func usecaseBits(flags ModelConfigUsecase) *ModelConfigUsecase { + return &flags +} + +var _ = Describe("Model capabilities derivation", func() { + Describe("VisionSupported", func() { + It("is false for a plain text chat model", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"} + Expect(cfg.VisionSupported()).To(BeFalse()) + }) + + It("is true when the FLAG_VISION bit is declared", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"} + Expect(cfg.VisionSupported()).To(BeTrue()) + }) + + It("is true when an mmproj projector is set", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"} + cfg.MMProj = "mmproj.gguf" // promoted field from the embedded options struct + Expect(cfg.VisionSupported()).To(BeTrue()) + }) + + It("does not fall for the GuessUsecases FLAG_VISION false positive", func() { + // A chat model with a chat template would make HasUsecases(FLAG_VISION) + // return true via the guess heuristic; VisionSupported must not. 
+ cfg := &ModelConfig{Backend: "llama.cpp"} + cfg.TemplateConfig.Chat = "{{.Input}}" + Expect(cfg.VisionSupported()).To(BeFalse()) + }) + }) + + Describe("AudioInputSupported / VideoInputSupported", func() { + It("detects vLLM omni audio input via limit_mm_per_prompt", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"} + cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1 + Expect(cfg.AudioInputSupported()).To(BeTrue()) + Expect(cfg.VideoInputSupported()).To(BeFalse()) + }) + + It("detects vLLM omni video input via limit_mm_per_prompt", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"} + cfg.LimitMMPerPrompt.LimitVideoPerPrompt = 2 + Expect(cfg.VideoInputSupported()).To(BeTrue()) + }) + }) + + Describe("Capabilities + modalities", func() { + It("a text-only chat model exposes chat and text-only modalities", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"} + Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat)) + Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseVision)) + Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseTranscript)) + Expect(cfg.InputModalities()).To(Equal([]string{"text"})) + Expect(cfg.OutputModalities()).To(Equal([]string{"text"})) + }) + + It("a vision chat model accepts text+image input", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"} + Expect(cfg.Capabilities()).To(ContainElements(UsecaseChat, UsecaseVision)) + Expect(cfg.InputModalities()).To(Equal([]string{"text", "image"})) + Expect(cfg.OutputModalities()).To(Equal([]string{"text"})) + }) + + It("an omni chat model accepts text+audio input without an audio capability flag", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"} + cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1 + // audio-in is a modality, not a usecase string — this is exactly the + // case a plain capability list 
cannot express. + Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat)) + Expect(cfg.InputModalities()).To(Equal([]string{"text", "audio"})) + }) + + It("a transcription model reads audio and writes text", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TRANSCRIPT), Backend: "parakeet-cpp"} + Expect(cfg.Capabilities()).To(Equal([]string{UsecaseTranscript})) + Expect(cfg.InputModalities()).To(Equal([]string{"audio"})) + Expect(cfg.OutputModalities()).To(Equal([]string{"text"})) + }) + + It("an image-generation model reads text and writes an image", func() { + // stablediffusion-ggml is image-only; plain "stablediffusion" is also + // in GuessUsecases' video-backend list, so it would report video too. + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_IMAGE), Backend: "stablediffusion-ggml"} + Expect(cfg.Capabilities()).To(Equal([]string{UsecaseImage})) + Expect(cfg.InputModalities()).To(Equal([]string{"text"})) + Expect(cfg.OutputModalities()).To(Equal([]string{"image"})) + }) + + It("a TTS model reads text and writes audio", func() { + cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TTS), Backend: "piper"} + Expect(cfg.Capabilities()).To(ContainElement(UsecaseTTS)) + Expect(cfg.InputModalities()).To(Equal([]string{"text"})) + Expect(cfg.OutputModalities()).To(Equal([]string{"audio"})) + }) + }) +}) diff --git a/core/http/endpoints/ollama/capabilities.go b/core/http/endpoints/ollama/capabilities.go index 96c24651d..ae952362f 100644 --- a/core/http/endpoints/ollama/capabilities.go +++ b/core/http/endpoints/ollama/capabilities.go @@ -49,62 +49,23 @@ func modelCapabilities(cfg *config.ModelConfig) []string { return caps } -// hasVisionSupport reports whether the model can accept image inputs. We avoid -// cfg.HasUsecases(FLAG_VISION) because GuessUsecases has no FLAG_VISION case -// and returns true for any chat model — see core/config/model_config.go. 
Instead -// we look for explicit signals: KnownUsecases bit, multimodal projector, or -// template/backend-reported multimodal markers. +// hasVisionSupport reports whether the model can accept image inputs. +// The detection heuristic is the canonical config.ModelConfig.VisionSupported — +// kept as a thin wrapper here so the Ollama capability mapping reads cleanly. func hasVisionSupport(cfg *config.ModelConfig) bool { - if cfg.KnownUsecases != nil && (*cfg.KnownUsecases&config.FLAG_VISION) == config.FLAG_VISION { - return true - } - if cfg.MMProj != "" { - return true - } - if cfg.TemplateConfig.Multimodal != "" { - return true - } - if cfg.MediaMarker != "" { - return true - } - return false + return cfg.VisionSupported() } -// hasToolSupport reports whether the model is wired up for tool / function calling. -// We look for any of the explicit configuration knobs LocalAI uses to drive -// function-call extraction (regex match, response regex, grammar triggers, XML -// format) or for the auto-detected tool-format markers populated by the -// llama.cpp backend during model load. +// hasToolSupport reports whether the model is wired up for tool / function +// calling. Delegates to the canonical config.ModelConfig.ToolSupported. func hasToolSupport(cfg *config.ModelConfig) bool { - fc := cfg.FunctionsConfig - if fc.ToolFormatMarkers != nil && fc.ToolFormatMarkers.FormatType != "" { - return true - } - if len(fc.JSONRegexMatch) > 0 || len(fc.ResponseRegex) > 0 { - return true - } - if fc.XMLFormatPreset != "" || fc.XMLFormat != nil { - return true - } - if len(fc.GrammarConfig.GrammarTriggers) > 0 || fc.GrammarConfig.SchemaType != "" { - return true - } - return false + return cfg.ToolSupported() } // hasThinkingSupport reports whether the model has reasoning / thinking enabled. -// LocalAI sets DisableReasoning=false (or leaves thinking markers configured) -// when the backend probe reports that the model supports thinking. 
+// Delegates to the canonical config.ModelConfig.ThinkingSupported. func hasThinkingSupport(cfg *config.ModelConfig) bool { - rc := cfg.ReasoningConfig - if rc.DisableReasoning != nil && !*rc.DisableReasoning { - return true - } - if len(rc.ThinkingStartTokens) > 0 || len(rc.TagPairs) > 0 { - // Explicit thinking markers imply support unless explicitly disabled. - return rc.DisableReasoning == nil || !*rc.DisableReasoning - } - return false + return cfg.ThinkingSupported() } // quantRegex matches GGUF-style quantization suffixes (Q4_K_M, Q8_0, IQ3_XS, F16, ...). diff --git a/core/http/endpoints/openai/list.go b/core/http/endpoints/openai/list.go index f2535f2d4..dca5a3ce0 100644 --- a/core/http/endpoints/openai/list.go +++ b/core/http/endpoints/openai/list.go @@ -21,48 +21,11 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap authDB = db[0] } return func(c echo.Context) error { - // If blank, no filter is applied. - filter := c.QueryParam("filter") - - // By default, exclude any loose files that are already referenced by a configuration file. - var policy galleryop.LooseFilePolicy - excludeConfigured := c.QueryParam("excludeConfigured") - if excludeConfigured == "" || excludeConfigured == "true" { - policy = galleryop.SKIP_IF_CONFIGURED - } else { - policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user? 
- } - - filterFn, err := config.BuildNameFilterFn(filter) + modelNames, err := listVisibleModelNames(c, bcl, ml, authDB) if err != nil { return err } - modelNames, err := galleryop.ListModels(bcl, ml, filterFn, policy) - if err != nil { - return err - } - - // Filter models by user's allowlist if auth is enabled - if authDB != nil { - if user := auth.GetUser(c); user != nil && user.Role != auth.RoleAdmin { - perm, err := auth.GetCachedUserPermissions(c, authDB, user.ID) - if err == nil && perm.AllowedModels.Enabled { - allowed := map[string]bool{} - for _, m := range perm.AllowedModels.Models { - allowed[m] = true - } - filtered := make([]string, 0, len(modelNames)) - for _, m := range modelNames { - if allowed[m] { - filtered = append(filtered, m) - } - } - modelNames = filtered - } - } - } - // Map from a slice of names to a slice of OpenAIModel response objects dataModels := []schema.OpenAIModel{} for _, m := range modelNames { @@ -75,3 +38,53 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap }) } } + +// listVisibleModelNames resolves the model names visible to the caller, applying +// the same query filters (filter, excludeConfigured) and per-user allowlist as +// the OpenAI models listing. Shared by ListModelsEndpoint and +// ListModelCapabilitiesEndpoint so both stay consistent. +func listVisibleModelNames(c echo.Context, bcl *config.ModelConfigLoader, ml *model.ModelLoader, authDB *gorm.DB) ([]string, error) { + // If blank, no filter is applied. + filter := c.QueryParam("filter") + + // By default, exclude any loose files that are already referenced by a configuration file. + var policy galleryop.LooseFilePolicy + excludeConfigured := c.QueryParam("excludeConfigured") + if excludeConfigured == "" || excludeConfigured == "true" { + policy = galleryop.SKIP_IF_CONFIGURED + } else { + policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user? 
+ } + + filterFn, err := config.BuildNameFilterFn(filter) + if err != nil { + return nil, err + } + + modelNames, err := galleryop.ListModels(bcl, ml, filterFn, policy) + if err != nil { + return nil, err + } + + // Filter models by user's allowlist if auth is enabled + if authDB != nil { + if user := auth.GetUser(c); user != nil && user.Role != auth.RoleAdmin { + perm, err := auth.GetCachedUserPermissions(c, authDB, user.ID) + if err == nil && perm.AllowedModels.Enabled { + allowed := map[string]bool{} + for _, m := range perm.AllowedModels.Models { + allowed[m] = true + } + filtered := make([]string, 0, len(modelNames)) + for _, m := range modelNames { + if allowed[m] { + filtered = append(filtered, m) + } + } + modelNames = filtered + } + } + } + + return modelNames, nil +} diff --git a/core/http/endpoints/openai/list_capabilities.go b/core/http/endpoints/openai/list_capabilities.go new file mode 100644 index 000000000..386f53e85 --- /dev/null +++ b/core/http/endpoints/openai/list_capabilities.go @@ -0,0 +1,50 @@ +package openai + +import ( + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + model "github.com/mudler/LocalAI/pkg/model" + "gorm.io/gorm" +) + +// ListModelCapabilitiesEndpoint is a LocalAI-specific extension of the OpenAI +// models listing. It returns the same set of models as /v1/models but enriches +// each entry with the capabilities and input/output modalities the model +// supports, so clients can decide whether an image/audio/video attachment can be +// handed to a given model directly (or must be converted/transcribed first). +// +// It is purely additive: clients that don't know about it keep using /v1/models +// and see no change. +// @Summary List available models enriched with capabilities and input/output modalities. 
+// @Tags models +// @Success 200 {object} schema.ModelCapabilitiesResponse "Response" +// @Router /v1/models/capabilities [get] +func ListModelCapabilitiesEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, db ...*gorm.DB) echo.HandlerFunc { + var authDB *gorm.DB + if len(db) > 0 { + authDB = db[0] + } + return func(c echo.Context) error { + modelNames, err := listVisibleModelNames(c, bcl, ml, authDB) + if err != nil { + return err + } + + dataModels := []schema.ModelCapabilities{} + for _, m := range modelNames { + entry := schema.ModelCapabilities{ID: m, Object: "model"} + if cfg, ok := bcl.GetModelConfig(m); ok { + entry.Capabilities = cfg.Capabilities() + entry.InputModalities = cfg.InputModalities() + entry.OutputModalities = cfg.OutputModalities() + } + dataModels = append(dataModels, entry) + } + + return c.JSON(200, schema.ModelCapabilitiesResponse{ + Object: "list", + Data: dataModels, + }) + } +} diff --git a/core/http/endpoints/openai/list_capabilities_test.go b/core/http/endpoints/openai/list_capabilities_test.go new file mode 100644 index 000000000..b1cc8a1bf --- /dev/null +++ b/core/http/endpoints/openai/list_capabilities_test.go @@ -0,0 +1,119 @@ +package openai + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("ListModelCapabilitiesEndpoint", func() { + var ( + e *echo.Echo + tmpDir string + bcl *config.ModelConfigLoader + ml *model.ModelLoader + appConf *config.ApplicationConfig + ) + + BeforeEach(func() { + var err error + e = echo.New() + tmpDir, err = os.MkdirTemp("", "models-caps-test-*") + Expect(err).NotTo(HaveOccurred()) + + st, err := system.GetSystemState(system.WithModelPath(tmpDir)) + Expect(err).NotTo(HaveOccurred()) + ml = model.NewModelLoader(st) + bcl = config.NewModelConfigLoader(tmpDir) + appConf = config.NewApplicationConfig() + }) + + AfterEach(func() { + _ = os.RemoveAll(tmpDir) + }) + + writeConfig := func(name, yaml string) { + path := filepath.Join(tmpDir, name+".yaml") + Expect(os.WriteFile(path, []byte(yaml), 0o644)).To(Succeed()) + Expect(bcl.ReadModelConfig(path)).To(Succeed()) + } + + // call exercises the endpoint with auth disabled (no auth DB), which is the + // standard deployment path. The per-user allowlist branch is shared verbatim + // with ListModelsEndpoint (listVisibleModelNames) and covered there. 
+ call := func() schema.ModelCapabilitiesResponse { + req := httptest.NewRequest(http.MethodGet, "/v1/models/capabilities", nil) + rec := httptest.NewRecorder() + c := e.NewContext(req, rec) + + handler := ListModelCapabilitiesEndpoint(bcl, ml, appConf) + Expect(handler(c)).To(Succeed()) + Expect(rec.Code).To(Equal(http.StatusOK)) + + var resp schema.ModelCapabilitiesResponse + Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed()) + return resp + } + + entryFor := func(resp schema.ModelCapabilitiesResponse, id string) *schema.ModelCapabilities { + for i := range resp.Data { + if resp.Data[i].ID == id { + return &resp.Data[i] + } + } + return nil + } + + It("returns the list envelope even with no models", func() { + resp := call() + Expect(resp.Object).To(Equal("list")) + }) + + It("enriches a vision chat model with capabilities and image input modality", func() { + writeConfig("vlm", ` +name: vlm +backend: llama-cpp +known_usecases: + - FLAG_CHAT + - FLAG_VISION +template: + chat: "{{ .Input }}" +parameters: + model: qwen2.5-vl-Q4_K_M.gguf +`) + entry := entryFor(call(), "vlm") + Expect(entry).NotTo(BeNil()) + Expect(entry.Object).To(Equal("model")) + Expect(entry.Capabilities).To(ContainElements("chat", "vision")) + Expect(entry.InputModalities).To(ContainElements("text", "image")) + Expect(entry.OutputModalities).To(ContainElement("text")) + }) + + It("marks a parakeet model as an audio-in/text-out transcription model", func() { + writeConfig("parakeet", ` +name: parakeet +backend: parakeet-cpp +known_usecases: + - FLAG_TRANSCRIPT +parameters: + model: parakeet-tdt-0.6b +`) + entry := entryFor(call(), "parakeet") + Expect(entry).NotTo(BeNil()) + Expect(entry.Capabilities).To(ContainElement("transcript")) + Expect(entry.InputModalities).To(Equal([]string{"audio"})) + Expect(entry.OutputModalities).To(Equal([]string{"text"})) + Expect(entry.Capabilities).NotTo(ContainElement("chat")) + }) +}) diff --git a/core/http/routes/localai.go 
b/core/http/routes/localai.go index 5ef94539c..e51ab900d 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -272,25 +272,27 @@ func RegisterLocalAIRoutes(router *echo.Echo, "version": internal.PrintableVersion(), // Flat endpoint list for backwards compatibility "endpoints": map[string]any{ - "models": "/v1/models", - "chat_completions": "/v1/chat/completions", - "completions": "/v1/completions", - "embeddings": "/v1/embeddings", - "config_metadata": "/api/models/config-metadata", - "config_json": "/api/models/config-json/:name", - "config_patch": "/api/models/config-json/:name", - "autocomplete": "/api/models/config-metadata/autocomplete/:provider", - "vram_estimate": "/api/models/vram-estimate", - "tts": "/tts", - "transcription": "/v1/audio/transcriptions", - "image_generation": "/v1/images/generations", - "swagger": "/swagger/index.html", - "instructions": "/api/instructions", + "models": "/v1/models", + "models_capabilities": "/v1/models/capabilities", + "chat_completions": "/v1/chat/completions", + "completions": "/v1/completions", + "embeddings": "/v1/embeddings", + "config_metadata": "/api/models/config-metadata", + "config_json": "/api/models/config-json/:name", + "config_patch": "/api/models/config-json/:name", + "autocomplete": "/api/models/config-metadata/autocomplete/:provider", + "vram_estimate": "/api/models/vram-estimate", + "tts": "/tts", + "transcription": "/v1/audio/transcriptions", + "image_generation": "/v1/images/generations", + "swagger": "/swagger/index.html", + "instructions": "/api/instructions", }, // Categorized endpoint groups for structured discovery "endpoint_groups": map[string]any{ "openai_compatible": map[string]string{ "models": "/v1/models", + "models_capabilities": "/v1/models/capabilities", "chat_completions": "/v1/chat/completions", "completions": "/v1/completions", "embeddings": "/v1/embeddings", diff --git a/core/http/routes/openai.go b/core/http/routes/openai.go index 32603f567..e302ec5a5 100644 
--- a/core/http/routes/openai.go +++ b/core/http/routes/openai.go @@ -257,4 +257,10 @@ func RegisterOpenAIRoutes(app *echo.Echo, // List models app.GET("/v1/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB())) app.GET("/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB())) + + // List models enriched with capabilities + input/output modalities + // (LocalAI-specific, additive superset of /v1/models). + capabilitiesHandler := openai.ListModelCapabilitiesEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB()) + app.GET("/v1/models/capabilities", capabilitiesHandler) + app.GET("/models/capabilities", capabilitiesHandler) } diff --git a/core/schema/openai.go b/core/schema/openai.go index 897dcbb97..752a27853 100644 --- a/core/schema/openai.go +++ b/core/schema/openai.go @@ -251,3 +251,27 @@ type ModelsDataResponse struct { Object string `json:"object"` Data []OpenAIModel `json:"data"` } + +// ModelCapabilities is a strict superset of OpenAIModel that additionally +// describes what a model can do and which modalities it accepts/produces. It is +// served by the LocalAI-specific /v1/models/capabilities endpoint so clients can +// route attachments (image/audio/video) to a model only when it can handle them. +type ModelCapabilities struct { + ID string `json:"id"` + Object string `json:"object"` + // Capabilities are canonical usecase strings (e.g. chat, vision, transcript, + // tts, embeddings, image, video) plus the modifiers "tools" and "thinking". + Capabilities []string `json:"capabilities"` + // InputModalities is the subset of {text,image,audio,video} the model accepts. + InputModalities []string `json:"input_modalities"` + // OutputModalities is the subset of {text,image,audio,video} the model produces. 
+ OutputModalities []string `json:"output_modalities"` +} + +// ModelCapabilitiesResponse is the envelope returned by /v1/models/capabilities. +// It mirrors ModelsDataResponse so a client can treat it as an enriched +// drop-in for /v1/models. +type ModelCapabilitiesResponse struct { + Object string `json:"object"` + Data []ModelCapabilities `json:"data"` +} diff --git a/docs/content/features/api-discovery.md b/docs/content/features/api-discovery.md index c6ff4b6b9..cbcf636b3 100644 --- a/docs/content/features/api-discovery.md +++ b/docs/content/features/api-discovery.md @@ -36,6 +36,7 @@ Returns the instance version, all available endpoint URLs (flat and categorized) "endpoints": { "chat_completions": "/v1/chat/completions", "models": "/v1/models", + "models_capabilities": "/v1/models/capabilities", "config_metadata": "/api/models/config-metadata", "instructions": "/api/instructions", "swagger": "/swagger/index.html" @@ -123,6 +124,45 @@ Add `?format=json` to get a raw **OpenAPI fragment** (filtered Swagger spec with curl http://localhost:8080/api/instructions/config-management?format=json ``` +## Model Capabilities + +`GET /v1/models/capabilities` + +An additive, LocalAI-specific superset of `/v1/models`. It returns the same set of models but enriches each entry with the **capabilities** the model supports and the **input/output modalities** it accepts and produces. Use it to decide, before sending a request, whether a given model can take an image, audio, or video attachment directly — or whether the input needs converting/transcribing first. + +Because it is purely additive, clients that only understand `/v1/models` keep working unchanged; they simply never call this route. 
+ +```bash +curl http://localhost:8080/v1/models/capabilities +``` + +```json +{ + "object": "list", + "data": [ + { + "id": "qwen2.5-omni", + "object": "model", + "capabilities": ["chat", "vision", "tools"], + "input_modalities": ["text", "image", "audio"], + "output_modalities": ["text"] + }, + { + "id": "parakeet", + "object": "model", + "capabilities": ["transcript"], + "input_modalities": ["audio"], + "output_modalities": ["text"] + } + ] +} +``` + +- **`capabilities`** — canonical usecase strings (e.g. `chat`, `vision`, `transcript`, `tts`, `embeddings`, `image`, `video`) plus the modifiers `tools` and `thinking`. +- **`input_modalities` / `output_modalities`** — the subsets of `{text, image, audio, video}` the model accepts and produces. Audio and video *input* are derived from the model's multimodal limits (e.g. vLLM `limit_mm_per_prompt`), which no single usecase flag expresses; that gap is why this endpoint exists alongside the plain listing. + +The same query parameters as `/v1/models` are honored (`filter`, `excludeConfigured`), and the same per-user model allowlist is applied when authentication is enabled. + ## Configuration Management APIs These endpoints let agents discover model configuration fields, read current settings, modify them, and estimate VRAM usage. diff --git a/docs/content/whats-new.md b/docs/content/whats-new.md index 6ff7979cc..0987afa2e 100644 --- a/docs/content/whats-new.md +++ b/docs/content/whats-new.md @@ -17,6 +17,7 @@ You can see the release notes [here](https://github.com/mudler/LocalAI/releases) - **May 2026**: [Speaker diarization](/features/audio-diarization/) — new `/v1/audio/diarization` endpoint returning "who spoke when" segments. Backed by `sherpa-onnx` (pyannote-3.0 + speaker embeddings + clustering) for pure diarization, and `vibevoice-cpp` for diarization bundled with long-form ASR. Supports `json` / `verbose_json` / `rttm` response formats. 
- **June 2026**: [Sound classification](/features/audio-classification/) — new `/v1/audio/classification` endpoint for audio tagging / sound-event classification, returning scored [AudioSet](https://research.google.com/audioset/) labels (baby cry, glass breaking, alarms, ...). Backed by [ced.cpp](https://github.com/mudler/ced.cpp), a 527-class AudioSet tagger ported to ggml. - **June 2026**: [PII analyze / redact API](/features/middleware/#analyze--redact-api) — the PII detection pipeline (NER + restricted-regex pattern tiers) is now a standalone service: `POST /api/pii/analyze` returns detected entity spans and `POST /api/pii/redact` returns the sanitised text (or `400 pii_blocked`), without routing a chat request through the middleware. Events gain an `origin` (`middleware` / `proxy` / `pii_analyze` / `pii_redact`) so `/api/pii/events` can be filtered by source. +- **July 2026**: [Model capabilities endpoint](/features/api-discovery/#model-capabilities) — `GET /v1/models/capabilities`, an additive superset of `/v1/models` that reports each model's `capabilities` plus its `input_modalities` / `output_modalities` (`text` / `image` / `audio` / `video`). Lets clients route image/audio/video attachments to a model only when it can handle them; audio/video *input* is derived from the model's multimodal limits, which no single usecase flag expresses. - **June 2026**: Concurrent scoring and PII NER on llama.cpp — the `Score` (router classifier) and `TokenClassify` (PII NER) primitives now ride llama.cpp's server task queue instead of locking the context, so they run concurrently with chat/completion/embedding traffic and with each other. The `known_usecases` restriction that forced dedicated scorer/NER model configs on llama-cpp is lifted, repeated scoring calls reuse the prompt KV cache across candidates, and scoring inputs are no longer capped by the physical batch size. 
## 2024 Highlights diff --git a/swagger/docs.go b/swagger/docs.go index ec23de1aa..f20644160 100644 --- a/swagger/docs.go +++ b/swagger/docs.go @@ -2728,6 +2728,22 @@ const docTemplate = `{ } } }, + "/v1/models/capabilities": { + "get": { + "tags": [ + "models" + ], + "summary": "List available models enriched with capabilities and input/output modalities.", + "responses": { + "200": { + "description": "Response", + "schema": { + "$ref": "#/definitions/schema.ModelCapabilitiesResponse" + } + } + } + } + }, "/v1/rerank": { "post": { "tags": [ @@ -5182,6 +5198,52 @@ const docTemplate = `{ } } }, + "schema.ModelCapabilities": { + "type": "object", + "properties": { + "capabilities": { + "description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".", + "type": "array", + "items": { + "type": "string" + } + }, + "id": { + "type": "string" + }, + "input_modalities": { + "description": "InputModalities is the subset of {text,image,audio,video} the model accepts.", + "type": "array", + "items": { + "type": "string" + } + }, + "object": { + "type": "string" + }, + "output_modalities": { + "description": "OutputModalities is the subset of {text,image,audio,video} the model produces.", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "schema.ModelCapabilitiesResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.ModelCapabilities" + } + }, + "object": { + "type": "string" + } + } + }, "schema.ModelLoadRequest": { "type": "object", "properties": { diff --git a/swagger/swagger.json b/swagger/swagger.json index 32baa866d..5af1c479b 100644 --- a/swagger/swagger.json +++ b/swagger/swagger.json @@ -2725,6 +2725,22 @@ } } }, + "/v1/models/capabilities": { + "get": { + "tags": [ + "models" + ], + "summary": "List available models enriched with capabilities and input/output modalities.", 
+ "responses": { + "200": { + "description": "Response", + "schema": { + "$ref": "#/definitions/schema.ModelCapabilitiesResponse" + } + } + } + } + }, "/v1/rerank": { "post": { "tags": [ @@ -5179,6 +5195,52 @@ } } }, + "schema.ModelCapabilities": { + "type": "object", + "properties": { + "capabilities": { + "description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".", + "type": "array", + "items": { + "type": "string" + } + }, + "id": { + "type": "string" + }, + "input_modalities": { + "description": "InputModalities is the subset of {text,image,audio,video} the model accepts.", + "type": "array", + "items": { + "type": "string" + } + }, + "object": { + "type": "string" + }, + "output_modalities": { + "description": "OutputModalities is the subset of {text,image,audio,video} the model produces.", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "schema.ModelCapabilitiesResponse": { + "type": "object", + "properties": { + "data": { + "type": "array", + "items": { + "$ref": "#/definitions/schema.ModelCapabilities" + } + }, + "object": { + "type": "string" + } + } + }, "schema.ModelLoadRequest": { "type": "object", "properties": { diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml index ae158410a..a90ef2088 100644 --- a/swagger/swagger.yaml +++ b/swagger/swagger.yaml @@ -1362,6 +1362,41 @@ definitions: $ref: '#/definitions/schema.ToolCall' type: array type: object + schema.ModelCapabilities: + properties: + capabilities: + description: |- + Capabilities are canonical usecase strings (e.g. chat, vision, transcript, + tts, embeddings, image, video) plus the modifiers "tools" and "thinking". + items: + type: string + type: array + id: + type: string + input_modalities: + description: InputModalities is the subset of {text,image,audio,video} the + model accepts. 
+ items: + type: string + type: array + object: + type: string + output_modalities: + description: OutputModalities is the subset of {text,image,audio,video} the + model produces. + items: + type: string + type: array + type: object + schema.ModelCapabilitiesResponse: + properties: + data: + items: + $ref: '#/definitions/schema.ModelCapabilities' + type: array + object: + type: string + type: object schema.ModelLoadRequest: properties: model: @@ -4358,6 +4393,16 @@ paths: summary: List and describe the various models available in the API. tags: - models + /v1/models/capabilities: + get: + responses: + "200": + description: Response + schema: + $ref: '#/definitions/schema.ModelCapabilitiesResponse' + summary: List available models enriched with capabilities and input/output modalities. + tags: + - models /v1/rerank: post: parameters: