feat(api): add GET /v1/models/capabilities endpoint

Additive superset of /v1/models that enriches each model entry with the capabilities it supports plus its input/output modalities (text / image / audio / video). Clients that only understand /v1/models are unaffected -- they simply never call the new route. Audio and video *input* are derived from the model's multimodal limits (vLLM limit_mm_per_prompt), which no single usecase FLAG expresses. That gap is exactly why a plain capability list is insufficient and this enriched endpoint exists: an attachment router can now decide whether an image/audio/video file can go to the active model directly, or must be converted/transcribed first. Capability derivation lives in core/config as the single source of truth (ModelConfig.Capabilities / InputModalities / OutputModalities / VisionSupported / ...); the Ollama capability surface now delegates to it instead of keeping a parallel copy. Vision is gated on chat/completion capability so a MediaMarker hydrated onto a non-chat model (e.g. a pure ASR/TTS backend) no longer reports a false vision capability. Read-only listing: no new FLAG_* flag, reuses the existing `models` swagger tag, and intentionally exposes no MCP admin tool (there is nothing to manage conversationally). Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
fix(backends): enable ROCm/HIP GPU offload for ggml audio backends (#10666) (#10667)
2026-07-04 21:37:02 -04:00 · 2026-07-04 22:26:22 +00:00 · 2026-07-04 09:08:20 +02:00 · 2026-07-04 08:17:02 +02:00 · 2026-07-04 08:16:41 +02:00 · 2026-07-04 08:14:12 +02:00
28 changed files with 945 additions and 129 deletions
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=fdb1db877c526ec90f668eca1b858da5dba85560
+LLAMA_VERSION?=d4cff114c0084f1fbc9b4c62717eca8fb2ae494a
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -36,6 +36,12 @@ else
 	if [ -d "$CURDIR/lib/rocblas/library" ]; then
 		export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
 	fi
+	# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
+	# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
+	# the bundled data or it falls back to slow generic kernels (issue #10660).
+	if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
+		export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
+	fi
 fi

 # If there is a lib/ld.so, use it
--- a/backend/cpp/turboquant/run.sh
+++ b/backend/cpp/turboquant/run.sh
@@ -34,6 +34,12 @@ else
 	if [ -d "$CURDIR/lib/rocblas/library" ]; then
 		export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
 	fi
+	# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
+	# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
+	# the bundled data or it falls back to slow generic kernels (issue #10660).
+	if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
+		export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
+	fi
 fi

 # If there is a lib/ld.so, use it
--- a/backend/go/acestep-cpp/CMakeLists.txt
+++ b/backend/go/acestep-cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ target_include_directories(goacestepcpp PRIVATE ${ACESTEP_DIR}/src ${ACESTEP_DIR
 target_include_directories(goacestepcpp SYSTEM PRIVATE ${ACESTEP_DIR}/ggml/include)

 # Link GPU backends if available (mirrors link_ggml_backends macro)
-foreach(backend blas cuda metal vulkan)
+foreach(backend blas cuda hip metal vulkan)
    if(TARGET ggml-${backend})
        target_link_libraries(goacestepcpp PRIVATE ggml-${backend})
        string(TOUPPER ${backend} BACKEND_UPPER)
--- a/backend/go/acestep-cpp/Makefile
+++ b/backend/go/acestep-cpp/Makefile
@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
+	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=9a26976a8c8cf5af0afcdd04463cf8ba91e96a54
+CRISPASR_VERSION?=f35185b876fc482fcb2053a81a2697936ed5fcc0
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/omnivoice-cpp/CMakeLists.txt
+++ b/backend/go/omnivoice-cpp/CMakeLists.txt
@@ -30,7 +30,7 @@ target_include_directories(gomnivoicecpp PRIVATE ${OMNIVOICE_DIR}/src)
 target_include_directories(gomnivoicecpp SYSTEM PRIVATE ${OMNIVOICE_DIR}/ggml/include)

 # Link GPU backends if the upstream ggml created them.
-foreach(backend blas cuda metal vulkan sycl)
+foreach(backend blas cuda hip metal vulkan sycl)
    if(TARGET ggml-${backend})
        target_link_libraries(gomnivoicecpp PRIVATE ggml-${backend})
        if(backend STREQUAL "cuda")
--- a/backend/go/omnivoice-cpp/Makefile
+++ b/backend/go/omnivoice-cpp/Makefile
@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
+	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/go/qwen3-tts-cpp/CMakeLists.txt
+++ b/backend/go/qwen3-tts-cpp/CMakeLists.txt
@@ -30,7 +30,7 @@ target_include_directories(goqwen3ttscpp PRIVATE ${QWENTTS_DIR}/src)
 target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWENTTS_DIR}/ggml/include)

 # Link GPU backends if the upstream ggml created them.
-foreach(backend blas cuda metal vulkan sycl)
+foreach(backend blas cuda hip metal vulkan sycl)
    if(TARGET ggml-${backend})
        target_link_libraries(goqwen3ttscpp PRIVATE ggml-${backend})
        if(backend STREQUAL "cuda")
--- a/backend/go/qwen3-tts-cpp/Makefile
+++ b/backend/go/qwen3-tts-cpp/Makefile
@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
+	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/backend/go/vibevoice-cpp/CMakeLists.txt
+++ b/backend/go/vibevoice-cpp/CMakeLists.txt
@@ -50,7 +50,7 @@ target_include_directories(govibevoicecpp SYSTEM PRIVATE ${VIBEVOICE_DIR}/third_
 # Link GPU backends if available — vibevoice's own CMake already links
 # these to the libvibevoice STATIC library, but we re-link them on the
 # MODULE so resolved symbols include all backend kernels.
-foreach(backend blas cuda metal vulkan)
+foreach(backend blas cuda hip metal vulkan)
    if(TARGET ggml-${backend})
        target_link_libraries(govibevoicecpp PRIVATE ggml-${backend})
        string(TOUPPER ${backend} BACKEND_UPPER)
--- a/backend/go/vibevoice-cpp/Makefile
+++ b/backend/go/vibevoice-cpp/Makefile
@@ -29,7 +29,14 @@ else ifeq ($(BUILD_TYPE),openblas)
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DVIBEVOICE_GGML_HIPBLAS=ON
+	# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
+	# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
 else ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=ON -DVIBEVOICE_GGML_VULKAN=ON
 else ifeq ($(OS),Darwin)
--- a/core/config/model_capabilities.go
+++ b/core/config/model_capabilities.go
@@ -0,0 +1,197 @@
+package config
+
+// This file is the single source of truth for deriving a model's user-facing
+// capabilities and input/output modalities from its ModelConfig. Both the
+// OpenAI-compatible /v1/models/capabilities endpoint and the Ollama-compatible
+// /api/tags|/api/show surface consume these, so the vocabulary stays consistent
+// across clients. Keep the detection heuristics here rather than duplicating
+// them per endpoint.
+
+// VisionSupported tells whether this model is able to take images as input.
+//
+// HasUsecases(FLAG_VISION) is intentionally not consulted: GuessUsecases has
+// no FLAG_VISION branch and answers true for any chat model, which would mark
+// text-only models as vision-capable. Only explicit signals count here: the
+// FLAG_VISION bit declared in KnownUsecases, a configured multimodal
+// projector (MMProj), a multimodal template, or a media marker.
+func (c *ModelConfig) VisionSupported() bool {
+	declared := c.KnownUsecases != nil && (*c.KnownUsecases&FLAG_VISION) == FLAG_VISION
+	return declared ||
+		c.MMProj != "" ||
+		c.TemplateConfig.Multimodal != "" ||
+		c.MediaMarker != ""
+}
+
+// ToolSupported tells whether the model is configured for tool / function
+// calling. Any one of the following is sufficient: auto-detected tool-format
+// markers (populated by the llama.cpp backend during model load), a JSON or
+// response regex, an XML format or preset, grammar triggers, or a grammar
+// schema type.
+func (c *ModelConfig) ToolSupported() bool {
+	fn := c.FunctionsConfig
+	switch {
+	case fn.ToolFormatMarkers != nil && fn.ToolFormatMarkers.FormatType != "":
+		return true
+	case len(fn.JSONRegexMatch) > 0, len(fn.ResponseRegex) > 0:
+		return true
+	case fn.XMLFormatPreset != "", fn.XMLFormat != nil:
+		return true
+	case len(fn.GrammarConfig.GrammarTriggers) > 0, fn.GrammarConfig.SchemaType != "":
+		return true
+	default:
+		return false
+	}
+}
+
+// ThinkingSupported tells whether reasoning / thinking is enabled for the
+// model. LocalAI sets DisableReasoning=false (or leaves thinking markers
+// configured) when the backend probe reports that the model supports thinking.
+//
+// An explicit DisableReasoning value always decides the outcome; when it is
+// unset, configured thinking start tokens or tag pairs imply support.
+func (c *ModelConfig) ThinkingSupported() bool {
+	reasoning := c.ReasoningConfig
+	if reasoning.DisableReasoning != nil {
+		// Explicit setting wins, with or without markers.
+		return !*reasoning.DisableReasoning
+	}
+	return len(reasoning.ThinkingStartTokens) > 0 || len(reasoning.TagPairs) > 0
+}
+
+// AudioInputSupported tells whether a chat/generation model takes audio as
+// input (e.g. vLLM omni models). The only signal is a positive vLLM per-prompt
+// audio limit; no FLAG_* exists for "chat model that hears audio", which is
+// why a plain usecase list cannot express it. Transcription models are covered
+// separately by InputModalities through FLAG_TRANSCRIPT.
+func (c *ModelConfig) AudioInputSupported() bool {
+	perPrompt := c.LimitMMPerPrompt.LimitAudioPerPrompt
+	return perPrompt > 0
+}
+
+// VideoInputSupported tells whether a chat/generation model takes video as
+// input. The only signal is a positive vLLM per-prompt video limit. This is
+// unrelated to FLAG_VIDEO, which marks video *generation* (diffusers) and is
+// therefore an output modality rather than an input one.
+func (c *ModelConfig) VideoInputSupported() bool {
+	perPrompt := c.LimitMMPerPrompt.LimitVideoPerPrompt
+	return perPrompt > 0
+}
+
+// Capabilities lists, in a fixed order, the capability strings the model
+// supports. The vocabulary is the canonical usecase set (chat, vision,
+// transcript, tts, embeddings, image, video, ...) extended with the modifier
+// capabilities "tools" and "thinking". Vision goes through VisionSupported
+// rather than HasUsecases so the guess heuristic cannot yield a false positive.
+// The result is nil when nothing applies.
+func (c *ModelConfig) Capabilities() []string {
+	isChat := c.HasUsecases(FLAG_CHAT)
+	isCompletion := c.HasUsecases(FLAG_COMPLETION)
+	// vision/tools/thinking are modifiers that only make sense on a chat or
+	// completion model. Gating vision this way matches the Ollama surface and
+	// avoids a false positive when config defaults hydrate a MediaMarker onto
+	// a non-chat model (e.g. a pure ASR/TTS backend).
+	generative := isChat || isCompletion
+
+	// Rows are evaluated and emitted top to bottom; the order is part of the
+	// output contract.
+	table := []struct {
+		enabled bool
+		label   string
+	}{
+		{isChat, UsecaseChat},
+		{isCompletion, UsecaseCompletion},
+		{c.HasUsecases(FLAG_EDIT), UsecaseEdit},
+		{c.HasUsecases(FLAG_EMBEDDINGS), UsecaseEmbeddings},
+		{c.HasUsecases(FLAG_RERANK), UsecaseRerank},
+		{generative && c.VisionSupported(), UsecaseVision},
+		{generative && c.ToolSupported(), "tools"},
+		{generative && c.ThinkingSupported(), "thinking"},
+		{c.HasUsecases(FLAG_TRANSCRIPT), UsecaseTranscript},
+		{c.HasUsecases(FLAG_TTS), UsecaseTTS},
+		{c.HasUsecases(FLAG_SOUND_GENERATION), UsecaseSoundGeneration},
+		{c.HasUsecases(FLAG_IMAGE), UsecaseImage},
+		{c.HasUsecases(FLAG_VIDEO), UsecaseVideo},
+		{c.HasUsecases(FLAG_VAD), UsecaseVAD},
+		{c.HasUsecases(FLAG_DETECTION), UsecaseDetection},
+		{c.HasUsecases(FLAG_DEPTH), UsecaseDepth},
+		{c.HasUsecases(FLAG_AUDIO_TRANSFORM), UsecaseAudioTransform},
+		{c.HasUsecases(FLAG_DIARIZATION), UsecaseDiarization},
+		{c.HasUsecases(FLAG_SOUND_CLASSIFICATION), UsecaseSoundClassification},
+		{c.HasUsecases(FLAG_REALTIME_AUDIO), UsecaseRealtimeAudio},
+		{c.HasUsecases(FLAG_FACE_RECOGNITION), UsecaseFaceRecognition},
+		{c.HasUsecases(FLAG_SPEAKER_RECOGNITION), UsecaseSpeakerRecognition},
+	}
+
+	var supported []string
+	for _, row := range table {
+		if row.enabled {
+			supported = append(supported, row.label)
+		}
+	}
+	return supported
+}
+
+// InputModalities lists the modalities (text, image, audio, video) the model
+// accepts as input, always in the order text→image→audio→video. An attachment
+// router consults this to decide whether an image/audio/video file can be
+// handed to the active model directly. The result is nil when no modality
+// applies.
+func (c *ModelConfig) InputModalities() []string {
+	generatesImage := c.HasUsecases(FLAG_IMAGE)
+	generatesVideo := c.HasUsecases(FLAG_VIDEO)
+	generative := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION)
+
+	acceptsText := generative || c.HasUsecases(FLAG_EDIT) ||
+		c.HasUsecases(FLAG_EMBEDDINGS) || c.HasUsecases(FLAG_RERANK) || c.HasUsecases(FLAG_TOKENIZE) ||
+		c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) || generatesImage || generatesVideo
+
+	// A chat/completion model needs vision to take images (gated the same way
+	// as the Ollama surface); a positive per-prompt image limit and the
+	// detection/depth/face usecases count on their own.
+	acceptsImage := (generative && c.VisionSupported()) || c.LimitMMPerPrompt.LimitImagePerPrompt > 0 ||
+		c.HasUsecases(FLAG_DETECTION) || c.HasUsecases(FLAG_DEPTH) || c.HasUsecases(FLAG_FACE_RECOGNITION)
+
+	acceptsAudio := c.AudioInputSupported() || c.HasUsecases(FLAG_TRANSCRIPT) || c.HasUsecases(FLAG_AUDIO_TRANSFORM) ||
+		c.HasUsecases(FLAG_REALTIME_AUDIO) || c.HasUsecases(FLAG_VAD) || c.HasUsecases(FLAG_DIARIZATION) ||
+		c.HasUsecases(FLAG_SOUND_CLASSIFICATION) || c.HasUsecases(FLAG_SPEAKER_RECOGNITION)
+
+	acceptsVideo := c.VideoInputSupported()
+
+	// Emit in the documented order.
+	ordered := []struct {
+		on   bool
+		name string
+	}{
+		{acceptsText, "text"},
+		{acceptsImage, "image"},
+		{acceptsAudio, "audio"},
+		{acceptsVideo, "video"},
+	}
+	var accepted []string
+	for _, modality := range ordered {
+		if modality.on {
+			accepted = append(accepted, modality.name)
+		}
+	}
+	return accepted
+}
+
+// OutputModalities lists the modalities (text, image, audio, video) the model
+// produces, always in the order text→image→audio→video. The result is nil
+// when no modality applies.
+func (c *ModelConfig) OutputModalities() []string {
+	producesText := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION) || c.HasUsecases(FLAG_EDIT) ||
+		c.HasUsecases(FLAG_TRANSCRIPT)
+	producesImage := c.HasUsecases(FLAG_IMAGE)
+	producesAudio := c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) ||
+		c.HasUsecases(FLAG_AUDIO_TRANSFORM) || c.HasUsecases(FLAG_REALTIME_AUDIO)
+	producesVideo := c.HasUsecases(FLAG_VIDEO)
+
+	// Emit in the documented order.
+	ordered := []struct {
+		on   bool
+		name string
+	}{
+		{producesText, "text"},
+		{producesImage, "image"},
+		{producesAudio, "audio"},
+		{producesVideo, "video"},
+	}
+	var produced []string
+	for _, modality := range ordered {
+		if modality.on {
+			produced = append(produced, modality.name)
+		}
+	}
+	return produced
+}
--- a/core/config/model_capabilities_test.go
+++ b/core/config/model_capabilities_test.go
@@ -0,0 +1,103 @@
+package config
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// usecaseBits hands back a pointer to a copy of the given usecase bitmask, so
+// tests can populate ModelConfig.KnownUsecases inline.
+func usecaseBits(flags ModelConfigUsecase) *ModelConfigUsecase {
+	bits := flags
+	return &bits
+}
+
+// Specs for the capability / modality derivation helpers on ModelConfig.
+// Each case builds a minimal ModelConfig (declared KnownUsecases plus a
+// backend name) and asserts on VisionSupported, the audio/video input probes,
+// Capabilities, InputModalities and OutputModalities.
+var _ = Describe("Model capabilities derivation", func() {
+	// Vision must come from explicit signals only, never from the guess heuristic.
+	Describe("VisionSupported", func() {
+		It("is false for a plain text chat model", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
+			Expect(cfg.VisionSupported()).To(BeFalse())
+		})
+
+		It("is true when the FLAG_VISION bit is declared", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"}
+			Expect(cfg.VisionSupported()).To(BeTrue())
+		})
+
+		It("is true when an mmproj projector is set", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
+			cfg.MMProj = "mmproj.gguf" // promoted field from the embedded options struct
+			Expect(cfg.VisionSupported()).To(BeTrue())
+		})
+
+		It("does not fall for the GuessUsecases FLAG_VISION false positive", func() {
+			// A chat model with a chat template would make HasUsecases(FLAG_VISION)
+			// return true via the guess heuristic; VisionSupported must not.
+			cfg := &ModelConfig{Backend: "llama.cpp"}
+			cfg.TemplateConfig.Chat = "{{.Input}}"
+			Expect(cfg.VisionSupported()).To(BeFalse())
+		})
+	})
+
+	// Audio/video input is signalled solely by the per-prompt multimodal limits.
+	Describe("AudioInputSupported / VideoInputSupported", func() {
+		It("detects vLLM omni audio input via limit_mm_per_prompt", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
+			cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1
+			Expect(cfg.AudioInputSupported()).To(BeTrue())
+			Expect(cfg.VideoInputSupported()).To(BeFalse())
+		})
+
+		It("detects vLLM omni video input via limit_mm_per_prompt", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
+			cfg.LimitMMPerPrompt.LimitVideoPerPrompt = 2
+			Expect(cfg.VideoInputSupported()).To(BeTrue())
+		})
+	})
+
+	// End-to-end expectations per model archetype: capability list plus the
+	// ordered input and output modality lists.
+	Describe("Capabilities + modalities", func() {
+		It("a text-only chat model exposes chat and text-only modalities", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
+			Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat))
+			Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseVision))
+			Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseTranscript))
+			Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
+			Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
+		})
+
+		It("a vision chat model accepts text+image input", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"}
+			Expect(cfg.Capabilities()).To(ContainElements(UsecaseChat, UsecaseVision))
+			Expect(cfg.InputModalities()).To(Equal([]string{"text", "image"}))
+			Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
+		})
+
+		It("an omni chat model accepts text+audio input without an audio capability flag", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
+			cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1
+			// audio-in is a modality, not a usecase string — this is exactly the
+			// case a plain capability list cannot express.
+			Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat))
+			Expect(cfg.InputModalities()).To(Equal([]string{"text", "audio"}))
+		})
+
+		It("a transcription model reads audio and writes text", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TRANSCRIPT), Backend: "parakeet-cpp"}
+			Expect(cfg.Capabilities()).To(Equal([]string{UsecaseTranscript}))
+			Expect(cfg.InputModalities()).To(Equal([]string{"audio"}))
+			Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
+		})
+
+		It("an image-generation model reads text and writes an image", func() {
+			// stablediffusion-ggml is image-only; plain "stablediffusion" is also
+			// in GuessUsecases' video-backend list, so it would report video too.
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_IMAGE), Backend: "stablediffusion-ggml"}
+			Expect(cfg.Capabilities()).To(Equal([]string{UsecaseImage}))
+			Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
+			Expect(cfg.OutputModalities()).To(Equal([]string{"image"}))
+		})
+
+		It("a TTS model reads text and writes audio", func() {
+			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TTS), Backend: "piper"}
+			Expect(cfg.Capabilities()).To(ContainElement(UsecaseTTS))
+			Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
+			Expect(cfg.OutputModalities()).To(Equal([]string{"audio"}))
+		})
+	})
+})
--- a/core/http/endpoints/ollama/capabilities.go
+++ b/core/http/endpoints/ollama/capabilities.go
@@ -49,62 +49,23 @@ func modelCapabilities(cfg *config.ModelConfig) []string {
 	return caps
 }

-// hasVisionSupport reports whether the model can accept image inputs. We avoid
-// cfg.HasUsecases(FLAG_VISION) because GuessUsecases has no FLAG_VISION case
-// and returns true for any chat model — see core/config/model_config.go. Instead
-// we look for explicit signals: KnownUsecases bit, multimodal projector, or
-// template/backend-reported multimodal markers.
+// hasVisionSupport reports whether the model can accept image inputs.
+// The detection heuristic is the canonical config.ModelConfig.VisionSupported —
+// kept as a thin wrapper here so the Ollama capability mapping reads cleanly.
 func hasVisionSupport(cfg *config.ModelConfig) bool {
-	if cfg.KnownUsecases != nil && (*cfg.KnownUsecases&config.FLAG_VISION) == config.FLAG_VISION {
-		return true
-	}
-	if cfg.MMProj != "" {
-		return true
-	}
-	if cfg.TemplateConfig.Multimodal != "" {
-		return true
-	}
-	if cfg.MediaMarker != "" {
-		return true
-	}
-	return false
+	return cfg.VisionSupported()
 }

-// hasToolSupport reports whether the model is wired up for tool / function calling.
-// We look for any of the explicit configuration knobs LocalAI uses to drive
-// function-call extraction (regex match, response regex, grammar triggers, XML
-// format) or for the auto-detected tool-format markers populated by the
-// llama.cpp backend during model load.
+// hasToolSupport reports whether the model is wired up for tool / function
+// calling. Delegates to the canonical config.ModelConfig.ToolSupported.
 func hasToolSupport(cfg *config.ModelConfig) bool {
-	fc := cfg.FunctionsConfig
-	if fc.ToolFormatMarkers != nil && fc.ToolFormatMarkers.FormatType != "" {
-		return true
-	}
-	if len(fc.JSONRegexMatch) > 0 || len(fc.ResponseRegex) > 0 {
-		return true
-	}
-	if fc.XMLFormatPreset != "" || fc.XMLFormat != nil {
-		return true
-	}
-	if len(fc.GrammarConfig.GrammarTriggers) > 0 || fc.GrammarConfig.SchemaType != "" {
-		return true
-	}
-	return false
+	return cfg.ToolSupported()
 }

 // hasThinkingSupport reports whether the model has reasoning / thinking enabled.
-// LocalAI sets DisableReasoning=false (or leaves thinking markers configured)
-// when the backend probe reports that the model supports thinking.
+// Delegates to the canonical config.ModelConfig.ThinkingSupported.
 func hasThinkingSupport(cfg *config.ModelConfig) bool {
-	rc := cfg.ReasoningConfig
-	if rc.DisableReasoning != nil && !*rc.DisableReasoning {
-		return true
-	}
-	if len(rc.ThinkingStartTokens) > 0 || len(rc.TagPairs) > 0 {
-		// Explicit thinking markers imply support unless explicitly disabled.
-		return rc.DisableReasoning == nil || !*rc.DisableReasoning
-	}
-	return false
+	return cfg.ThinkingSupported()
 }

 // quantRegex matches GGUF-style quantization suffixes (Q4_K_M, Q8_0, IQ3_XS, F16, ...).
--- a/core/http/endpoints/openai/list.go
+++ b/core/http/endpoints/openai/list.go
@@ -21,48 +21,11 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap
 		authDB = db[0]
 	}
 	return func(c echo.Context) error {
-		// If blank, no filter is applied.
-		filter := c.QueryParam("filter")
-
-		// By default, exclude any loose files that are already referenced by a configuration file.
-		var policy galleryop.LooseFilePolicy
-		excludeConfigured := c.QueryParam("excludeConfigured")
-		if excludeConfigured == "" || excludeConfigured == "true" {
-			policy = galleryop.SKIP_IF_CONFIGURED
-		} else {
-			policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
-		}
-
-		filterFn, err := config.BuildNameFilterFn(filter)
+		modelNames, err := listVisibleModelNames(c, bcl, ml, authDB)
 		if err != nil {
 			return err
 		}

-		modelNames, err := galleryop.ListModels(bcl, ml, filterFn, policy)
-		if err != nil {
-			return err
-		}
-
-		// Filter models by user's allowlist if auth is enabled
-		if authDB != nil {
-			if user := auth.GetUser(c); user != nil && user.Role != auth.RoleAdmin {
-				perm, err := auth.GetCachedUserPermissions(c, authDB, user.ID)
-				if err == nil && perm.AllowedModels.Enabled {
-					allowed := map[string]bool{}
-					for _, m := range perm.AllowedModels.Models {
-						allowed[m] = true
-					}
-					filtered := make([]string, 0, len(modelNames))
-					for _, m := range modelNames {
-						if allowed[m] {
-							filtered = append(filtered, m)
-						}
-					}
-					modelNames = filtered
-				}
-			}
-		}
-
 		// Map from a slice of names to a slice of OpenAIModel response objects
 		dataModels := []schema.OpenAIModel{}
 		for _, m := range modelNames {
@@ -75,3 +38,53 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap
 		})
 	}
 }
+
+// listVisibleModelNames computes the model names the caller is allowed to see.
+// It honours the "filter" and "excludeConfigured" query parameters and, when an
+// auth DB is supplied, the per-user model allowlist. ListModelsEndpoint and
+// ListModelCapabilitiesEndpoint both go through it so the two listings agree.
+//
+// Errors from building the name filter or from listing models are returned
+// as-is. Admin users and requests without an authenticated user are never
+// restricted by the allowlist.
+func listVisibleModelNames(c echo.Context, bcl *config.ModelConfigLoader, ml *model.ModelLoader, authDB *gorm.DB) ([]string, error) {
+	// An empty "filter" applies no name filtering.
+	nameFilter := c.QueryParam("filter")
+
+	// Loose files already referenced by a configuration file are hidden unless
+	// the caller passes an excludeConfigured value other than "" or "true".
+	var policy galleryop.LooseFilePolicy
+	switch c.QueryParam("excludeConfigured") {
+	case "", "true":
+		policy = galleryop.SKIP_IF_CONFIGURED
+	default:
+		policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
+	}
+
+	matches, err := config.BuildNameFilterFn(nameFilter)
+	if err != nil {
+		return nil, err
+	}
+
+	names, err := galleryop.ListModels(bcl, ml, matches, policy)
+	if err != nil {
+		return nil, err
+	}
+
+	// The allowlist only applies when auth is enabled and the caller is a
+	// known, non-admin user.
+	if authDB == nil {
+		return names, nil
+	}
+	user := auth.GetUser(c)
+	if user == nil || user.Role == auth.RoleAdmin {
+		return names, nil
+	}
+
+	// NOTE(review): a failed permission lookup returns the unfiltered list
+	// (fail-open) — confirm this is intended.
+	perm, permErr := auth.GetCachedUserPermissions(c, authDB, user.ID)
+	if permErr != nil || !perm.AllowedModels.Enabled {
+		return names, nil
+	}
+
+	allowed := make(map[string]struct{}, len(perm.AllowedModels.Models))
+	for _, id := range perm.AllowedModels.Models {
+		allowed[id] = struct{}{}
+	}
+	visible := make([]string, 0, len(names))
+	for _, id := range names {
+		if _, ok := allowed[id]; ok {
+			visible = append(visible, id)
+		}
+	}
+	return visible, nil
+}
--- a/core/http/endpoints/openai/list_capabilities.go
+++ b/core/http/endpoints/openai/list_capabilities.go
@@ -0,0 +1,50 @@
+package openai
+
+import (
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+	model "github.com/mudler/LocalAI/pkg/model"
+	"gorm.io/gorm"
+)
+
+// ListModelCapabilitiesEndpoint is a LocalAI-specific extension of the OpenAI
+// models listing. The set of models is the same one /v1/models returns; each
+// entry additionally carries the capabilities and input/output modalities
+// derived from its model config, so a client can tell whether an
+// image/audio/video attachment can go to a model directly (or has to be
+// converted/transcribed first). A model without a loaded config is still
+// listed, with those three fields left unset.
+//
+// The endpoint is purely additive: clients that only use /v1/models see no
+// change.
+// @Summary List available models enriched with capabilities and input/output modalities.
+// @Tags models
+// @Success 200 {object} schema.ModelCapabilitiesResponse "Response"
+// @Router /v1/models/capabilities [get]
+func ListModelCapabilitiesEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, db ...*gorm.DB) echo.HandlerFunc {
+	// The auth DB is optional; only the first variadic value is used.
+	var authDB *gorm.DB
+	if len(db) > 0 {
+		authDB = db[0]
+	}
+
+	// describe builds the response entry for one model name.
+	describe := func(name string) schema.ModelCapabilities {
+		entry := schema.ModelCapabilities{ID: name, Object: "model"}
+		cfg, found := bcl.GetModelConfig(name)
+		if !found {
+			return entry
+		}
+		entry.Capabilities = cfg.Capabilities()
+		entry.InputModalities = cfg.InputModalities()
+		entry.OutputModalities = cfg.OutputModalities()
+		return entry
+	}
+
+	return func(c echo.Context) error {
+		names, err := listVisibleModelNames(c, bcl, ml, authDB)
+		if err != nil {
+			return err
+		}
+
+		entries := make([]schema.ModelCapabilities, 0, len(names))
+		for _, name := range names {
+			entries = append(entries, describe(name))
+		}
+
+		return c.JSON(200, schema.ModelCapabilitiesResponse{
+			Object: "list",
+			Data:   entries,
+		})
+	}
+}
--- a/core/http/endpoints/openai/list_capabilities_test.go
+++ b/core/http/endpoints/openai/list_capabilities_test.go
@@ -0,0 +1,119 @@
+package openai
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/system"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for ListModelCapabilitiesEndpoint. Each case writes model YAML configs
+// into a fresh temp directory, invokes the handler directly through an echo
+// context (no HTTP server) and inspects the decoded JSON response.
+var _ = Describe("ListModelCapabilitiesEndpoint", func() {
+	var (
+		e       *echo.Echo
+		tmpDir  string
+		bcl     *config.ModelConfigLoader
+		ml      *model.ModelLoader
+		appConf *config.ApplicationConfig
+	)
+
+	// Every spec gets its own empty model directory plus freshly built loaders,
+	// so configs written by one spec are not visible to another.
+	BeforeEach(func() {
+		var err error
+		e = echo.New()
+		tmpDir, err = os.MkdirTemp("", "models-caps-test-*")
+		Expect(err).NotTo(HaveOccurred())
+
+		st, err := system.GetSystemState(system.WithModelPath(tmpDir))
+		Expect(err).NotTo(HaveOccurred())
+		ml = model.NewModelLoader(st)
+		bcl = config.NewModelConfigLoader(tmpDir)
+		appConf = config.NewApplicationConfig()
+	})
+
+	// Remove the per-spec temp directory; a cleanup error is deliberately ignored.
+	AfterEach(func() {
+		_ = os.RemoveAll(tmpDir)
+	})
+
+	// writeConfig stores the given YAML as <tmpDir>/<name>.yaml and loads it
+	// into the config loader via ReadModelConfig, failing the spec on any error.
+	writeConfig := func(name, yaml string) {
+		path := filepath.Join(tmpDir, name+".yaml")
+		Expect(os.WriteFile(path, []byte(yaml), 0o644)).To(Succeed())
+		Expect(bcl.ReadModelConfig(path)).To(Succeed())
+	}
+
+	// call exercises the endpoint with auth disabled (no auth DB), which is the
+	// standard deployment path. The per-user allowlist branch is shared verbatim
+	// with ListModelsEndpoint (listVisibleModelNames) and covered there.
+	call := func() schema.ModelCapabilitiesResponse {
+		req := httptest.NewRequest(http.MethodGet, "/v1/models/capabilities", nil)
+		rec := httptest.NewRecorder()
+		c := e.NewContext(req, rec)
+
+		handler := ListModelCapabilitiesEndpoint(bcl, ml, appConf)
+		Expect(handler(c)).To(Succeed())
+		Expect(rec.Code).To(Equal(http.StatusOK))
+
+		var resp schema.ModelCapabilitiesResponse
+		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
+		return resp
+	}
+
+	// entryFor returns a pointer to the response entry whose ID equals id, or
+	// nil when the response contains no such entry.
+	entryFor := func(resp schema.ModelCapabilitiesResponse, id string) *schema.ModelCapabilities {
+		for i := range resp.Data {
+			if resp.Data[i].ID == id {
+				return &resp.Data[i]
+			}
+		}
+		return nil
+	}
+
+	// Only the envelope's "object" field is checked here; the contents of Data
+	// are not asserted.
+	It("returns the list envelope even with no models", func() {
+		resp := call()
+		Expect(resp.Object).To(Equal("list"))
+	})
+
+	It("enriches a vision chat model with capabilities and image input modality", func() {
+		writeConfig("vlm", `
+name: vlm
+backend: llama-cpp
+known_usecases:
+  - FLAG_CHAT
+  - FLAG_VISION
+template:
+  chat: "{{ .Input }}"
+parameters:
+  model: qwen2.5-vl-Q4_K_M.gguf
+`)
+		entry := entryFor(call(), "vlm")
+		Expect(entry).NotTo(BeNil())
+		Expect(entry.Object).To(Equal("model"))
+		// ContainElements tolerates additional entries, so these assertions
+		// set a lower bound rather than pinning the exact lists.
+		Expect(entry.Capabilities).To(ContainElements("chat", "vision"))
+		Expect(entry.InputModalities).To(ContainElements("text", "image"))
+		Expect(entry.OutputModalities).To(ContainElement("text"))
+	})
+
+	It("marks a parakeet model as an audio-in/text-out transcription model", func() {
+		writeConfig("parakeet", `
+name: parakeet
+backend: parakeet-cpp
+known_usecases:
+  - FLAG_TRANSCRIPT
+parameters:
+  model: parakeet-tdt-0.6b
+`)
+		entry := entryFor(call(), "parakeet")
+		Expect(entry).NotTo(BeNil())
+		Expect(entry.Capabilities).To(ContainElement("transcript"))
+		// Equal (not ContainElement) pins each modality list to exactly one
+		// value: audio in, text out.
+		Expect(entry.InputModalities).To(Equal([]string{"audio"}))
+		Expect(entry.OutputModalities).To(Equal([]string{"text"}))
+		Expect(entry.Capabilities).NotTo(ContainElement("chat"))
+	})
+})
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -272,25 +272,27 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 			"version": internal.PrintableVersion(),
 			// Flat endpoint list for backwards compatibility
 			"endpoints": map[string]any{
-				"models":           "/v1/models",
-				"chat_completions": "/v1/chat/completions",
-				"completions":      "/v1/completions",
-				"embeddings":       "/v1/embeddings",
-				"config_metadata":  "/api/models/config-metadata",
-				"config_json":      "/api/models/config-json/:name",
-				"config_patch":     "/api/models/config-json/:name",
-				"autocomplete":     "/api/models/config-metadata/autocomplete/:provider",
-				"vram_estimate":    "/api/models/vram-estimate",
-				"tts":              "/tts",
-				"transcription":    "/v1/audio/transcriptions",
-				"image_generation": "/v1/images/generations",
-				"swagger":          "/swagger/index.html",
-				"instructions":     "/api/instructions",
+				"models":              "/v1/models",
+				"models_capabilities": "/v1/models/capabilities",
+				"chat_completions":    "/v1/chat/completions",
+				"completions":         "/v1/completions",
+				"embeddings":          "/v1/embeddings",
+				"config_metadata":     "/api/models/config-metadata",
+				"config_json":         "/api/models/config-json/:name",
+				"config_patch":        "/api/models/config-json/:name",
+				"autocomplete":        "/api/models/config-metadata/autocomplete/:provider",
+				"vram_estimate":       "/api/models/vram-estimate",
+				"tts":                 "/tts",
+				"transcription":       "/v1/audio/transcriptions",
+				"image_generation":    "/v1/images/generations",
+				"swagger":             "/swagger/index.html",
+				"instructions":        "/api/instructions",
 			},
 			// Categorized endpoint groups for structured discovery
 			"endpoint_groups": map[string]any{
 				"openai_compatible": map[string]string{
 					"models":               "/v1/models",
+					"models_capabilities":  "/v1/models/capabilities",
 					"chat_completions":     "/v1/chat/completions",
 					"completions":          "/v1/completions",
 					"embeddings":           "/v1/embeddings",
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -257,4 +257,10 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 	// List models
 	app.GET("/v1/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB()))
 	app.GET("/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB()))
+
+	// List models enriched with capabilities + input/output modalities
+	// (LocalAI-specific, additive superset of /v1/models).
+	capabilitiesHandler := openai.ListModelCapabilitiesEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB())
+	app.GET("/v1/models/capabilities", capabilitiesHandler)
+	app.GET("/models/capabilities", capabilitiesHandler)
 }
--- a/core/schema/openai.go
+++ b/core/schema/openai.go
@@ -251,3 +251,27 @@ type ModelsDataResponse struct {
 	Object string        `json:"object"`
 	Data   []OpenAIModel `json:"data"`
 }
+
+// ModelCapabilities is a strict superset of OpenAIModel that additionally
+// describes what a model can do and which modalities it accepts/produces. It is
+// served by the LocalAI-specific /v1/models/capabilities endpoint so clients can
+// route attachments (image/audio/video) to a model only when it can handle them.
+//
+// None of the fields carries an omitempty tag, so all five keys are always
+// present in the encoded JSON; a nil slice is encoded as null rather than [].
+type ModelCapabilities struct {
+	ID     string `json:"id"`
+	Object string `json:"object"`
+	// Capabilities are canonical usecase strings (e.g. chat, vision, transcript,
+	// tts, embeddings, image, video) plus the modifiers "tools" and "thinking".
+	Capabilities []string `json:"capabilities"`
+	// InputModalities is the subset of {text,image,audio,video} the model accepts.
+	InputModalities []string `json:"input_modalities"`
+	// OutputModalities is the subset of {text,image,audio,video} the model produces.
+	OutputModalities []string `json:"output_modalities"`
+}
+
+// ModelCapabilitiesResponse is the envelope returned by /v1/models/capabilities.
+// It mirrors ModelsDataResponse so a client can treat it as an enriched
+// drop-in for /v1/models: Object is serialized under "object" and the
+// per-model entries under "data".
+type ModelCapabilitiesResponse struct {
+	Object string              `json:"object"`
+	Data   []ModelCapabilities `json:"data"`
+}
--- a/docs/content/features/api-discovery.md
+++ b/docs/content/features/api-discovery.md
@@ -36,6 +36,7 @@ Returns the instance version, all available endpoint URLs (flat and categorized)
  "endpoints": {
    "chat_completions": "/v1/chat/completions",
    "models": "/v1/models",
+    "models_capabilities": "/v1/models/capabilities",
    "config_metadata": "/api/models/config-metadata",
    "instructions": "/api/instructions",
    "swagger": "/swagger/index.html"
@@ -123,6 +124,45 @@ Add `?format=json` to get a raw **OpenAPI fragment** (filtered Swagger spec with
 curl http://localhost:8080/api/instructions/config-management?format=json
 ```

+## Model Capabilities
+
+`GET /v1/models/capabilities`
+
+An additive, LocalAI-specific superset of `/v1/models`. It returns the same set of models but enriches each entry with the **capabilities** the model supports and the **input/output modalities** it accepts and produces. Use it to decide, before sending a request, whether a given model can take an image, audio, or video attachment directly — or whether the input needs converting/transcribing first.
+
+Because it is purely additive, clients that only understand `/v1/models` keep working unchanged; they simply never call this route.
+
+```bash
+curl http://localhost:8080/v1/models/capabilities
+```
+
+```json
+{
+  "object": "list",
+  "data": [
+    {
+      "id": "qwen2.5-omni",
+      "object": "model",
+      "capabilities": ["chat", "vision", "tools"],
+      "input_modalities": ["text", "image", "audio"],
+      "output_modalities": ["text"]
+    },
+    {
+      "id": "parakeet",
+      "object": "model",
+      "capabilities": ["transcript"],
+      "input_modalities": ["audio"],
+      "output_modalities": ["text"]
+    }
+  ]
+}
+```
+
+- **`capabilities`** — canonical usecase strings (e.g. `chat`, `vision`, `transcript`, `tts`, `embeddings`, `image`, `video`) plus the modifiers `tools` and `thinking`.
+- **`input_modalities` / `output_modalities`** — the subsets of `{text, image, audio, video}` the model accepts and produces. Audio and video *input* are derived from the model's multimodal limits (e.g. vLLM `limit_mm_per_prompt`), which no single usecase flag expresses; that gap is why this endpoint exists alongside the plain listing.
+
+The same query parameters as `/v1/models` are honored (`filter`, `excludeConfigured`), and the same per-user model allowlist is applied when authentication is enabled.
+
 ## Configuration Management APIs

 These endpoints let agents discover model configuration fields, read current settings, modify them, and estimate VRAM usage.
--- a/docs/content/whats-new.md
+++ b/docs/content/whats-new.md
@@ -17,6 +17,7 @@ You can see the release notes [here](https://github.com/mudler/LocalAI/releases)
 - **May 2026**: [Speaker diarization](/features/audio-diarization/) — new `/v1/audio/diarization` endpoint returning "who spoke when" segments. Backed by `sherpa-onnx` (pyannote-3.0 + speaker embeddings + clustering) for pure diarization, and `vibevoice-cpp` for diarization bundled with long-form ASR. Supports `json` / `verbose_json` / `rttm` response formats.
 - **June 2026**: [Sound classification](/features/audio-classification/) — new `/v1/audio/classification` endpoint for audio tagging / sound-event classification, returning scored [AudioSet](https://research.google.com/audioset/) labels (baby cry, glass breaking, alarms, ...). Backed by [ced.cpp](https://github.com/mudler/ced.cpp), a 527-class AudioSet tagger ported to ggml.
 - **June 2026**: [PII analyze / redact API](/features/middleware/#analyze--redact-api) — the PII detection pipeline (NER + restricted-regex pattern tiers) is now a standalone service: `POST /api/pii/analyze` returns detected entity spans and `POST /api/pii/redact` returns the sanitised text (or `400 pii_blocked`), without routing a chat request through the middleware. Events gain an `origin` (`middleware` / `proxy` / `pii_analyze` / `pii_redact`) so `/api/pii/events` can be filtered by source.
+- **July 2026**: [Model capabilities endpoint](/features/api-discovery/#model-capabilities) — `GET /v1/models/capabilities`, an additive superset of `/v1/models` that reports each model's `capabilities` plus its `input_modalities` / `output_modalities` (`text` / `image` / `audio` / `video`). Lets clients route image/audio/video attachments to a model only when it can handle them; audio/video *input* is derived from the model's multimodal limits, which no single usecase flag expresses.
 - **June 2026**: Concurrent scoring and PII NER on llama.cpp — the `Score` (router classifier) and `TokenClassify` (PII NER) primitives now ride llama.cpp's server task queue instead of locking the context, so they run concurrently with chat/completion/embedding traffic and with each other. The `known_usecases` restriction that forced dedicated scorer/NER model configs on llama-cpp is lifted, repeated scoring calls reuse the prompt KV cache across candidates, and scoring inputs are no longer capped by the physical batch size.

 ## 2024 Highlights
--- a/scripts/build/package-gpu-libs-rocm-data_test.sh
+++ b/scripts/build/package-gpu-libs-rocm-data_test.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Regression test for scripts/build/package-gpu-libs.sh ROCm data bundling.
+#
+# Guards issue #10660: hipBLASLt (rocblaslt) resolves its TensileLibrary_lazy_gfx*.dat
+# kernel data relative to the bundled libhipblaslt.so. The packager copied the
+# rocblas/ data dir but not the hipblaslt/ data dir, so the bundled backend
+# fell back to slow generic kernels and logged
+#   rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory
+#
+# This test fabricates a fake ROCm tree containing both rocblas/ and hipblaslt/
+# tensile data, points the packager at it via ROCM_BASE_DIRS, and asserts BOTH
+# data directories are bundled into the target lib dir.
+set -euo pipefail
+
+CURDIR=$(dirname "$(realpath "$0")")
+SCRIPT="$CURDIR/package-gpu-libs.sh"
+
+WORK=$(mktemp -d)
+trap 'rm -rf "$WORK"' EXIT
+
+# Fabricate a fake ROCm install with both rocblas and hipblaslt tensile data.
+FAKE_ROCM="$WORK/opt/rocm"
+mkdir -p "$FAKE_ROCM/lib/rocblas/library"
+mkdir -p "$FAKE_ROCM/lib/hipblaslt/library"
+echo "fake rocblas tensile" > "$FAKE_ROCM/lib/rocblas/library/TensileLibrary_lazy_gfx1201.dat"
+echo "fake hipblaslt tensile" > "$FAKE_ROCM/lib/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat"
+
+TARGET="$WORK/target"
+mkdir -p "$TARGET"
+
+# shellcheck source=/dev/null
+source "$SCRIPT" "$TARGET"
+
+# Point the data-dir copy at the fabricated tree instead of the real /opt/rocm,
+# then run the actual ROCm packager. This asserts package_rocm_libs itself
+# bundles BOTH data dirs, not just that the helper works in isolation.
+export BUILD_TYPE=hipblas
+export ROCM_BASE_DIRS="$FAKE_ROCM"
+package_rocm_libs
+
+fail=false
+if [ ! -e "$TARGET/rocblas/library/TensileLibrary_lazy_gfx1201.dat" ]; then
+    echo "FAIL: rocblas tensile data was NOT bundled"
+    fail=true
+fi
+if [ ! -e "$TARGET/hipblaslt/library/TensileLibrary_lazy_gfx1201.dat" ]; then
+    echo "FAIL: hipblaslt tensile data was NOT bundled (regression of #10660)"
+    fail=true
+fi
+
+if [ "$fail" = true ]; then
+    ls -R "$TARGET" || true
+    exit 1
+fi
+
+echo "PASS: rocblas and hipblaslt tensile data were both bundled"
+exit 0
--- a/scripts/build/package-gpu-libs.sh
+++ b/scripts/build/package-gpu-libs.sh
@@ -224,6 +224,50 @@ package_cuda_libs() {
    echo "CUDA libraries packaged successfully"
 }

+# Copy a ROCm library data subdirectory (e.g. rocblas, hipblaslt) into the
+# bundled lib/ dir. These directories hold the TensileLibrary_*.dat GPU kernel
+# tuning files, which rocBLAS/hipBLASLt load at runtime *relative to their own
+# .so*. Since backends ship their own copies of libhipblaslt.so/librocblas.so
+# under lib/, the matching data dir must travel with them or the libs fall back
+# to slow generic kernels (rocblaslt error: Cannot read TensileLibrary_lazy_gfx*.dat;
+# see issue #10660).
+#
+# The ROCm search roots default to /opt/rocm{,-*} but can be overridden via the
+# ROCM_BASE_DIRS env var (space-separated), which keeps the copy unit-testable
+# without a real ROCm install.
+# Args: $1 = data subdir name found under <rocm-root>/lib{,64}/
+copy_rocm_data_dir() {
+    local data_name="$1"
+    # Single-line `local x=$(...)` on purpose: `local` masks the command
+    # substitution's exit status, which is 1 when nullglob is unset and would
+    # otherwise trip the script's `set -e`.
+    local old_nullglob=$(shopt -p nullglob)
+    shopt -s nullglob
+    local rocm_dirs
+    if [ -n "${ROCM_BASE_DIRS:-}" ]; then
+        # shellcheck disable=SC2206  # intentional word-split of the override
+        rocm_dirs=(${ROCM_BASE_DIRS})
+    else
+        rocm_dirs=(/opt/rocm /opt/rocm-*)
+    fi
+    # Restore the caller's nullglob setting before doing any further work.
+    eval "$old_nullglob"
+    local found=false
+    local rocm_base lib_subdir src_dir resolved_dir
+    # Newline-delimited list of resolved source dirs that were already copied.
+    # Several search paths can name the same directory (for example when one
+    # root, or lib64, is a symlink to another); because cp -L dereferences
+    # links, each alias would otherwise copy the whole data tree again.
+    local copied_dirs=$'\n'
+    for rocm_base in "${rocm_dirs[@]}"; do
+        for lib_subdir in lib lib64; do
+            src_dir="$rocm_base/$lib_subdir/$data_name"
+            [ -d "$src_dir" ] || continue
+            found=true
+            # Fall back to the unresolved path if realpath is unavailable or fails.
+            resolved_dir=$(realpath "$src_dir" 2>/dev/null) || resolved_dir="$src_dir"
+            case "$copied_dirs" in
+                *$'\n'"$resolved_dir"$'\n'*)
+                    echo "Skipping $src_dir (same directory as already-copied $resolved_dir)"
+                    continue
+                    ;;
+            esac
+            copied_dirs+="$resolved_dir"$'\n'
+            echo "Found $data_name data at $src_dir"
+            mkdir -p "$TARGET_LIB_DIR/$data_name"
+            # Best effort: a failed copy is reported but does not abort packaging.
+            cp -arfL "$src_dir/"* "$TARGET_LIB_DIR/$data_name/" || echo "WARNING: Failed to copy $data_name data from $src_dir"
+        done
+    done
+    if [ "$found" = false ]; then
+        echo "WARNING: No $data_name library data found in ${ROCM_BASE_DIRS:-/opt/rocm*}/lib{,64}/$data_name"
+    fi
+}
+
 # Package AMD ROCm/HIPBlas libraries
 package_rocm_libs() {
    echo "Packaging ROCm/HIPBlas libraries for BUILD_TYPE=${BUILD_TYPE}..."
@@ -267,27 +311,16 @@ package_rocm_libs() {
        fi
    done

-    # Copy rocblas library data (tuning files, TensileLibrary, etc.)
-    local old_nullglob=$(shopt -p nullglob)
-    shopt -s nullglob
-    local rocm_dirs=(/opt/rocm /opt/rocm-*)
-    eval "$old_nullglob"
-    local rocblas_found=false
-    for rocm_base in "${rocm_dirs[@]}"; do
-        for lib_subdir in lib lib64; do
-            if [ -d "$rocm_base/$lib_subdir/rocblas" ]; then
-                echo "Found rocblas data at $rocm_base/$lib_subdir/rocblas"
-                mkdir -p "$TARGET_LIB_DIR/rocblas"
-                cp -arfL "$rocm_base/$lib_subdir/rocblas/"* "$TARGET_LIB_DIR/rocblas/" || echo "WARNING: Failed to copy rocblas data from $rocm_base/$lib_subdir/rocblas"
-                rocblas_found=true
-            fi
-        done
-    done
-    if [ "$rocblas_found" = false ]; then
-        echo "WARNING: No rocblas library data found in /opt/rocm*/lib{,64}/rocblas"
-    fi
+    # Copy rocBLAS and hipBLASLt kernel data (TensileLibrary_*.dat tuning files)
+    # so the bundled libs find their per-arch kernels at runtime instead of
+    # falling back to slow generic code (see copy_rocm_data_dir / issue #10660).
+    copy_rocm_data_dir rocblas
+    copy_rocm_data_dir hipblaslt

    # Copy libomp from LLVM (required for ROCm)
+    # Single-line `local x=$(...)` on purpose: masks shopt -p's nonzero exit
+    # (nullglob unset) so it doesn't trip `set -e`.
+    local old_nullglob=$(shopt -p nullglob)
    shopt -s nullglob
    local omp_libs=(/opt/rocm*/lib/llvm/lib/libomp.so*)
    eval "$old_nullglob"
@@ -477,6 +510,7 @@ export -f copy_libs_glob
 export -f is_core_lib
 export -f copy_elf_deps
 export -f sweep_transitive_deps
+export -f copy_rocm_data_dir
 export -f package_cuda_libs
 export -f package_rocm_libs
 export -f package_intel_libs
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -2728,6 +2728,22 @@ const docTemplate = `{
                }
            }
        },
+        "/v1/models/capabilities": {
+            "get": {
+                "tags": [
+                    "models"
+                ],
+                "summary": "List available models enriched with capabilities and input/output modalities.",
+                "responses": {
+                    "200": {
+                        "description": "Response",
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelCapabilitiesResponse"
+                        }
+                    }
+                }
+            }
+        },
        "/v1/rerank": {
            "post": {
                "tags": [
@@ -5182,6 +5198,52 @@ const docTemplate = `{
                }
            }
        },
+        "schema.ModelCapabilities": {
+            "type": "object",
+            "properties": {
+                "capabilities": {
+                    "description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".",
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "id": {
+                    "type": "string"
+                },
+                "input_modalities": {
+                    "description": "InputModalities is the subset of {text,image,audio,video} the model accepts.",
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "object": {
+                    "type": "string"
+                },
+                "output_modalities": {
+                    "description": "OutputModalities is the subset of {text,image,audio,video} the model produces.",
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                }
+            }
+        },
+        "schema.ModelCapabilitiesResponse": {
+            "type": "object",
+            "properties": {
+                "data": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/schema.ModelCapabilities"
+                    }
+                },
+                "object": {
+                    "type": "string"
+                }
+            }
+        },
        "schema.ModelLoadRequest": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -2725,6 +2725,22 @@
                }
            }
        },
+        "/v1/models/capabilities": {
+            "get": {
+                "tags": [
+                    "models"
+                ],
+                "summary": "List available models enriched with capabilities and input/output modalities.",
+                "responses": {
+                    "200": {
+                        "description": "Response",
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelCapabilitiesResponse"
+                        }
+                    }
+                }
+            }
+        },
        "/v1/rerank": {
            "post": {
                "tags": [
@@ -5179,6 +5195,52 @@
                }
            }
        },
+        "schema.ModelCapabilities": {
+            "type": "object",
+            "properties": {
+                "capabilities": {
+                    "description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".",
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "id": {
+                    "type": "string"
+                },
+                "input_modalities": {
+                    "description": "InputModalities is the subset of {text,image,audio,video} the model accepts.",
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "object": {
+                    "type": "string"
+                },
+                "output_modalities": {
+                    "description": "OutputModalities is the subset of {text,image,audio,video} the model produces.",
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                }
+            }
+        },
+        "schema.ModelCapabilitiesResponse": {
+            "type": "object",
+            "properties": {
+                "data": {
+                    "type": "array",
+                    "items": {
+                        "$ref": "#/definitions/schema.ModelCapabilities"
+                    }
+                },
+                "object": {
+                    "type": "string"
+                }
+            }
+        },
        "schema.ModelLoadRequest": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -1362,6 +1362,41 @@ definitions:
          $ref: '#/definitions/schema.ToolCall'
        type: array
    type: object
+  schema.ModelCapabilities:
+    properties:
+      capabilities:
+        description: |-
+          Capabilities are canonical usecase strings (e.g. chat, vision, transcript,
+          tts, embeddings, image, video) plus the modifiers "tools" and "thinking".
+        items:
+          type: string
+        type: array
+      id:
+        type: string
+      input_modalities:
+        description: InputModalities is the subset of {text,image,audio,video} the
+          model accepts.
+        items:
+          type: string
+        type: array
+      object:
+        type: string
+      output_modalities:
+        description: OutputModalities is the subset of {text,image,audio,video} the
+          model produces.
+        items:
+          type: string
+        type: array
+    type: object
+  schema.ModelCapabilitiesResponse:
+    properties:
+      data:
+        items:
+          $ref: '#/definitions/schema.ModelCapabilities'
+        type: array
+      object:
+        type: string
+    type: object
  schema.ModelLoadRequest:
    properties:
      model:
@@ -4358,6 +4393,16 @@ paths:
      summary: List and describe the various models available in the API.
      tags:
      - models
+  /v1/models/capabilities:
+    get:
+      responses:
+        "200":
+          description: Response
+          schema:
+            $ref: '#/definitions/schema.ModelCapabilitiesResponse'
+      summary: List available models enriched with capabilities and input/output modalities.
+      tags:
+      - models
  /v1/rerank:
    post:
      parameters:
Author	SHA1	Message	Date
Ettore Di Giacinto	3fe175868a	feat(api): add GET /v1/models/capabilities endpoint Additive superset of /v1/models that enriches each model entry with the capabilities it supports plus its input/output modalities (text / image / audio / video). Clients that only understand /v1/models are unaffected -- they simply never call the new route. Audio and video input are derived from the model's multimodal limits (vLLM limit_mm_per_prompt), which no single usecase FLAG expresses. That gap is exactly why a plain capability list is insufficient and this enriched endpoint exists: an attachment router can now decide whether an image/audio/video file can go to the active model directly, or must be converted/transcribed first. Capability derivation lives in core/config as the single source of truth (ModelConfig.Capabilities / InputModalities / OutputModalities / VisionSupported / ...); the Ollama capability surface now delegates to it instead of keeping a parallel copy. Vision is gated on chat/completion capability so a MediaMarker hydrated onto a non-chat model (e.g. a pure ASR/TTS backend) no longer reports a false vision capability. Read-only listing: no new FLAG_* flag, reuses the existing `models` swagger tag, and intentionally exposes no MCP admin tool (there is nothing to manage conversationally). Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-07-04 22:26:22 +00:00
LocalAI [bot]	38350d363e	fix(backends): enable ROCm/HIP GPU offload for ggml audio backends (#10666 ) (#10667 ) qwen3-tts-cpp, omnivoice-cpp, acestep-cpp and vibevoice-cpp shipped rocm-* variants that silently ran on CPU ([Load] backend: CPU). Two coupled defects: - The Makefiles passed -DGGML_HIPBLAS=ON, but the vendored ggml only understands -DGGML_HIP=ON (GGML_HIPBLAS was removed upstream), so the ggml-hip backend target was never created and no GPU code was built. - The CMake foreach that links the ggml GPU backends into the module listed blas/cuda/metal/vulkan but not hip, so even a built ggml-hip would not have been linked and its static backend registration would never run. CUDA users were unaffected because cublas passes the correct GGML_CUDA=ON and the foreach already links cuda. Mirror the proven llama-cpp hipblas block (ROCm clang CC/CXX + AMDGPU_TARGETS) and add hip to each foreach. Upstream picks the best device via ggml_backend_init_best(), so no runtime flag is needed once HIP is compiled and linked. Assisted-by: Claude:claude-opus-4-8[1m] [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-07-04 09:08:20 +02:00
LocalAI [bot]	817136c20e	chore: ⬆️ Update CrispStrobe/CrispASR to `f35185b876fc482fcb2053a81a2697936ed5fcc0` (#10670 ) ⬆️ Update CrispStrobe/CrispASR Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-07-04 08:17:02 +02:00
LocalAI [bot]	8396ce1388	chore: ⬆️ Update ggml-org/llama.cpp to `d4cff114c0084f1fbc9b4c62717eca8fb2ae494a` (#10671 ) ⬆️ Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-07-04 08:16:41 +02:00
LocalAI [bot]	348f3c87c0	fix(gpu-libs): bundle hipBLASLt TensileLibrary data so ROCm backends stop falling back (#10660 ) (#10672 ) The ROCm packager copied rocBLAS kernel data (rocblas/library/*.dat) into the bundled lib/ dir and run.sh pointed ROCBLAS_TENSILE_LIBPATH at it, but the parallel hipBLASLt data dir (hipblaslt/library/TensileLibrary_lazy_gfx*.dat) was never packaged and no HIPBLASLT_TENSILE_LIBPATH was set. The bundled libhipblaslt.so therefore resolved its per-arch kernel data relative to itself, found nothing, and silently fell back to slow generic kernels, logging: rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory rocblaslt error: Could not load "TensileLibrary_lazy_gfx1201.dat" Fix, mirroring the existing rocBLAS handling: - package-gpu-libs.sh: extract the rocblas data-dir copy into a reusable copy_rocm_data_dir helper and call it for both rocblas and hipblaslt. - llama-cpp/turboquant run.sh: export HIPBLASLT_TENSILE_LIBPATH when the bundled hipblaslt/library dir exists. The helper takes an optional ROCM_BASE_DIRS override so the copy is unit testable without a real ROCm install; add a regression test that runs package_rocm_libs against a fabricated ROCm tree and asserts both data dirs are bundled. Note: this bundles whatever gfx*.dat the build image's ROCm provides. If a given arch's tensile data is absent from the shipped ROCm, that arch still needs a ROCm bump; the packaging gap itself is fixed for every supported arch. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-07-04 08:14:12 +02:00