mirror of
https://github.com/mudler/LocalAI.git
synced 2026-07-04 21:37:02 -04:00
Compare commits
5 Commits
dependabot
...
feat/model
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3fe175868a | ||
|
|
38350d363e | ||
|
|
817136c20e | ||
|
|
8396ce1388 | ||
|
|
348f3c87c0 |
@@ -1,5 +1,5 @@
|
||||
|
||||
LLAMA_VERSION?=fdb1db877c526ec90f668eca1b858da5dba85560
|
||||
LLAMA_VERSION?=d4cff114c0084f1fbc9b4c62717eca8fb2ae494a
|
||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -36,6 +36,12 @@ else
|
||||
if [ -d "$CURDIR/lib/rocblas/library" ]; then
|
||||
export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
|
||||
fi
|
||||
# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
|
||||
# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
|
||||
# the bundled data or it falls back to slow generic kernels (issue #10660).
|
||||
if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
|
||||
export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
|
||||
fi
|
||||
fi
|
||||
|
||||
# If there is a lib/ld.so, use it
|
||||
|
||||
@@ -34,6 +34,12 @@ else
|
||||
if [ -d "$CURDIR/lib/rocblas/library" ]; then
|
||||
export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
|
||||
fi
|
||||
# Same for hipBLASLt (rocblaslt): the bundled libhipblaslt.so resolves its
|
||||
# TensileLibrary_lazy_gfx*.dat kernel data relative to itself, so point it at
|
||||
# the bundled data or it falls back to slow generic kernels (issue #10660).
|
||||
if [ -d "$CURDIR/lib/hipblaslt/library" ]; then
|
||||
export HIPBLASLT_TENSILE_LIBPATH="$CURDIR"/lib/hipblaslt/library
|
||||
fi
|
||||
fi
|
||||
|
||||
# If there is a lib/ld.so, use it
|
||||
|
||||
@@ -25,7 +25,7 @@ target_include_directories(goacestepcpp PRIVATE ${ACESTEP_DIR}/src ${ACESTEP_DIR
|
||||
target_include_directories(goacestepcpp SYSTEM PRIVATE ${ACESTEP_DIR}/ggml/include)
|
||||
|
||||
# Link GPU backends if available (mirrors link_ggml_backends macro)
|
||||
foreach(backend blas cuda metal vulkan)
|
||||
foreach(backend blas cuda hip metal vulkan)
|
||||
if(TARGET ggml-${backend})
|
||||
target_link_libraries(goacestepcpp PRIVATE ggml-${backend})
|
||||
string(TOUPPER ${backend} BACKEND_UPPER)
|
||||
|
||||
@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
|
||||
else ifeq ($(BUILD_TYPE),clblas)
|
||||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DGGML_HIPBLAS=ON
|
||||
# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
|
||||
# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
|
||||
ROCM_HOME ?= /opt/rocm
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
export CXX=$(ROCM_HOME)/llvm/bin/clang++
|
||||
export CC=$(ROCM_HOME)/llvm/bin/clang
|
||||
AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
|
||||
else ifeq ($(BUILD_TYPE),vulkan)
|
||||
CMAKE_ARGS+=-DGGML_VULKAN=ON
|
||||
else ifeq ($(OS),Darwin)
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# CrispASR version (release tag)
|
||||
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
|
||||
CRISPASR_VERSION?=9a26976a8c8cf5af0afcdd04463cf8ba91e96a54
|
||||
CRISPASR_VERSION?=f35185b876fc482fcb2053a81a2697936ed5fcc0
|
||||
SO_TARGET?=libgocrispasr.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -30,7 +30,7 @@ target_include_directories(gomnivoicecpp PRIVATE ${OMNIVOICE_DIR}/src)
|
||||
target_include_directories(gomnivoicecpp SYSTEM PRIVATE ${OMNIVOICE_DIR}/ggml/include)
|
||||
|
||||
# Link GPU backends if the upstream ggml created them.
|
||||
foreach(backend blas cuda metal vulkan sycl)
|
||||
foreach(backend blas cuda hip metal vulkan sycl)
|
||||
if(TARGET ggml-${backend})
|
||||
target_link_libraries(gomnivoicecpp PRIVATE ggml-${backend})
|
||||
if(backend STREQUAL "cuda")
|
||||
|
||||
@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
|
||||
else ifeq ($(BUILD_TYPE),clblas)
|
||||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DGGML_HIPBLAS=ON
|
||||
# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
|
||||
# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
|
||||
ROCM_HOME ?= /opt/rocm
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
export CXX=$(ROCM_HOME)/llvm/bin/clang++
|
||||
export CC=$(ROCM_HOME)/llvm/bin/clang
|
||||
AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
|
||||
else ifeq ($(BUILD_TYPE),vulkan)
|
||||
CMAKE_ARGS+=-DGGML_VULKAN=ON
|
||||
else ifeq ($(OS),Darwin)
|
||||
|
||||
@@ -30,7 +30,7 @@ target_include_directories(goqwen3ttscpp PRIVATE ${QWENTTS_DIR}/src)
|
||||
target_include_directories(goqwen3ttscpp SYSTEM PRIVATE ${QWENTTS_DIR}/ggml/include)
|
||||
|
||||
# Link GPU backends if the upstream ggml created them.
|
||||
foreach(backend blas cuda metal vulkan sycl)
|
||||
foreach(backend blas cuda hip metal vulkan sycl)
|
||||
if(TARGET ggml-${backend})
|
||||
target_link_libraries(goqwen3ttscpp PRIVATE ggml-${backend})
|
||||
if(backend STREQUAL "cuda")
|
||||
|
||||
@@ -24,7 +24,14 @@ else ifeq ($(BUILD_TYPE),openblas)
|
||||
else ifeq ($(BUILD_TYPE),clblas)
|
||||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DGGML_HIPBLAS=ON
|
||||
# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
|
||||
# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
|
||||
ROCM_HOME ?= /opt/rocm
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
export CXX=$(ROCM_HOME)/llvm/bin/clang++
|
||||
export CC=$(ROCM_HOME)/llvm/bin/clang
|
||||
AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
|
||||
else ifeq ($(BUILD_TYPE),vulkan)
|
||||
CMAKE_ARGS+=-DGGML_VULKAN=ON
|
||||
else ifeq ($(OS),Darwin)
|
||||
|
||||
@@ -50,7 +50,7 @@ target_include_directories(govibevoicecpp SYSTEM PRIVATE ${VIBEVOICE_DIR}/third_
|
||||
# Link GPU backends if available — vibevoice's own CMake already links
|
||||
# these to the libvibevoice STATIC library, but we re-link them on the
|
||||
# MODULE so resolved symbols include all backend kernels.
|
||||
foreach(backend blas cuda metal vulkan)
|
||||
foreach(backend blas cuda hip metal vulkan)
|
||||
if(TARGET ggml-${backend})
|
||||
target_link_libraries(govibevoicecpp PRIVATE ggml-${backend})
|
||||
string(TOUPPER ${backend} BACKEND_UPPER)
|
||||
|
||||
@@ -29,7 +29,14 @@ else ifeq ($(BUILD_TYPE),openblas)
|
||||
else ifeq ($(BUILD_TYPE),clblas)
|
||||
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
|
||||
else ifeq ($(BUILD_TYPE),hipblas)
|
||||
CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DVIBEVOICE_GGML_HIPBLAS=ON
|
||||
# This ggml only understands GGML_HIP (GGML_HIPBLAS was removed upstream),
|
||||
# so passing GGML_HIPBLAS silently produced a CPU-only build (see #10666).
|
||||
ROCM_HOME ?= /opt/rocm
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
export CXX=$(ROCM_HOME)/llvm/bin/clang++
|
||||
export CC=$(ROCM_HOME)/llvm/bin/clang
|
||||
AMDGPU_TARGETS ?= gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201
|
||||
CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
|
||||
else ifeq ($(BUILD_TYPE),vulkan)
|
||||
CMAKE_ARGS+=-DGGML_VULKAN=ON -DVIBEVOICE_GGML_VULKAN=ON
|
||||
else ifeq ($(OS),Darwin)
|
||||
|
||||
197
core/config/model_capabilities.go
Normal file
197
core/config/model_capabilities.go
Normal file
@@ -0,0 +1,197 @@
|
||||
package config
|
||||
|
||||
// This file is the single source of truth for deriving a model's user-facing
|
||||
// capabilities and input/output modalities from its ModelConfig. Both the
|
||||
// OpenAI-compatible /v1/models/capabilities endpoint and the Ollama-compatible
|
||||
// /api/tags|/api/show surface consume these, so the vocabulary stays consistent
|
||||
// across clients. Keep the detection heuristics here rather than duplicating
|
||||
// them per endpoint.
|
||||
|
||||
// VisionSupported reports whether the model can accept image inputs.
|
||||
//
|
||||
// We deliberately avoid HasUsecases(FLAG_VISION): GuessUsecases has no
|
||||
// FLAG_VISION branch and reports true for any chat model, so it would paint
|
||||
// vision onto text-only models. Instead we look for explicit signals: the
|
||||
// declared KnownUsecases bit, a multimodal projector, or a template/backend
|
||||
// multimodal marker.
|
||||
func (c *ModelConfig) VisionSupported() bool {
|
||||
if c.KnownUsecases != nil && (*c.KnownUsecases&FLAG_VISION) == FLAG_VISION {
|
||||
return true
|
||||
}
|
||||
if c.MMProj != "" {
|
||||
return true
|
||||
}
|
||||
if c.TemplateConfig.Multimodal != "" {
|
||||
return true
|
||||
}
|
||||
if c.MediaMarker != "" {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ToolSupported reports whether the model is wired up for tool / function
|
||||
// calling. We look for any of the explicit knobs LocalAI uses to drive
|
||||
// function-call extraction (regex match, response regex, grammar triggers, XML
|
||||
// format) or the auto-detected tool-format markers the llama.cpp backend
|
||||
// populates during model load.
|
||||
func (c *ModelConfig) ToolSupported() bool {
|
||||
fc := c.FunctionsConfig
|
||||
if fc.ToolFormatMarkers != nil && fc.ToolFormatMarkers.FormatType != "" {
|
||||
return true
|
||||
}
|
||||
if len(fc.JSONRegexMatch) > 0 || len(fc.ResponseRegex) > 0 {
|
||||
return true
|
||||
}
|
||||
if fc.XMLFormatPreset != "" || fc.XMLFormat != nil {
|
||||
return true
|
||||
}
|
||||
if len(fc.GrammarConfig.GrammarTriggers) > 0 || fc.GrammarConfig.SchemaType != "" {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ThinkingSupported reports whether the model has reasoning / thinking enabled.
|
||||
// LocalAI sets DisableReasoning=false (or leaves thinking markers configured)
|
||||
// when the backend probe reports that the model supports thinking.
|
||||
func (c *ModelConfig) ThinkingSupported() bool {
|
||||
rc := c.ReasoningConfig
|
||||
if rc.DisableReasoning != nil && !*rc.DisableReasoning {
|
||||
return true
|
||||
}
|
||||
if len(rc.ThinkingStartTokens) > 0 || len(rc.TagPairs) > 0 {
|
||||
// Explicit thinking markers imply support unless explicitly disabled.
|
||||
return rc.DisableReasoning == nil || !*rc.DisableReasoning
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// AudioInputSupported reports whether a chat/generation model accepts audio as
|
||||
// input (e.g. vLLM omni models). The signal is the vLLM per-prompt audio limit;
|
||||
// there is no FLAG_* for "chat model that hears audio", which is exactly why a
|
||||
// plain usecase list can't express it. Transcription models are handled
|
||||
// separately in InputModalities via FLAG_TRANSCRIPT.
|
||||
func (c *ModelConfig) AudioInputSupported() bool {
|
||||
return c.LimitMMPerPrompt.LimitAudioPerPrompt > 0
|
||||
}
|
||||
|
||||
// VideoInputSupported reports whether a chat/generation model accepts video as
|
||||
// input. The signal is the vLLM per-prompt video limit. Note this is distinct
|
||||
// from FLAG_VIDEO, which denotes video *generation* (diffusers) — an output
|
||||
// modality, not an input one.
|
||||
func (c *ModelConfig) VideoInputSupported() bool {
|
||||
return c.LimitMMPerPrompt.LimitVideoPerPrompt > 0
|
||||
}
|
||||
|
||||
// Capabilities returns the ordered list of capability strings the model
|
||||
// supports, using the canonical usecase vocabulary (chat, vision, transcript,
|
||||
// tts, embeddings, image, video, ...) plus the modifier capabilities "tools"
|
||||
// and "thinking". Vision is resolved via VisionSupported (not HasUsecases) to
|
||||
// avoid the guess-heuristic false positive.
|
||||
func (c *ModelConfig) Capabilities() []string {
|
||||
chat := c.HasUsecases(FLAG_CHAT)
|
||||
completion := c.HasUsecases(FLAG_COMPLETION)
|
||||
|
||||
var caps []string
|
||||
add := func(cond bool, name string) {
|
||||
if cond {
|
||||
caps = append(caps, name)
|
||||
}
|
||||
}
|
||||
|
||||
add(chat, UsecaseChat)
|
||||
add(completion, UsecaseCompletion)
|
||||
add(c.HasUsecases(FLAG_EDIT), UsecaseEdit)
|
||||
add(c.HasUsecases(FLAG_EMBEDDINGS), UsecaseEmbeddings)
|
||||
add(c.HasUsecases(FLAG_RERANK), UsecaseRerank)
|
||||
// Vision is only meaningful as an image-understanding modifier on a chat/
|
||||
// completion model. Gating on (chat||completion) matches the Ollama surface
|
||||
// and avoids a false positive when config defaults hydrate a MediaMarker on
|
||||
// a non-chat model (e.g. a pure ASR/TTS backend).
|
||||
add((chat || completion) && c.VisionSupported(), UsecaseVision)
|
||||
// tools/thinking are modifiers on the chat/completion surface.
|
||||
add((chat || completion) && c.ToolSupported(), "tools")
|
||||
add((chat || completion) && c.ThinkingSupported(), "thinking")
|
||||
add(c.HasUsecases(FLAG_TRANSCRIPT), UsecaseTranscript)
|
||||
add(c.HasUsecases(FLAG_TTS), UsecaseTTS)
|
||||
add(c.HasUsecases(FLAG_SOUND_GENERATION), UsecaseSoundGeneration)
|
||||
add(c.HasUsecases(FLAG_IMAGE), UsecaseImage)
|
||||
add(c.HasUsecases(FLAG_VIDEO), UsecaseVideo)
|
||||
add(c.HasUsecases(FLAG_VAD), UsecaseVAD)
|
||||
add(c.HasUsecases(FLAG_DETECTION), UsecaseDetection)
|
||||
add(c.HasUsecases(FLAG_DEPTH), UsecaseDepth)
|
||||
add(c.HasUsecases(FLAG_AUDIO_TRANSFORM), UsecaseAudioTransform)
|
||||
add(c.HasUsecases(FLAG_DIARIZATION), UsecaseDiarization)
|
||||
add(c.HasUsecases(FLAG_SOUND_CLASSIFICATION), UsecaseSoundClassification)
|
||||
add(c.HasUsecases(FLAG_REALTIME_AUDIO), UsecaseRealtimeAudio)
|
||||
add(c.HasUsecases(FLAG_FACE_RECOGNITION), UsecaseFaceRecognition)
|
||||
add(c.HasUsecases(FLAG_SPEAKER_RECOGNITION), UsecaseSpeakerRecognition)
|
||||
return caps
|
||||
}
|
||||
|
||||
// InputModalities returns the set of modalities (text, image, audio, video) the
|
||||
// model accepts as input, ordered text→image→audio→video. This is what an
|
||||
// attachment router consults to decide whether an image/audio/video file can be
|
||||
// handed to the active model directly.
|
||||
func (c *ModelConfig) InputModalities() []string {
|
||||
imageGen := c.HasUsecases(FLAG_IMAGE)
|
||||
videoGen := c.HasUsecases(FLAG_VIDEO)
|
||||
chatish := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION)
|
||||
|
||||
textIn := chatish || c.HasUsecases(FLAG_EDIT) ||
|
||||
c.HasUsecases(FLAG_EMBEDDINGS) || c.HasUsecases(FLAG_RERANK) || c.HasUsecases(FLAG_TOKENIZE) ||
|
||||
c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) || imageGen || videoGen
|
||||
|
||||
// Image input via a chat model requires vision (gated on chat, like the
|
||||
// Ollama surface); detection/depth/face models consume images directly.
|
||||
imageIn := (chatish && c.VisionSupported()) || c.LimitMMPerPrompt.LimitImagePerPrompt > 0 ||
|
||||
c.HasUsecases(FLAG_DETECTION) || c.HasUsecases(FLAG_DEPTH) || c.HasUsecases(FLAG_FACE_RECOGNITION)
|
||||
|
||||
audioIn := c.AudioInputSupported() || c.HasUsecases(FLAG_TRANSCRIPT) || c.HasUsecases(FLAG_AUDIO_TRANSFORM) ||
|
||||
c.HasUsecases(FLAG_REALTIME_AUDIO) || c.HasUsecases(FLAG_VAD) || c.HasUsecases(FLAG_DIARIZATION) ||
|
||||
c.HasUsecases(FLAG_SOUND_CLASSIFICATION) || c.HasUsecases(FLAG_SPEAKER_RECOGNITION)
|
||||
|
||||
videoIn := c.VideoInputSupported()
|
||||
|
||||
var mods []string
|
||||
if textIn {
|
||||
mods = append(mods, "text")
|
||||
}
|
||||
if imageIn {
|
||||
mods = append(mods, "image")
|
||||
}
|
||||
if audioIn {
|
||||
mods = append(mods, "audio")
|
||||
}
|
||||
if videoIn {
|
||||
mods = append(mods, "video")
|
||||
}
|
||||
return mods
|
||||
}
|
||||
|
||||
// OutputModalities returns the set of modalities (text, image, audio, video)
|
||||
// the model produces, ordered text→image→audio→video.
|
||||
func (c *ModelConfig) OutputModalities() []string {
|
||||
textOut := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION) || c.HasUsecases(FLAG_EDIT) ||
|
||||
c.HasUsecases(FLAG_TRANSCRIPT)
|
||||
imageOut := c.HasUsecases(FLAG_IMAGE)
|
||||
audioOut := c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) ||
|
||||
c.HasUsecases(FLAG_AUDIO_TRANSFORM) || c.HasUsecases(FLAG_REALTIME_AUDIO)
|
||||
videoOut := c.HasUsecases(FLAG_VIDEO)
|
||||
|
||||
var mods []string
|
||||
if textOut {
|
||||
mods = append(mods, "text")
|
||||
}
|
||||
if imageOut {
|
||||
mods = append(mods, "image")
|
||||
}
|
||||
if audioOut {
|
||||
mods = append(mods, "audio")
|
||||
}
|
||||
if videoOut {
|
||||
mods = append(mods, "video")
|
||||
}
|
||||
return mods
|
||||
}
|
||||
103
core/config/model_capabilities_test.go
Normal file
103
core/config/model_capabilities_test.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
func usecaseBits(flags ModelConfigUsecase) *ModelConfigUsecase {
|
||||
return &flags
|
||||
}
|
||||
|
||||
var _ = Describe("Model capabilities derivation", func() {
|
||||
Describe("VisionSupported", func() {
|
||||
It("is false for a plain text chat model", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
|
||||
Expect(cfg.VisionSupported()).To(BeFalse())
|
||||
})
|
||||
|
||||
It("is true when the FLAG_VISION bit is declared", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"}
|
||||
Expect(cfg.VisionSupported()).To(BeTrue())
|
||||
})
|
||||
|
||||
It("is true when an mmproj projector is set", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
|
||||
cfg.MMProj = "mmproj.gguf" // promoted field from the embedded options struct
|
||||
Expect(cfg.VisionSupported()).To(BeTrue())
|
||||
})
|
||||
|
||||
It("does not fall for the GuessUsecases FLAG_VISION false positive", func() {
|
||||
// A chat model with a chat template would make HasUsecases(FLAG_VISION)
|
||||
// return true via the guess heuristic; VisionSupported must not.
|
||||
cfg := &ModelConfig{Backend: "llama.cpp"}
|
||||
cfg.TemplateConfig.Chat = "{{.Input}}"
|
||||
Expect(cfg.VisionSupported()).To(BeFalse())
|
||||
})
|
||||
})
|
||||
|
||||
Describe("AudioInputSupported / VideoInputSupported", func() {
|
||||
It("detects vLLM omni audio input via limit_mm_per_prompt", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
|
||||
cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1
|
||||
Expect(cfg.AudioInputSupported()).To(BeTrue())
|
||||
Expect(cfg.VideoInputSupported()).To(BeFalse())
|
||||
})
|
||||
|
||||
It("detects vLLM omni video input via limit_mm_per_prompt", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
|
||||
cfg.LimitMMPerPrompt.LimitVideoPerPrompt = 2
|
||||
Expect(cfg.VideoInputSupported()).To(BeTrue())
|
||||
})
|
||||
})
|
||||
|
||||
Describe("Capabilities + modalities", func() {
|
||||
It("a text-only chat model exposes chat and text-only modalities", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
|
||||
Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat))
|
||||
Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseVision))
|
||||
Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseTranscript))
|
||||
Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
|
||||
Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
|
||||
})
|
||||
|
||||
It("a vision chat model accepts text+image input", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"}
|
||||
Expect(cfg.Capabilities()).To(ContainElements(UsecaseChat, UsecaseVision))
|
||||
Expect(cfg.InputModalities()).To(Equal([]string{"text", "image"}))
|
||||
Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
|
||||
})
|
||||
|
||||
It("an omni chat model accepts text+audio input without an audio capability flag", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
|
||||
cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1
|
||||
// audio-in is a modality, not a usecase string — this is exactly the
|
||||
// case a plain capability list cannot express.
|
||||
Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat))
|
||||
Expect(cfg.InputModalities()).To(Equal([]string{"text", "audio"}))
|
||||
})
|
||||
|
||||
It("a transcription model reads audio and writes text", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TRANSCRIPT), Backend: "parakeet-cpp"}
|
||||
Expect(cfg.Capabilities()).To(Equal([]string{UsecaseTranscript}))
|
||||
Expect(cfg.InputModalities()).To(Equal([]string{"audio"}))
|
||||
Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
|
||||
})
|
||||
|
||||
It("an image-generation model reads text and writes an image", func() {
|
||||
// stablediffusion-ggml is image-only; plain "stablediffusion" is also
|
||||
// in GuessUsecases' video-backend list, so it would report video too.
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_IMAGE), Backend: "stablediffusion-ggml"}
|
||||
Expect(cfg.Capabilities()).To(Equal([]string{UsecaseImage}))
|
||||
Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
|
||||
Expect(cfg.OutputModalities()).To(Equal([]string{"image"}))
|
||||
})
|
||||
|
||||
It("a TTS model reads text and writes audio", func() {
|
||||
cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TTS), Backend: "piper"}
|
||||
Expect(cfg.Capabilities()).To(ContainElement(UsecaseTTS))
|
||||
Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
|
||||
Expect(cfg.OutputModalities()).To(Equal([]string{"audio"}))
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -49,62 +49,23 @@ func modelCapabilities(cfg *config.ModelConfig) []string {
|
||||
return caps
|
||||
}
|
||||
|
||||
// hasVisionSupport reports whether the model can accept image inputs. We avoid
|
||||
// cfg.HasUsecases(FLAG_VISION) because GuessUsecases has no FLAG_VISION case
|
||||
// and returns true for any chat model — see core/config/model_config.go. Instead
|
||||
// we look for explicit signals: KnownUsecases bit, multimodal projector, or
|
||||
// template/backend-reported multimodal markers.
|
||||
// hasVisionSupport reports whether the model can accept image inputs.
|
||||
// The detection heuristic is the canonical config.ModelConfig.VisionSupported —
|
||||
// kept as a thin wrapper here so the Ollama capability mapping reads cleanly.
|
||||
func hasVisionSupport(cfg *config.ModelConfig) bool {
|
||||
if cfg.KnownUsecases != nil && (*cfg.KnownUsecases&config.FLAG_VISION) == config.FLAG_VISION {
|
||||
return true
|
||||
}
|
||||
if cfg.MMProj != "" {
|
||||
return true
|
||||
}
|
||||
if cfg.TemplateConfig.Multimodal != "" {
|
||||
return true
|
||||
}
|
||||
if cfg.MediaMarker != "" {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
return cfg.VisionSupported()
|
||||
}
|
||||
|
||||
// hasToolSupport reports whether the model is wired up for tool / function calling.
|
||||
// We look for any of the explicit configuration knobs LocalAI uses to drive
|
||||
// function-call extraction (regex match, response regex, grammar triggers, XML
|
||||
// format) or for the auto-detected tool-format markers populated by the
|
||||
// llama.cpp backend during model load.
|
||||
// hasToolSupport reports whether the model is wired up for tool / function
|
||||
// calling. Delegates to the canonical config.ModelConfig.ToolSupported.
|
||||
func hasToolSupport(cfg *config.ModelConfig) bool {
|
||||
fc := cfg.FunctionsConfig
|
||||
if fc.ToolFormatMarkers != nil && fc.ToolFormatMarkers.FormatType != "" {
|
||||
return true
|
||||
}
|
||||
if len(fc.JSONRegexMatch) > 0 || len(fc.ResponseRegex) > 0 {
|
||||
return true
|
||||
}
|
||||
if fc.XMLFormatPreset != "" || fc.XMLFormat != nil {
|
||||
return true
|
||||
}
|
||||
if len(fc.GrammarConfig.GrammarTriggers) > 0 || fc.GrammarConfig.SchemaType != "" {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
return cfg.ToolSupported()
|
||||
}
|
||||
|
||||
// hasThinkingSupport reports whether the model has reasoning / thinking enabled.
|
||||
// LocalAI sets DisableReasoning=false (or leaves thinking markers configured)
|
||||
// when the backend probe reports that the model supports thinking.
|
||||
// Delegates to the canonical config.ModelConfig.ThinkingSupported.
|
||||
func hasThinkingSupport(cfg *config.ModelConfig) bool {
|
||||
rc := cfg.ReasoningConfig
|
||||
if rc.DisableReasoning != nil && !*rc.DisableReasoning {
|
||||
return true
|
||||
}
|
||||
if len(rc.ThinkingStartTokens) > 0 || len(rc.TagPairs) > 0 {
|
||||
// Explicit thinking markers imply support unless explicitly disabled.
|
||||
return rc.DisableReasoning == nil || !*rc.DisableReasoning
|
||||
}
|
||||
return false
|
||||
return cfg.ThinkingSupported()
|
||||
}
|
||||
|
||||
// quantRegex matches GGUF-style quantization suffixes (Q4_K_M, Q8_0, IQ3_XS, F16, ...).
|
||||
|
||||
@@ -21,48 +21,11 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap
|
||||
authDB = db[0]
|
||||
}
|
||||
return func(c echo.Context) error {
|
||||
// If blank, no filter is applied.
|
||||
filter := c.QueryParam("filter")
|
||||
|
||||
// By default, exclude any loose files that are already referenced by a configuration file.
|
||||
var policy galleryop.LooseFilePolicy
|
||||
excludeConfigured := c.QueryParam("excludeConfigured")
|
||||
if excludeConfigured == "" || excludeConfigured == "true" {
|
||||
policy = galleryop.SKIP_IF_CONFIGURED
|
||||
} else {
|
||||
policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
|
||||
}
|
||||
|
||||
filterFn, err := config.BuildNameFilterFn(filter)
|
||||
modelNames, err := listVisibleModelNames(c, bcl, ml, authDB)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
modelNames, err := galleryop.ListModels(bcl, ml, filterFn, policy)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Filter models by user's allowlist if auth is enabled
|
||||
if authDB != nil {
|
||||
if user := auth.GetUser(c); user != nil && user.Role != auth.RoleAdmin {
|
||||
perm, err := auth.GetCachedUserPermissions(c, authDB, user.ID)
|
||||
if err == nil && perm.AllowedModels.Enabled {
|
||||
allowed := map[string]bool{}
|
||||
for _, m := range perm.AllowedModels.Models {
|
||||
allowed[m] = true
|
||||
}
|
||||
filtered := make([]string, 0, len(modelNames))
|
||||
for _, m := range modelNames {
|
||||
if allowed[m] {
|
||||
filtered = append(filtered, m)
|
||||
}
|
||||
}
|
||||
modelNames = filtered
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Map from a slice of names to a slice of OpenAIModel response objects
|
||||
dataModels := []schema.OpenAIModel{}
|
||||
for _, m := range modelNames {
|
||||
@@ -75,3 +38,53 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// listVisibleModelNames resolves the model names visible to the caller, applying
|
||||
// the same query filters (filter, excludeConfigured) and per-user allowlist as
|
||||
// the OpenAI models listing. Shared by ListModelsEndpoint and
|
||||
// ListModelCapabilitiesEndpoint so both stay consistent.
|
||||
func listVisibleModelNames(c echo.Context, bcl *config.ModelConfigLoader, ml *model.ModelLoader, authDB *gorm.DB) ([]string, error) {
|
||||
// If blank, no filter is applied.
|
||||
filter := c.QueryParam("filter")
|
||||
|
||||
// By default, exclude any loose files that are already referenced by a configuration file.
|
||||
var policy galleryop.LooseFilePolicy
|
||||
excludeConfigured := c.QueryParam("excludeConfigured")
|
||||
if excludeConfigured == "" || excludeConfigured == "true" {
|
||||
policy = galleryop.SKIP_IF_CONFIGURED
|
||||
} else {
|
||||
policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
|
||||
}
|
||||
|
||||
filterFn, err := config.BuildNameFilterFn(filter)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
modelNames, err := galleryop.ListModels(bcl, ml, filterFn, policy)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Filter models by user's allowlist if auth is enabled
|
||||
if authDB != nil {
|
||||
if user := auth.GetUser(c); user != nil && user.Role != auth.RoleAdmin {
|
||||
perm, err := auth.GetCachedUserPermissions(c, authDB, user.ID)
|
||||
if err == nil && perm.AllowedModels.Enabled {
|
||||
allowed := map[string]bool{}
|
||||
for _, m := range perm.AllowedModels.Models {
|
||||
allowed[m] = true
|
||||
}
|
||||
filtered := make([]string, 0, len(modelNames))
|
||||
for _, m := range modelNames {
|
||||
if allowed[m] {
|
||||
filtered = append(filtered, m)
|
||||
}
|
||||
}
|
||||
modelNames = filtered
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return modelNames, nil
|
||||
}
|
||||
|
||||
50
core/http/endpoints/openai/list_capabilities.go
Normal file
50
core/http/endpoints/openai/list_capabilities.go
Normal file
@@ -0,0 +1,50 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"github.com/labstack/echo/v4"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
model "github.com/mudler/LocalAI/pkg/model"
|
||||
"gorm.io/gorm"
|
||||
)
|
||||
|
||||
// ListModelCapabilitiesEndpoint is a LocalAI-specific extension of the OpenAI
|
||||
// models listing. It returns the same set of models as /v1/models but enriches
|
||||
// each entry with the capabilities and input/output modalities the model
|
||||
// supports, so clients can decide whether an image/audio/video attachment can be
|
||||
// handed to a given model directly (or must be converted/transcribed first).
|
||||
//
|
||||
// It is purely additive: clients that don't know about it keep using /v1/models
|
||||
// and see no change.
|
||||
// @Summary List available models enriched with capabilities and input/output modalities.
|
||||
// @Tags models
|
||||
// @Success 200 {object} schema.ModelCapabilitiesResponse "Response"
|
||||
// @Router /v1/models/capabilities [get]
|
||||
func ListModelCapabilitiesEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, db ...*gorm.DB) echo.HandlerFunc {
|
||||
var authDB *gorm.DB
|
||||
if len(db) > 0 {
|
||||
authDB = db[0]
|
||||
}
|
||||
return func(c echo.Context) error {
|
||||
modelNames, err := listVisibleModelNames(c, bcl, ml, authDB)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
dataModels := []schema.ModelCapabilities{}
|
||||
for _, m := range modelNames {
|
||||
entry := schema.ModelCapabilities{ID: m, Object: "model"}
|
||||
if cfg, ok := bcl.GetModelConfig(m); ok {
|
||||
entry.Capabilities = cfg.Capabilities()
|
||||
entry.InputModalities = cfg.InputModalities()
|
||||
entry.OutputModalities = cfg.OutputModalities()
|
||||
}
|
||||
dataModels = append(dataModels, entry)
|
||||
}
|
||||
|
||||
return c.JSON(200, schema.ModelCapabilitiesResponse{
|
||||
Object: "list",
|
||||
Data: dataModels,
|
||||
})
|
||||
}
|
||||
}
|
||||
119
core/http/endpoints/openai/list_capabilities_test.go
Normal file
119
core/http/endpoints/openai/list_capabilities_test.go
Normal file
@@ -0,0 +1,119 @@
|
||||
package openai
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/system"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("ListModelCapabilitiesEndpoint", func() {
|
||||
var (
|
||||
e *echo.Echo
|
||||
tmpDir string
|
||||
bcl *config.ModelConfigLoader
|
||||
ml *model.ModelLoader
|
||||
appConf *config.ApplicationConfig
|
||||
)
|
||||
|
||||
BeforeEach(func() {
|
||||
var err error
|
||||
e = echo.New()
|
||||
tmpDir, err = os.MkdirTemp("", "models-caps-test-*")
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
|
||||
st, err := system.GetSystemState(system.WithModelPath(tmpDir))
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
ml = model.NewModelLoader(st)
|
||||
bcl = config.NewModelConfigLoader(tmpDir)
|
||||
appConf = config.NewApplicationConfig()
|
||||
})
|
||||
|
||||
AfterEach(func() {
|
||||
_ = os.RemoveAll(tmpDir)
|
||||
})
|
||||
|
||||
writeConfig := func(name, yaml string) {
	// Persist the YAML as <tmpDir>/<name>.yaml, then have the config loader
	// read that same file; either step failing fails the spec.
	cfgFile := filepath.Join(tmpDir, name+".yaml")
	writeErr := os.WriteFile(cfgFile, []byte(yaml), 0o644)
	Expect(writeErr).To(Succeed())
	Expect(bcl.ReadModelConfig(cfgFile)).To(Succeed())
}
|
||||
|
||||
// call exercises the endpoint with auth disabled (no auth DB), which is the
|
||||
// standard deployment path. The per-user allowlist branch is shared verbatim
|
||||
// with ListModelsEndpoint (listVisibleModelNames) and covered there.
|
||||
call := func() schema.ModelCapabilitiesResponse {
|
||||
req := httptest.NewRequest(http.MethodGet, "/v1/models/capabilities", nil)
|
||||
rec := httptest.NewRecorder()
|
||||
c := e.NewContext(req, rec)
|
||||
|
||||
handler := ListModelCapabilitiesEndpoint(bcl, ml, appConf)
|
||||
Expect(handler(c)).To(Succeed())
|
||||
Expect(rec.Code).To(Equal(http.StatusOK))
|
||||
|
||||
var resp schema.ModelCapabilitiesResponse
|
||||
Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
|
||||
return resp
|
||||
}
|
||||
|
||||
entryFor := func(resp schema.ModelCapabilitiesResponse, id string) *schema.ModelCapabilities {
	// Linear scan; yields a pointer to the first element of resp.Data whose
	// ID matches, or nil when no element does.
	for idx, candidate := range resp.Data {
		if candidate.ID != id {
			continue
		}
		return &resp.Data[idx]
	}
	return nil
}
|
||||
|
||||
It("returns the list envelope even with no models", func() {
|
||||
resp := call()
|
||||
Expect(resp.Object).To(Equal("list"))
|
||||
})
|
||||
|
||||
It("enriches a vision chat model with capabilities and image input modality", func() {
|
||||
writeConfig("vlm", `
|
||||
name: vlm
|
||||
backend: llama-cpp
|
||||
known_usecases:
|
||||
- FLAG_CHAT
|
||||
- FLAG_VISION
|
||||
template:
|
||||
chat: "{{ .Input }}"
|
||||
parameters:
|
||||
model: qwen2.5-vl-Q4_K_M.gguf
|
||||
`)
|
||||
entry := entryFor(call(), "vlm")
|
||||
Expect(entry).NotTo(BeNil())
|
||||
Expect(entry.Object).To(Equal("model"))
|
||||
Expect(entry.Capabilities).To(ContainElements("chat", "vision"))
|
||||
Expect(entry.InputModalities).To(ContainElements("text", "image"))
|
||||
Expect(entry.OutputModalities).To(ContainElement("text"))
|
||||
})
|
||||
|
||||
It("marks a parakeet model as an audio-in/text-out transcription model", func() {
|
||||
writeConfig("parakeet", `
|
||||
name: parakeet
|
||||
backend: parakeet-cpp
|
||||
known_usecases:
|
||||
- FLAG_TRANSCRIPT
|
||||
parameters:
|
||||
model: parakeet-tdt-0.6b
|
||||
`)
|
||||
entry := entryFor(call(), "parakeet")
|
||||
Expect(entry).NotTo(BeNil())
|
||||
Expect(entry.Capabilities).To(ContainElement("transcript"))
|
||||
Expect(entry.InputModalities).To(Equal([]string{"audio"}))
|
||||
Expect(entry.OutputModalities).To(Equal([]string{"text"}))
|
||||
Expect(entry.Capabilities).NotTo(ContainElement("chat"))
|
||||
})
|
||||
})
|
||||
@@ -272,25 +272,27 @@ func RegisterLocalAIRoutes(router *echo.Echo,
|
||||
"version": internal.PrintableVersion(),
|
||||
// Flat endpoint list for backwards compatibility
|
||||
"endpoints": map[string]any{
|
||||
"models": "/v1/models",
|
||||
"chat_completions": "/v1/chat/completions",
|
||||
"completions": "/v1/completions",
|
||||
"embeddings": "/v1/embeddings",
|
||||
"config_metadata": "/api/models/config-metadata",
|
||||
"config_json": "/api/models/config-json/:name",
|
||||
"config_patch": "/api/models/config-json/:name",
|
||||
"autocomplete": "/api/models/config-metadata/autocomplete/:provider",
|
||||
"vram_estimate": "/api/models/vram-estimate",
|
||||
"tts": "/tts",
|
||||
"transcription": "/v1/audio/transcriptions",
|
||||
"image_generation": "/v1/images/generations",
|
||||
"swagger": "/swagger/index.html",
|
||||
"instructions": "/api/instructions",
|
||||
"models": "/v1/models",
|
||||
"models_capabilities": "/v1/models/capabilities",
|
||||
"chat_completions": "/v1/chat/completions",
|
||||
"completions": "/v1/completions",
|
||||
"embeddings": "/v1/embeddings",
|
||||
"config_metadata": "/api/models/config-metadata",
|
||||
"config_json": "/api/models/config-json/:name",
|
||||
"config_patch": "/api/models/config-json/:name",
|
||||
"autocomplete": "/api/models/config-metadata/autocomplete/:provider",
|
||||
"vram_estimate": "/api/models/vram-estimate",
|
||||
"tts": "/tts",
|
||||
"transcription": "/v1/audio/transcriptions",
|
||||
"image_generation": "/v1/images/generations",
|
||||
"swagger": "/swagger/index.html",
|
||||
"instructions": "/api/instructions",
|
||||
},
|
||||
// Categorized endpoint groups for structured discovery
|
||||
"endpoint_groups": map[string]any{
|
||||
"openai_compatible": map[string]string{
|
||||
"models": "/v1/models",
|
||||
"models_capabilities": "/v1/models/capabilities",
|
||||
"chat_completions": "/v1/chat/completions",
|
||||
"completions": "/v1/completions",
|
||||
"embeddings": "/v1/embeddings",
|
||||
|
||||
@@ -257,4 +257,10 @@ func RegisterOpenAIRoutes(app *echo.Echo,
|
||||
// List models
|
||||
app.GET("/v1/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB()))
|
||||
app.GET("/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB()))
|
||||
|
||||
// List models enriched with capabilities + input/output modalities
|
||||
// (LocalAI-specific, additive superset of /v1/models).
|
||||
capabilitiesHandler := openai.ListModelCapabilitiesEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB())
|
||||
app.GET("/v1/models/capabilities", capabilitiesHandler)
|
||||
app.GET("/models/capabilities", capabilitiesHandler)
|
||||
}
|
||||
|
||||
@@ -251,3 +251,27 @@ type ModelsDataResponse struct {
|
||||
Object string `json:"object"`
|
||||
Data []OpenAIModel `json:"data"`
|
||||
}
|
||||
|
||||
// ModelCapabilities is a strict superset of OpenAIModel that additionally
|
||||
// describes what a model can do and which modalities it accepts/produces. It is
|
||||
// served by the LocalAI-specific /v1/models/capabilities endpoint so clients can
|
||||
// route attachments (image/audio/video) to a model only when it can handle them.
|
||||
type ModelCapabilities struct {
|
||||
ID string `json:"id"`
|
||||
Object string `json:"object"`
|
||||
// Capabilities are canonical usecase strings (e.g. chat, vision, transcript,
|
||||
// tts, embeddings, image, video) plus the modifiers "tools" and "thinking".
|
||||
Capabilities []string `json:"capabilities"`
|
||||
// InputModalities is the subset of {text,image,audio,video} the model accepts.
|
||||
InputModalities []string `json:"input_modalities"`
|
||||
// OutputModalities is the subset of {text,image,audio,video} the model produces.
|
||||
OutputModalities []string `json:"output_modalities"`
|
||||
}
|
||||
|
||||
// ModelCapabilitiesResponse is the envelope returned by /v1/models/capabilities.
|
||||
// It mirrors ModelsDataResponse so a client can treat it as an enriched
|
||||
// drop-in for /v1/models.
|
||||
type ModelCapabilitiesResponse struct {
|
||||
Object string `json:"object"`
|
||||
Data []ModelCapabilities `json:"data"`
|
||||
}
|
||||
|
||||
@@ -36,6 +36,7 @@ Returns the instance version, all available endpoint URLs (flat and categorized)
|
||||
"endpoints": {
|
||||
"chat_completions": "/v1/chat/completions",
|
||||
"models": "/v1/models",
|
||||
"models_capabilities": "/v1/models/capabilities",
|
||||
"config_metadata": "/api/models/config-metadata",
|
||||
"instructions": "/api/instructions",
|
||||
"swagger": "/swagger/index.html"
|
||||
@@ -123,6 +124,45 @@ Add `?format=json` to get a raw **OpenAPI fragment** (filtered Swagger spec with
|
||||
curl http://localhost:8080/api/instructions/config-management?format=json
|
||||
```
|
||||
|
||||
## Model Capabilities
|
||||
|
||||
`GET /v1/models/capabilities`
|
||||
|
||||
An additive, LocalAI-specific superset of `/v1/models`. It returns the same set of models but enriches each entry with the **capabilities** the model supports and the **input/output modalities** it accepts and produces. Use it to decide, before sending a request, whether a given model can take an image, audio, or video attachment directly — or whether the input needs converting/transcribing first.
|
||||
|
||||
Because it is purely additive, clients that only understand `/v1/models` keep working unchanged; they simply never call this route.
|
||||
|
||||
```bash
|
||||
curl http://localhost:8080/v1/models/capabilities
|
||||
```
|
||||
|
||||
```json
|
||||
{
|
||||
"object": "list",
|
||||
"data": [
|
||||
{
|
||||
"id": "qwen2.5-omni",
|
||||
"object": "model",
|
||||
"capabilities": ["chat", "vision", "tools"],
|
||||
"input_modalities": ["text", "image", "audio"],
|
||||
"output_modalities": ["text"]
|
||||
},
|
||||
{
|
||||
"id": "parakeet",
|
||||
"object": "model",
|
||||
"capabilities": ["transcript"],
|
||||
"input_modalities": ["audio"],
|
||||
"output_modalities": ["text"]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
- **`capabilities`** — canonical usecase strings (e.g. `chat`, `vision`, `transcript`, `tts`, `embeddings`, `image`, `video`) plus the modifiers `tools` and `thinking`.
|
||||
- **`input_modalities` / `output_modalities`** — the subsets of `{text, image, audio, video}` the model accepts and produces. Audio and video *input* are derived from the model's multimodal limits (e.g. vLLM `limit_mm_per_prompt`) rather than from any single usecase flag, which is why this endpoint exists alongside the plain listing.
|
||||
|
||||
The same query parameters as `/v1/models` are honored (`filter`, `excludeConfigured`), and the same per-user model allowlist is applied when authentication is enabled.
|
||||
|
||||
## Configuration Management APIs
|
||||
|
||||
These endpoints let agents discover model configuration fields, read current settings, modify them, and estimate VRAM usage.
|
||||
|
||||
@@ -17,6 +17,7 @@ You can see the release notes [here](https://github.com/mudler/LocalAI/releases)
|
||||
- **May 2026**: [Speaker diarization](/features/audio-diarization/) — new `/v1/audio/diarization` endpoint returning "who spoke when" segments. Backed by `sherpa-onnx` (pyannote-3.0 + speaker embeddings + clustering) for pure diarization, and `vibevoice-cpp` for diarization bundled with long-form ASR. Supports `json` / `verbose_json` / `rttm` response formats.
|
||||
- **June 2026**: [Sound classification](/features/audio-classification/) — new `/v1/audio/classification` endpoint for audio tagging / sound-event classification, returning scored [AudioSet](https://research.google.com/audioset/) labels (baby cry, glass breaking, alarms, ...). Backed by [ced.cpp](https://github.com/mudler/ced.cpp), a 527-class AudioSet tagger ported to ggml.
|
||||
- **June 2026**: [PII analyze / redact API](/features/middleware/#analyze--redact-api) — the PII detection pipeline (NER + restricted-regex pattern tiers) is now a standalone service: `POST /api/pii/analyze` returns detected entity spans and `POST /api/pii/redact` returns the sanitised text (or `400 pii_blocked`), without routing a chat request through the middleware. Events gain an `origin` (`middleware` / `proxy` / `pii_analyze` / `pii_redact`) so `/api/pii/events` can be filtered by source.
|
||||
- **July 2026**: [Model capabilities endpoint](/features/api-discovery/#model-capabilities) — `GET /v1/models/capabilities`, an additive superset of `/v1/models` that reports each model's `capabilities` plus its `input_modalities` / `output_modalities` (`text` / `image` / `audio` / `video`). Lets clients route image/audio/video attachments to a model only when it can handle them; audio/video *input* is derived from the model's multimodal limits, which no single usecase flag expresses.
|
||||
- **June 2026**: Concurrent scoring and PII NER on llama.cpp — the `Score` (router classifier) and `TokenClassify` (PII NER) primitives now ride llama.cpp's server task queue instead of locking the context, so they run concurrently with chat/completion/embedding traffic and with each other. The `known_usecases` restriction that forced dedicated scorer/NER model configs on llama-cpp is lifted, repeated scoring calls reuse the prompt KV cache across candidates, and scoring inputs are no longer capped by the physical batch size.
|
||||
|
||||
## 2024 Highlights
|
||||
|
||||
57
scripts/build/package-gpu-libs-rocm-data_test.sh
Executable file
57
scripts/build/package-gpu-libs-rocm-data_test.sh
Executable file
@@ -0,0 +1,57 @@
|
||||
#!/bin/bash
# Regression test for the ROCm data bundling done by
# scripts/build/package-gpu-libs.sh (issue #10660).
#
# Background: hipBLASLt (rocblaslt) looks up its TensileLibrary_lazy_gfx*.dat
# kernel data relative to the bundled libhipblaslt.so. The packager used to copy
# only the rocblas/ data dir and not hipblaslt/, so the bundled backend fell
# back to slow generic kernels and logged
#   rocblaslt error: Cannot read "TensileLibrary_lazy_gfx1201.dat": No such file or directory
#
# Approach: build a throwaway ROCm tree holding tensile data for both rocblas/
# and hipblaslt/, direct the packager to it through ROCM_BASE_DIRS, and require
# that BOTH data directories end up in the target lib dir.
set -euo pipefail

CURDIR=$(dirname "$(realpath "$0")")
SCRIPT="$CURDIR/package-gpu-libs.sh"

# Scratch area, removed on any exit path.
WORK=$(mktemp -d)
trap 'rm -rf "$WORK"' EXIT

# Fake ROCm install: one tensile data file per library.
FAKE_ROCM="$WORK/opt/rocm"
for fixture_name in rocblas hipblaslt; do
    mkdir -p "$FAKE_ROCM/lib/$fixture_name/library"
    echo "fake $fixture_name tensile" > "$FAKE_ROCM/lib/$fixture_name/library/TensileLibrary_lazy_gfx1201.dat"
done
unset fixture_name

TARGET="$WORK/target"
mkdir -p "$TARGET"

# shellcheck source=/dev/null
source "$SCRIPT" "$TARGET"

# Run the real ROCm packager against the fabricated tree rather than /opt/rocm,
# so the assertion covers package_rocm_libs itself and not only its helper.
export BUILD_TYPE=hipblas
export ROCM_BASE_DIRS="$FAKE_ROCM"
package_rocm_libs

fail=false

# Args: $1 = data dir name under $TARGET, $2 = message printed when missing.
require_bundled() {
    if [ ! -e "$TARGET/$1/library/TensileLibrary_lazy_gfx1201.dat" ]; then
        echo "$2"
        fail=true
    fi
}

require_bundled rocblas "FAIL: rocblas tensile data was NOT bundled"
require_bundled hipblaslt "FAIL: hipblaslt tensile data was NOT bundled (regression of #10660)"

if [ "$fail" != true ]; then
    echo "PASS: rocblas and hipblaslt tensile data were both bundled"
    exit 0
fi

# Dump what was actually packaged to help diagnose the failure.
ls -R "$TARGET" || true
exit 1
|
||||
@@ -224,6 +224,50 @@ package_cuda_libs() {
|
||||
echo "CUDA libraries packaged successfully"
|
||||
}
|
||||
|
||||
# Copy a ROCm library data subdirectory (e.g. rocblas, hipblaslt) into the
# bundled lib/ dir. These directories hold the TensileLibrary_*.dat GPU kernel
# tuning files, which rocBLAS/hipBLASLt load at runtime *relative to their own
# .so*. Since backends ship their own copies of libhipblaslt.so/librocblas.so
# under lib/, the matching data dir must travel with them or the libs fall back
# to slow generic kernels (rocblaslt error: Cannot read TensileLibrary_lazy_gfx*.dat;
# see issue #10660).
#
# The ROCm search roots default to /opt/rocm{,-*} but can be overridden via the
# ROCM_BASE_DIRS env var (space-separated), which keeps the copy unit-testable
# without a real ROCm install.
#
# Args:    $1 = data subdir name found under <rocm-root>/lib{,64}/
# Reads:   TARGET_LIB_DIR (destination root), ROCM_BASE_DIRS (optional override)
# Returns: 1 if $1 is missing/empty; 0 otherwise. A failed copy or a missing
#          data dir only prints a WARNING (best-effort packaging).
copy_rocm_data_dir() {
    # An empty name would make "$rocm_base/lib/" itself match below and copy
    # the whole library directory, so reject it explicitly.
    if [ -z "${1:-}" ]; then
        echo "ERROR: copy_rocm_data_dir requires a data subdir name" >&2
        return 1
    fi
    local data_name="$1"
    # Single-line `local x=$(...)` on purpose: `local` masks the command
    # substitution's exit status, which is 1 when nullglob is unset and would
    # otherwise trip the script's `set -e`.
    local old_nullglob=$(shopt -p nullglob)
    # nullglob makes an unmatched /opt/rocm-* expand to nothing instead of
    # staying in the list as a literal pattern.
    shopt -s nullglob
    local rocm_dirs
    if [ -n "${ROCM_BASE_DIRS:-}" ]; then
        # shellcheck disable=SC2206 # intentional word-split of the override
        rocm_dirs=(${ROCM_BASE_DIRS})
    else
        rocm_dirs=(/opt/rocm /opt/rocm-*)
    fi
    # Restore the caller's nullglob setting.
    eval "$old_nullglob"
    local found=false
    local rocm_base lib_subdir src_dir
    for rocm_base in "${rocm_dirs[@]}"; do
        for lib_subdir in lib lib64; do
            src_dir="$rocm_base/$lib_subdir/$data_name"
            if [ -d "$src_dir" ]; then
                echo "Found $data_name data at $src_dir"
                mkdir -p "$TARGET_LIB_DIR/$data_name"
                # Copy "<dir>/." rather than "<dir>/"*: the glob form fails on
                # an empty directory (the unmatched `*` is passed literally to
                # cp) and skips dotfiles.
                cp -arfL "$src_dir/." "$TARGET_LIB_DIR/$data_name/" || echo "WARNING: Failed to copy $data_name data from $src_dir"
                found=true
            fi
        done
    done
    if [ "$found" = false ]; then
        echo "WARNING: No $data_name library data found in ${ROCM_BASE_DIRS:-/opt/rocm*}/lib{,64}/$data_name"
    fi
}
|
||||
|
||||
# Package AMD ROCm/HIPBlas libraries
|
||||
package_rocm_libs() {
|
||||
echo "Packaging ROCm/HIPBlas libraries for BUILD_TYPE=${BUILD_TYPE}..."
|
||||
@@ -267,27 +311,16 @@ package_rocm_libs() {
|
||||
fi
|
||||
done
|
||||
|
||||
# Copy rocblas library data (tuning files, TensileLibrary, etc.)
|
||||
local old_nullglob=$(shopt -p nullglob)
|
||||
shopt -s nullglob
|
||||
local rocm_dirs=(/opt/rocm /opt/rocm-*)
|
||||
eval "$old_nullglob"
|
||||
local rocblas_found=false
|
||||
for rocm_base in "${rocm_dirs[@]}"; do
|
||||
for lib_subdir in lib lib64; do
|
||||
if [ -d "$rocm_base/$lib_subdir/rocblas" ]; then
|
||||
echo "Found rocblas data at $rocm_base/$lib_subdir/rocblas"
|
||||
mkdir -p "$TARGET_LIB_DIR/rocblas"
|
||||
cp -arfL "$rocm_base/$lib_subdir/rocblas/"* "$TARGET_LIB_DIR/rocblas/" || echo "WARNING: Failed to copy rocblas data from $rocm_base/$lib_subdir/rocblas"
|
||||
rocblas_found=true
|
||||
fi
|
||||
done
|
||||
done
|
||||
if [ "$rocblas_found" = false ]; then
|
||||
echo "WARNING: No rocblas library data found in /opt/rocm*/lib{,64}/rocblas"
|
||||
fi
|
||||
# Copy rocBLAS and hipBLASLt kernel data (TensileLibrary_*.dat tuning files)
|
||||
# so the bundled libs find their per-arch kernels at runtime instead of
|
||||
# falling back to slow generic code (see copy_rocm_data_dir / issue #10660).
|
||||
copy_rocm_data_dir rocblas
|
||||
copy_rocm_data_dir hipblaslt
|
||||
|
||||
# Copy libomp from LLVM (required for ROCm)
|
||||
# Single-line `local x=$(...)` on purpose: masks shopt -p's nonzero exit
|
||||
# (nullglob unset) so it doesn't trip `set -e`.
|
||||
local old_nullglob=$(shopt -p nullglob)
|
||||
shopt -s nullglob
|
||||
local omp_libs=(/opt/rocm*/lib/llvm/lib/libomp.so*)
|
||||
eval "$old_nullglob"
|
||||
@@ -477,6 +510,7 @@ export -f copy_libs_glob
|
||||
export -f is_core_lib
|
||||
export -f copy_elf_deps
|
||||
export -f sweep_transitive_deps
|
||||
export -f copy_rocm_data_dir
|
||||
export -f package_cuda_libs
|
||||
export -f package_rocm_libs
|
||||
export -f package_intel_libs
|
||||
|
||||
@@ -2728,6 +2728,22 @@ const docTemplate = `{
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/models/capabilities": {
|
||||
"get": {
|
||||
"tags": [
|
||||
"models"
|
||||
],
|
||||
"summary": "List available models enriched with capabilities and input/output modalities.",
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Response",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/schema.ModelCapabilitiesResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/rerank": {
|
||||
"post": {
|
||||
"tags": [
|
||||
@@ -5182,6 +5198,52 @@ const docTemplate = `{
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema.ModelCapabilities": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"capabilities": {
|
||||
"description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"input_modalities": {
|
||||
"description": "InputModalities is the subset of {text,image,audio,video} the model accepts.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"object": {
|
||||
"type": "string"
|
||||
},
|
||||
"output_modalities": {
|
||||
"description": "OutputModalities is the subset of {text,image,audio,video} the model produces.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema.ModelCapabilitiesResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.ModelCapabilities"
|
||||
}
|
||||
},
|
||||
"object": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema.ModelLoadRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
||||
@@ -2725,6 +2725,22 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/models/capabilities": {
|
||||
"get": {
|
||||
"tags": [
|
||||
"models"
|
||||
],
|
||||
"summary": "List available models enriched with capabilities and input/output modalities.",
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Response",
|
||||
"schema": {
|
||||
"$ref": "#/definitions/schema.ModelCapabilitiesResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/v1/rerank": {
|
||||
"post": {
|
||||
"tags": [
|
||||
@@ -5179,6 +5195,52 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema.ModelCapabilities": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"capabilities": {
|
||||
"description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"input_modalities": {
|
||||
"description": "InputModalities is the subset of {text,image,audio,video} the model accepts.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"object": {
|
||||
"type": "string"
|
||||
},
|
||||
"output_modalities": {
|
||||
"description": "OutputModalities is the subset of {text,image,audio,video} the model produces.",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema.ModelCapabilitiesResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/definitions/schema.ModelCapabilities"
|
||||
}
|
||||
},
|
||||
"object": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"schema.ModelLoadRequest": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
||||
@@ -1362,6 +1362,41 @@ definitions:
|
||||
$ref: '#/definitions/schema.ToolCall'
|
||||
type: array
|
||||
type: object
|
||||
schema.ModelCapabilities:
|
||||
properties:
|
||||
capabilities:
|
||||
description: |-
|
||||
Capabilities are canonical usecase strings (e.g. chat, vision, transcript,
|
||||
tts, embeddings, image, video) plus the modifiers "tools" and "thinking".
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
id:
|
||||
type: string
|
||||
input_modalities:
|
||||
description: InputModalities is the subset of {text,image,audio,video} the
|
||||
model accepts.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
object:
|
||||
type: string
|
||||
output_modalities:
|
||||
description: OutputModalities is the subset of {text,image,audio,video} the
|
||||
model produces.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
type: object
|
||||
schema.ModelCapabilitiesResponse:
|
||||
properties:
|
||||
data:
|
||||
items:
|
||||
$ref: '#/definitions/schema.ModelCapabilities'
|
||||
type: array
|
||||
object:
|
||||
type: string
|
||||
type: object
|
||||
schema.ModelLoadRequest:
|
||||
properties:
|
||||
model:
|
||||
@@ -4358,6 +4393,16 @@ paths:
|
||||
summary: List and describe the various models available in the API.
|
||||
tags:
|
||||
- models
|
||||
/v1/models/capabilities:
|
||||
get:
|
||||
responses:
|
||||
"200":
|
||||
description: Response
|
||||
schema:
|
||||
$ref: '#/definitions/schema.ModelCapabilitiesResponse'
|
||||
summary: List available models enriched with capabilities and input/output modalities.
|
||||
tags:
|
||||
- models
|
||||
/v1/rerank:
|
||||
post:
|
||||
parameters:
|
||||
|
||||
Reference in New Issue
Block a user