fix(auth): log the real cause of OIDC/OAuth user-info failures (#10679)

The OAuth callback discarded the error returned by user-info resolution before sending the generic 500, so real failures were completely opaque in the logs: ID-token verification errors (e.g. issuer/audience mismatch behind a reverse proxy), a missing id_token, claim-parse errors, or a rejecting GitHub userinfo endpoint all collapsed into "failed to fetch user info" with nothing logged.

Log the wrapped cause with xlog.Error (provider + error), matching the code-exchange step just above it. The client-facing message is unchanged, so no internal detail leaks to the browser.

Refs #10677

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-07-04 21:37:02 -04:00 · 2026-07-04 19:33:53 +02:00
15 changed files with 105 additions and 785 deletions
--- a/core/config/model_capabilities.go
+++ b/core/config/model_capabilities.go
@@ -1,197 +0,0 @@
-package config
-
-// This file is the single source of truth for deriving a model's user-facing
-// capabilities and input/output modalities from its ModelConfig. Both the
-// OpenAI-compatible /v1/models/capabilities endpoint and the Ollama-compatible
-// /api/tags|/api/show surface consume these, so the vocabulary stays consistent
-// across clients. Keep the detection heuristics here rather than duplicating
-// them per endpoint.
-
-// VisionSupported reports whether the model can accept image inputs.
-//
-// We deliberately avoid HasUsecases(FLAG_VISION): GuessUsecases has no
-// FLAG_VISION branch and reports true for any chat model, so it would paint
-// vision onto text-only models. Instead we look for explicit signals: the
-// declared KnownUsecases bit, a multimodal projector, or a template/backend
-// multimodal marker.
-func (c *ModelConfig) VisionSupported() bool {
-	if c.KnownUsecases != nil && (*c.KnownUsecases&FLAG_VISION) == FLAG_VISION {
-		return true
-	}
-	if c.MMProj != "" {
-		return true
-	}
-	if c.TemplateConfig.Multimodal != "" {
-		return true
-	}
-	if c.MediaMarker != "" {
-		return true
-	}
-	return false
-}
-
-// ToolSupported reports whether the model is wired up for tool / function
-// calling. We look for any of the explicit knobs LocalAI uses to drive
-// function-call extraction (regex match, response regex, grammar triggers, XML
-// format) or the auto-detected tool-format markers the llama.cpp backend
-// populates during model load.
-func (c *ModelConfig) ToolSupported() bool {
-	fc := c.FunctionsConfig
-	if fc.ToolFormatMarkers != nil && fc.ToolFormatMarkers.FormatType != "" {
-		return true
-	}
-	if len(fc.JSONRegexMatch) > 0 || len(fc.ResponseRegex) > 0 {
-		return true
-	}
-	if fc.XMLFormatPreset != "" || fc.XMLFormat != nil {
-		return true
-	}
-	if len(fc.GrammarConfig.GrammarTriggers) > 0 || fc.GrammarConfig.SchemaType != "" {
-		return true
-	}
-	return false
-}
-
-// ThinkingSupported reports whether the model has reasoning / thinking enabled.
-// LocalAI sets DisableReasoning=false (or leaves thinking markers configured)
-// when the backend probe reports that the model supports thinking.
-func (c *ModelConfig) ThinkingSupported() bool {
-	rc := c.ReasoningConfig
-	if rc.DisableReasoning != nil && !*rc.DisableReasoning {
-		return true
-	}
-	if len(rc.ThinkingStartTokens) > 0 || len(rc.TagPairs) > 0 {
-		// Explicit thinking markers imply support unless explicitly disabled.
-		return rc.DisableReasoning == nil || !*rc.DisableReasoning
-	}
-	return false
-}
-
-// AudioInputSupported reports whether a chat/generation model accepts audio as
-// input (e.g. vLLM omni models). The signal is the vLLM per-prompt audio limit;
-// there is no FLAG_* for "chat model that hears audio", which is exactly why a
-// plain usecase list can't express it. Transcription models are handled
-// separately in InputModalities via FLAG_TRANSCRIPT.
-func (c *ModelConfig) AudioInputSupported() bool {
-	return c.LimitMMPerPrompt.LimitAudioPerPrompt > 0
-}
-
-// VideoInputSupported reports whether a chat/generation model accepts video as
-// input. The signal is the vLLM per-prompt video limit. Note this is distinct
-// from FLAG_VIDEO, which denotes video *generation* (diffusers) — an output
-// modality, not an input one.
-func (c *ModelConfig) VideoInputSupported() bool {
-	return c.LimitMMPerPrompt.LimitVideoPerPrompt > 0
-}
-
-// Capabilities returns the ordered list of capability strings the model
-// supports, using the canonical usecase vocabulary (chat, vision, transcript,
-// tts, embeddings, image, video, ...) plus the modifier capabilities "tools"
-// and "thinking". Vision is resolved via VisionSupported (not HasUsecases) to
-// avoid the guess-heuristic false positive.
-func (c *ModelConfig) Capabilities() []string {
-	chat := c.HasUsecases(FLAG_CHAT)
-	completion := c.HasUsecases(FLAG_COMPLETION)
-
-	var caps []string
-	add := func(cond bool, name string) {
-		if cond {
-			caps = append(caps, name)
-		}
-	}
-
-	add(chat, UsecaseChat)
-	add(completion, UsecaseCompletion)
-	add(c.HasUsecases(FLAG_EDIT), UsecaseEdit)
-	add(c.HasUsecases(FLAG_EMBEDDINGS), UsecaseEmbeddings)
-	add(c.HasUsecases(FLAG_RERANK), UsecaseRerank)
-	// Vision is only meaningful as an image-understanding modifier on a chat/
-	// completion model. Gating on (chat||completion) matches the Ollama surface
-	// and avoids a false positive when config defaults hydrate a MediaMarker on
-	// a non-chat model (e.g. a pure ASR/TTS backend).
-	add((chat || completion) && c.VisionSupported(), UsecaseVision)
-	// tools/thinking are modifiers on the chat/completion surface.
-	add((chat || completion) && c.ToolSupported(), "tools")
-	add((chat || completion) && c.ThinkingSupported(), "thinking")
-	add(c.HasUsecases(FLAG_TRANSCRIPT), UsecaseTranscript)
-	add(c.HasUsecases(FLAG_TTS), UsecaseTTS)
-	add(c.HasUsecases(FLAG_SOUND_GENERATION), UsecaseSoundGeneration)
-	add(c.HasUsecases(FLAG_IMAGE), UsecaseImage)
-	add(c.HasUsecases(FLAG_VIDEO), UsecaseVideo)
-	add(c.HasUsecases(FLAG_VAD), UsecaseVAD)
-	add(c.HasUsecases(FLAG_DETECTION), UsecaseDetection)
-	add(c.HasUsecases(FLAG_DEPTH), UsecaseDepth)
-	add(c.HasUsecases(FLAG_AUDIO_TRANSFORM), UsecaseAudioTransform)
-	add(c.HasUsecases(FLAG_DIARIZATION), UsecaseDiarization)
-	add(c.HasUsecases(FLAG_SOUND_CLASSIFICATION), UsecaseSoundClassification)
-	add(c.HasUsecases(FLAG_REALTIME_AUDIO), UsecaseRealtimeAudio)
-	add(c.HasUsecases(FLAG_FACE_RECOGNITION), UsecaseFaceRecognition)
-	add(c.HasUsecases(FLAG_SPEAKER_RECOGNITION), UsecaseSpeakerRecognition)
-	return caps
-}
-
-// InputModalities returns the set of modalities (text, image, audio, video) the
-// model accepts as input, ordered text→image→audio→video. This is what an
-// attachment router consults to decide whether an image/audio/video file can be
-// handed to the active model directly.
-func (c *ModelConfig) InputModalities() []string {
-	imageGen := c.HasUsecases(FLAG_IMAGE)
-	videoGen := c.HasUsecases(FLAG_VIDEO)
-	chatish := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION)
-
-	textIn := chatish || c.HasUsecases(FLAG_EDIT) ||
-		c.HasUsecases(FLAG_EMBEDDINGS) || c.HasUsecases(FLAG_RERANK) || c.HasUsecases(FLAG_TOKENIZE) ||
-		c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) || imageGen || videoGen
-
-	// Image input via a chat model requires vision (gated on chat, like the
-	// Ollama surface); detection/depth/face models consume images directly.
-	imageIn := (chatish && c.VisionSupported()) || c.LimitMMPerPrompt.LimitImagePerPrompt > 0 ||
-		c.HasUsecases(FLAG_DETECTION) || c.HasUsecases(FLAG_DEPTH) || c.HasUsecases(FLAG_FACE_RECOGNITION)
-
-	audioIn := c.AudioInputSupported() || c.HasUsecases(FLAG_TRANSCRIPT) || c.HasUsecases(FLAG_AUDIO_TRANSFORM) ||
-		c.HasUsecases(FLAG_REALTIME_AUDIO) || c.HasUsecases(FLAG_VAD) || c.HasUsecases(FLAG_DIARIZATION) ||
-		c.HasUsecases(FLAG_SOUND_CLASSIFICATION) || c.HasUsecases(FLAG_SPEAKER_RECOGNITION)
-
-	videoIn := c.VideoInputSupported()
-
-	var mods []string
-	if textIn {
-		mods = append(mods, "text")
-	}
-	if imageIn {
-		mods = append(mods, "image")
-	}
-	if audioIn {
-		mods = append(mods, "audio")
-	}
-	if videoIn {
-		mods = append(mods, "video")
-	}
-	return mods
-}
-
-// OutputModalities returns the set of modalities (text, image, audio, video)
-// the model produces, ordered text→image→audio→video.
-func (c *ModelConfig) OutputModalities() []string {
-	textOut := c.HasUsecases(FLAG_CHAT) || c.HasUsecases(FLAG_COMPLETION) || c.HasUsecases(FLAG_EDIT) ||
-		c.HasUsecases(FLAG_TRANSCRIPT)
-	imageOut := c.HasUsecases(FLAG_IMAGE)
-	audioOut := c.HasUsecases(FLAG_TTS) || c.HasUsecases(FLAG_SOUND_GENERATION) ||
-		c.HasUsecases(FLAG_AUDIO_TRANSFORM) || c.HasUsecases(FLAG_REALTIME_AUDIO)
-	videoOut := c.HasUsecases(FLAG_VIDEO)
-
-	var mods []string
-	if textOut {
-		mods = append(mods, "text")
-	}
-	if imageOut {
-		mods = append(mods, "image")
-	}
-	if audioOut {
-		mods = append(mods, "audio")
-	}
-	if videoOut {
-		mods = append(mods, "video")
-	}
-	return mods
-}
--- a/core/config/model_capabilities_test.go
+++ b/core/config/model_capabilities_test.go
@@ -1,103 +0,0 @@
-package config
-
-import (
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-func usecaseBits(flags ModelConfigUsecase) *ModelConfigUsecase {
-	return &flags
-}
-
-var _ = Describe("Model capabilities derivation", func() {
-	Describe("VisionSupported", func() {
-		It("is false for a plain text chat model", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
-			Expect(cfg.VisionSupported()).To(BeFalse())
-		})
-
-		It("is true when the FLAG_VISION bit is declared", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"}
-			Expect(cfg.VisionSupported()).To(BeTrue())
-		})
-
-		It("is true when an mmproj projector is set", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
-			cfg.MMProj = "mmproj.gguf" // promoted field from the embedded options struct
-			Expect(cfg.VisionSupported()).To(BeTrue())
-		})
-
-		It("does not fall for the GuessUsecases FLAG_VISION false positive", func() {
-			// A chat model with a chat template would make HasUsecases(FLAG_VISION)
-			// return true via the guess heuristic; VisionSupported must not.
-			cfg := &ModelConfig{Backend: "llama.cpp"}
-			cfg.TemplateConfig.Chat = "{{.Input}}"
-			Expect(cfg.VisionSupported()).To(BeFalse())
-		})
-	})
-
-	Describe("AudioInputSupported / VideoInputSupported", func() {
-		It("detects vLLM omni audio input via limit_mm_per_prompt", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
-			cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1
-			Expect(cfg.AudioInputSupported()).To(BeTrue())
-			Expect(cfg.VideoInputSupported()).To(BeFalse())
-		})
-
-		It("detects vLLM omni video input via limit_mm_per_prompt", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
-			cfg.LimitMMPerPrompt.LimitVideoPerPrompt = 2
-			Expect(cfg.VideoInputSupported()).To(BeTrue())
-		})
-	})
-
-	Describe("Capabilities + modalities", func() {
-		It("a text-only chat model exposes chat and text-only modalities", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "llama.cpp"}
-			Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat))
-			Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseVision))
-			Expect(cfg.Capabilities()).NotTo(ContainElement(UsecaseTranscript))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
-		})
-
-		It("a vision chat model accepts text+image input", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT | FLAG_VISION), Backend: "llama.cpp"}
-			Expect(cfg.Capabilities()).To(ContainElements(UsecaseChat, UsecaseVision))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text", "image"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
-		})
-
-		It("an omni chat model accepts text+audio input without an audio capability flag", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_CHAT), Backend: "vllm"}
-			cfg.LimitMMPerPrompt.LimitAudioPerPrompt = 1
-			// audio-in is a modality, not a usecase string — this is exactly the
-			// case a plain capability list cannot express.
-			Expect(cfg.Capabilities()).To(ContainElement(UsecaseChat))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text", "audio"}))
-		})
-
-		It("a transcription model reads audio and writes text", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TRANSCRIPT), Backend: "parakeet-cpp"}
-			Expect(cfg.Capabilities()).To(Equal([]string{UsecaseTranscript}))
-			Expect(cfg.InputModalities()).To(Equal([]string{"audio"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"text"}))
-		})
-
-		It("an image-generation model reads text and writes an image", func() {
-			// stablediffusion-ggml is image-only; plain "stablediffusion" is also
-			// in GuessUsecases' video-backend list, so it would report video too.
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_IMAGE), Backend: "stablediffusion-ggml"}
-			Expect(cfg.Capabilities()).To(Equal([]string{UsecaseImage}))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"image"}))
-		})
-
-		It("a TTS model reads text and writes audio", func() {
-			cfg := &ModelConfig{KnownUsecases: usecaseBits(FLAG_TTS), Backend: "piper"}
-			Expect(cfg.Capabilities()).To(ContainElement(UsecaseTTS))
-			Expect(cfg.InputModalities()).To(Equal([]string{"text"}))
-			Expect(cfg.OutputModalities()).To(Equal([]string{"audio"}))
-		})
-	})
-})
--- a/core/http/auth/oauth.go
+++ b/core/http/auth/oauth.go
@@ -202,6 +202,11 @@ func (m *OAuthManager) CallbackHandler(providerName string, db *gorm.DB, adminEm
 			userInfo, err = fetchGitHubUserInfoAsOAuth(ctx, token.AccessToken)
 		}
 		if err != nil {
+			// Surface the real cause server-side: ID-token verify failures (issuer/
+			// audience mismatch behind a reverse proxy), a missing id_token, claim
+			// parse errors, or the GitHub userinfo HTTP status/body. The client still
+			// gets the generic message below; details go to logs only. See #10677.
+			xlog.Error("OAuth callback: failed to resolve user info", "provider", providerName, "error", err)
 			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "failed to fetch user info"})
 		}

--- a/core/http/endpoints/ollama/capabilities.go
+++ b/core/http/endpoints/ollama/capabilities.go
@@ -49,23 +49,62 @@ func modelCapabilities(cfg *config.ModelConfig) []string {
 	return caps
 }

-// hasVisionSupport reports whether the model can accept image inputs.
-// The detection heuristic is the canonical config.ModelConfig.VisionSupported —
-// kept as a thin wrapper here so the Ollama capability mapping reads cleanly.
+// hasVisionSupport reports whether the model can accept image inputs. We avoid
+// cfg.HasUsecases(FLAG_VISION) because GuessUsecases has no FLAG_VISION case
+// and returns true for any chat model — see core/config/model_config.go. Instead
+// we look for explicit signals: KnownUsecases bit, multimodal projector, or
+// template/backend-reported multimodal markers.
 func hasVisionSupport(cfg *config.ModelConfig) bool {
-	return cfg.VisionSupported()
+	if cfg.KnownUsecases != nil && (*cfg.KnownUsecases&config.FLAG_VISION) == config.FLAG_VISION {
+		return true
+	}
+	if cfg.MMProj != "" {
+		return true
+	}
+	if cfg.TemplateConfig.Multimodal != "" {
+		return true
+	}
+	if cfg.MediaMarker != "" {
+		return true
+	}
+	return false
 }

-// hasToolSupport reports whether the model is wired up for tool / function
-// calling. Delegates to the canonical config.ModelConfig.ToolSupported.
+// hasToolSupport reports whether the model is wired up for tool / function calling.
+// We look for any of the explicit configuration knobs LocalAI uses to drive
+// function-call extraction (regex match, response regex, grammar triggers, XML
+// format) or for the auto-detected tool-format markers populated by the
+// llama.cpp backend during model load.
 func hasToolSupport(cfg *config.ModelConfig) bool {
-	return cfg.ToolSupported()
+	fc := cfg.FunctionsConfig
+	if fc.ToolFormatMarkers != nil && fc.ToolFormatMarkers.FormatType != "" {
+		return true
+	}
+	if len(fc.JSONRegexMatch) > 0 || len(fc.ResponseRegex) > 0 {
+		return true
+	}
+	if fc.XMLFormatPreset != "" || fc.XMLFormat != nil {
+		return true
+	}
+	if len(fc.GrammarConfig.GrammarTriggers) > 0 || fc.GrammarConfig.SchemaType != "" {
+		return true
+	}
+	return false
 }

 // hasThinkingSupport reports whether the model has reasoning / thinking enabled.
-// Delegates to the canonical config.ModelConfig.ThinkingSupported.
+// LocalAI sets DisableReasoning=false (or leaves thinking markers configured)
+// when the backend probe reports that the model supports thinking.
 func hasThinkingSupport(cfg *config.ModelConfig) bool {
-	return cfg.ThinkingSupported()
+	rc := cfg.ReasoningConfig
+	if rc.DisableReasoning != nil && !*rc.DisableReasoning {
+		return true
+	}
+	if len(rc.ThinkingStartTokens) > 0 || len(rc.TagPairs) > 0 {
+		// Explicit thinking markers imply support unless explicitly disabled.
+		return rc.DisableReasoning == nil || !*rc.DisableReasoning
+	}
+	return false
 }

 // quantRegex matches GGUF-style quantization suffixes (Q4_K_M, Q8_0, IQ3_XS, F16, ...).
--- a/core/http/endpoints/openai/list.go
+++ b/core/http/endpoints/openai/list.go
@@ -21,11 +21,48 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap
 		authDB = db[0]
 	}
 	return func(c echo.Context) error {
-		modelNames, err := listVisibleModelNames(c, bcl, ml, authDB)
+		// If blank, no filter is applied.
+		filter := c.QueryParam("filter")
+
+		// By default, exclude any loose files that are already referenced by a configuration file.
+		var policy galleryop.LooseFilePolicy
+		excludeConfigured := c.QueryParam("excludeConfigured")
+		if excludeConfigured == "" || excludeConfigured == "true" {
+			policy = galleryop.SKIP_IF_CONFIGURED
+		} else {
+			policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
+		}
+
+		filterFn, err := config.BuildNameFilterFn(filter)
 		if err != nil {
 			return err
 		}

+		modelNames, err := galleryop.ListModels(bcl, ml, filterFn, policy)
+		if err != nil {
+			return err
+		}
+
+		// Filter models by user's allowlist if auth is enabled
+		if authDB != nil {
+			if user := auth.GetUser(c); user != nil && user.Role != auth.RoleAdmin {
+				perm, err := auth.GetCachedUserPermissions(c, authDB, user.ID)
+				if err == nil && perm.AllowedModels.Enabled {
+					allowed := map[string]bool{}
+					for _, m := range perm.AllowedModels.Models {
+						allowed[m] = true
+					}
+					filtered := make([]string, 0, len(modelNames))
+					for _, m := range modelNames {
+						if allowed[m] {
+							filtered = append(filtered, m)
+						}
+					}
+					modelNames = filtered
+				}
+			}
+		}
+
 		// Map from a slice of names to a slice of OpenAIModel response objects
 		dataModels := []schema.OpenAIModel{}
 		for _, m := range modelNames {
@@ -38,53 +75,3 @@ func ListModelsEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, ap
 		})
 	}
 }
-
-// listVisibleModelNames resolves the model names visible to the caller, applying
-// the same query filters (filter, excludeConfigured) and per-user allowlist as
-// the OpenAI models listing. Shared by ListModelsEndpoint and
-// ListModelCapabilitiesEndpoint so both stay consistent.
-func listVisibleModelNames(c echo.Context, bcl *config.ModelConfigLoader, ml *model.ModelLoader, authDB *gorm.DB) ([]string, error) {
-	// If blank, no filter is applied.
-	filter := c.QueryParam("filter")
-
-	// By default, exclude any loose files that are already referenced by a configuration file.
-	var policy galleryop.LooseFilePolicy
-	excludeConfigured := c.QueryParam("excludeConfigured")
-	if excludeConfigured == "" || excludeConfigured == "true" {
-		policy = galleryop.SKIP_IF_CONFIGURED
-	} else {
-		policy = galleryop.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
-	}
-
-	filterFn, err := config.BuildNameFilterFn(filter)
-	if err != nil {
-		return nil, err
-	}
-
-	modelNames, err := galleryop.ListModels(bcl, ml, filterFn, policy)
-	if err != nil {
-		return nil, err
-	}
-
-	// Filter models by user's allowlist if auth is enabled
-	if authDB != nil {
-		if user := auth.GetUser(c); user != nil && user.Role != auth.RoleAdmin {
-			perm, err := auth.GetCachedUserPermissions(c, authDB, user.ID)
-			if err == nil && perm.AllowedModels.Enabled {
-				allowed := map[string]bool{}
-				for _, m := range perm.AllowedModels.Models {
-					allowed[m] = true
-				}
-				filtered := make([]string, 0, len(modelNames))
-				for _, m := range modelNames {
-					if allowed[m] {
-						filtered = append(filtered, m)
-					}
-				}
-				modelNames = filtered
-			}
-		}
-	}
-
-	return modelNames, nil
-}
--- a/core/http/endpoints/openai/list_capabilities.go
+++ b/core/http/endpoints/openai/list_capabilities.go
@@ -1,50 +0,0 @@
-package openai
-
-import (
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-	model "github.com/mudler/LocalAI/pkg/model"
-	"gorm.io/gorm"
-)
-
-// ListModelCapabilitiesEndpoint is a LocalAI-specific extension of the OpenAI
-// models listing. It returns the same set of models as /v1/models but enriches
-// each entry with the capabilities and input/output modalities the model
-// supports, so clients can decide whether an image/audio/video attachment can be
-// handed to a given model directly (or must be converted/transcribed first).
-//
-// It is purely additive: clients that don't know about it keep using /v1/models
-// and see no change.
-// @Summary List available models enriched with capabilities and input/output modalities.
-// @Tags models
-// @Success 200 {object} schema.ModelCapabilitiesResponse "Response"
-// @Router /v1/models/capabilities [get]
-func ListModelCapabilitiesEndpoint(bcl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, db ...*gorm.DB) echo.HandlerFunc {
-	var authDB *gorm.DB
-	if len(db) > 0 {
-		authDB = db[0]
-	}
-	return func(c echo.Context) error {
-		modelNames, err := listVisibleModelNames(c, bcl, ml, authDB)
-		if err != nil {
-			return err
-		}
-
-		dataModels := []schema.ModelCapabilities{}
-		for _, m := range modelNames {
-			entry := schema.ModelCapabilities{ID: m, Object: "model"}
-			if cfg, ok := bcl.GetModelConfig(m); ok {
-				entry.Capabilities = cfg.Capabilities()
-				entry.InputModalities = cfg.InputModalities()
-				entry.OutputModalities = cfg.OutputModalities()
-			}
-			dataModels = append(dataModels, entry)
-		}
-
-		return c.JSON(200, schema.ModelCapabilitiesResponse{
-			Object: "list",
-			Data:   dataModels,
-		})
-	}
-}
--- a/core/http/endpoints/openai/list_capabilities_test.go
+++ b/core/http/endpoints/openai/list_capabilities_test.go
@@ -1,119 +0,0 @@
-package openai
-
-import (
-	"encoding/json"
-	"net/http"
-	"net/http/httptest"
-	"os"
-	"path/filepath"
-
-	"github.com/labstack/echo/v4"
-	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/schema"
-	"github.com/mudler/LocalAI/pkg/model"
-	"github.com/mudler/LocalAI/pkg/system"
-
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("ListModelCapabilitiesEndpoint", func() {
-	var (
-		e       *echo.Echo
-		tmpDir  string
-		bcl     *config.ModelConfigLoader
-		ml      *model.ModelLoader
-		appConf *config.ApplicationConfig
-	)
-
-	BeforeEach(func() {
-		var err error
-		e = echo.New()
-		tmpDir, err = os.MkdirTemp("", "models-caps-test-*")
-		Expect(err).NotTo(HaveOccurred())
-
-		st, err := system.GetSystemState(system.WithModelPath(tmpDir))
-		Expect(err).NotTo(HaveOccurred())
-		ml = model.NewModelLoader(st)
-		bcl = config.NewModelConfigLoader(tmpDir)
-		appConf = config.NewApplicationConfig()
-	})
-
-	AfterEach(func() {
-		_ = os.RemoveAll(tmpDir)
-	})
-
-	writeConfig := func(name, yaml string) {
-		path := filepath.Join(tmpDir, name+".yaml")
-		Expect(os.WriteFile(path, []byte(yaml), 0o644)).To(Succeed())
-		Expect(bcl.ReadModelConfig(path)).To(Succeed())
-	}
-
-	// call exercises the endpoint with auth disabled (no auth DB), which is the
-	// standard deployment path. The per-user allowlist branch is shared verbatim
-	// with ListModelsEndpoint (listVisibleModelNames) and covered there.
-	call := func() schema.ModelCapabilitiesResponse {
-		req := httptest.NewRequest(http.MethodGet, "/v1/models/capabilities", nil)
-		rec := httptest.NewRecorder()
-		c := e.NewContext(req, rec)
-
-		handler := ListModelCapabilitiesEndpoint(bcl, ml, appConf)
-		Expect(handler(c)).To(Succeed())
-		Expect(rec.Code).To(Equal(http.StatusOK))
-
-		var resp schema.ModelCapabilitiesResponse
-		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
-		return resp
-	}
-
-	entryFor := func(resp schema.ModelCapabilitiesResponse, id string) *schema.ModelCapabilities {
-		for i := range resp.Data {
-			if resp.Data[i].ID == id {
-				return &resp.Data[i]
-			}
-		}
-		return nil
-	}
-
-	It("returns the list envelope even with no models", func() {
-		resp := call()
-		Expect(resp.Object).To(Equal("list"))
-	})
-
-	It("enriches a vision chat model with capabilities and image input modality", func() {
-		writeConfig("vlm", `
-name: vlm
-backend: llama-cpp
-known_usecases:
-  - FLAG_CHAT
-  - FLAG_VISION
-template:
-  chat: "{{ .Input }}"
-parameters:
-  model: qwen2.5-vl-Q4_K_M.gguf
-`)
-		entry := entryFor(call(), "vlm")
-		Expect(entry).NotTo(BeNil())
-		Expect(entry.Object).To(Equal("model"))
-		Expect(entry.Capabilities).To(ContainElements("chat", "vision"))
-		Expect(entry.InputModalities).To(ContainElements("text", "image"))
-		Expect(entry.OutputModalities).To(ContainElement("text"))
-	})
-
-	It("marks a parakeet model as an audio-in/text-out transcription model", func() {
-		writeConfig("parakeet", `
-name: parakeet
-backend: parakeet-cpp
-known_usecases:
-  - FLAG_TRANSCRIPT
-parameters:
-  model: parakeet-tdt-0.6b
-`)
-		entry := entryFor(call(), "parakeet")
-		Expect(entry).NotTo(BeNil())
-		Expect(entry.Capabilities).To(ContainElement("transcript"))
-		Expect(entry.InputModalities).To(Equal([]string{"audio"}))
-		Expect(entry.OutputModalities).To(Equal([]string{"text"}))
-		Expect(entry.Capabilities).NotTo(ContainElement("chat"))
-	})
-})
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -272,27 +272,25 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 			"version": internal.PrintableVersion(),
 			// Flat endpoint list for backwards compatibility
 			"endpoints": map[string]any{
-				"models":              "/v1/models",
-				"models_capabilities": "/v1/models/capabilities",
-				"chat_completions":    "/v1/chat/completions",
-				"completions":         "/v1/completions",
-				"embeddings":          "/v1/embeddings",
-				"config_metadata":     "/api/models/config-metadata",
-				"config_json":         "/api/models/config-json/:name",
-				"config_patch":        "/api/models/config-json/:name",
-				"autocomplete":        "/api/models/config-metadata/autocomplete/:provider",
-				"vram_estimate":       "/api/models/vram-estimate",
-				"tts":                 "/tts",
-				"transcription":       "/v1/audio/transcriptions",
-				"image_generation":    "/v1/images/generations",
-				"swagger":             "/swagger/index.html",
-				"instructions":        "/api/instructions",
+				"models":           "/v1/models",
+				"chat_completions": "/v1/chat/completions",
+				"completions":      "/v1/completions",
+				"embeddings":       "/v1/embeddings",
+				"config_metadata":  "/api/models/config-metadata",
+				"config_json":      "/api/models/config-json/:name",
+				"config_patch":     "/api/models/config-json/:name",
+				"autocomplete":     "/api/models/config-metadata/autocomplete/:provider",
+				"vram_estimate":    "/api/models/vram-estimate",
+				"tts":              "/tts",
+				"transcription":    "/v1/audio/transcriptions",
+				"image_generation": "/v1/images/generations",
+				"swagger":          "/swagger/index.html",
+				"instructions":     "/api/instructions",
 			},
 			// Categorized endpoint groups for structured discovery
 			"endpoint_groups": map[string]any{
 				"openai_compatible": map[string]string{
 					"models":               "/v1/models",
-					"models_capabilities":  "/v1/models/capabilities",
 					"chat_completions":     "/v1/chat/completions",
 					"completions":          "/v1/completions",
 					"embeddings":           "/v1/embeddings",
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -257,10 +257,4 @@ func RegisterOpenAIRoutes(app *echo.Echo,
 	// List models
 	app.GET("/v1/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB()))
 	app.GET("/models", openai.ListModelsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB()))
-
-	// List models enriched with capabilities + input/output modalities
-	// (LocalAI-specific, additive superset of /v1/models).
-	capabilitiesHandler := openai.ListModelCapabilitiesEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), application.AuthDB())
-	app.GET("/v1/models/capabilities", capabilitiesHandler)
-	app.GET("/models/capabilities", capabilitiesHandler)
 }
--- a/core/schema/openai.go
+++ b/core/schema/openai.go
@@ -251,27 +251,3 @@ type ModelsDataResponse struct {
 	Object string        `json:"object"`
 	Data   []OpenAIModel `json:"data"`
 }
-
-// ModelCapabilities is a strict superset of OpenAIModel that additionally
-// describes what a model can do and which modalities it accepts/produces. It is
-// served by the LocalAI-specific /v1/models/capabilities endpoint so clients can
-// route attachments (image/audio/video) to a model only when it can handle them.
-type ModelCapabilities struct {
-	ID     string `json:"id"`
-	Object string `json:"object"`
-	// Capabilities are canonical usecase strings (e.g. chat, vision, transcript,
-	// tts, embeddings, image, video) plus the modifiers "tools" and "thinking".
-	Capabilities []string `json:"capabilities"`
-	// InputModalities is the subset of {text,image,audio,video} the model accepts.
-	InputModalities []string `json:"input_modalities"`
-	// OutputModalities is the subset of {text,image,audio,video} the model produces.
-	OutputModalities []string `json:"output_modalities"`
-}
-
-// ModelCapabilitiesResponse is the envelope returned by /v1/models/capabilities.
-// It mirrors ModelsDataResponse so a client can treat it as an enriched
-// drop-in for /v1/models.
-type ModelCapabilitiesResponse struct {
-	Object string              `json:"object"`
-	Data   []ModelCapabilities `json:"data"`
-}
--- a/docs/content/features/api-discovery.md
+++ b/docs/content/features/api-discovery.md
@@ -36,7 +36,6 @@ Returns the instance version, all available endpoint URLs (flat and categorized)
  "endpoints": {
    "chat_completions": "/v1/chat/completions",
    "models": "/v1/models",
-    "models_capabilities": "/v1/models/capabilities",
    "config_metadata": "/api/models/config-metadata",
    "instructions": "/api/instructions",
    "swagger": "/swagger/index.html"
@@ -124,45 +123,6 @@ Add `?format=json` to get a raw **OpenAPI fragment** (filtered Swagger spec with
 curl http://localhost:8080/api/instructions/config-management?format=json
 ```

-## Model Capabilities
-
-`GET /v1/models/capabilities`
-
-An additive, LocalAI-specific superset of `/v1/models`. It returns the same set of models but enriches each entry with the **capabilities** the model supports and the **input/output modalities** it accepts and produces. Use it to decide, before sending a request, whether a given model can take an image, audio, or video attachment directly — or whether the input needs converting/transcribing first.
-
-Because it is purely additive, clients that only understand `/v1/models` keep working unchanged; they simply never call this route.
-
-```bash
-curl http://localhost:8080/v1/models/capabilities
-```
-
-```json
-{
-  "object": "list",
-  "data": [
-    {
-      "id": "qwen2.5-omni",
-      "object": "model",
-      "capabilities": ["chat", "vision", "tools"],
-      "input_modalities": ["text", "image", "audio"],
-      "output_modalities": ["text"]
-    },
-    {
-      "id": "parakeet",
-      "object": "model",
-      "capabilities": ["transcript"],
-      "input_modalities": ["audio"],
-      "output_modalities": ["text"]
-    }
-  ]
-}
-```
-
-- **`capabilities`** — canonical usecase strings (e.g. `chat`, `vision`, `transcript`, `tts`, `embeddings`, `image`, `video`) plus the modifiers `tools` and `thinking`.
-- **`input_modalities` / `output_modalities`** — the subsets of `{text, image, audio, video}` the model accepts and produces. Audio and video *input* are derived from the model's multimodal limits (e.g. vLLM `limit_mm_per_prompt`), which no single usecase flag expresses — which is why this endpoint exists alongside the plain listing.
-
-The same query parameters as `/v1/models` are honored (`filter`, `excludeConfigured`), and the same per-user model allowlist is applied when authentication is enabled.
-
 ## Configuration Management APIs

 These endpoints let agents discover model configuration fields, read current settings, modify them, and estimate VRAM usage.
--- a/docs/content/whats-new.md
+++ b/docs/content/whats-new.md
@@ -17,7 +17,6 @@ You can see the release notes [here](https://github.com/mudler/LocalAI/releases)
 - **May 2026**: [Speaker diarization](/features/audio-diarization/) — new `/v1/audio/diarization` endpoint returning "who spoke when" segments. Backed by `sherpa-onnx` (pyannote-3.0 + speaker embeddings + clustering) for pure diarization, and `vibevoice-cpp` for diarization bundled with long-form ASR. Supports `json` / `verbose_json` / `rttm` response formats.
 - **June 2026**: [Sound classification](/features/audio-classification/) — new `/v1/audio/classification` endpoint for audio tagging / sound-event classification, returning scored [AudioSet](https://research.google.com/audioset/) labels (baby cry, glass breaking, alarms, ...). Backed by [ced.cpp](https://github.com/mudler/ced.cpp), a 527-class AudioSet tagger ported to ggml.
 - **June 2026**: [PII analyze / redact API](/features/middleware/#analyze--redact-api) — the PII detection pipeline (NER + restricted-regex pattern tiers) is now a standalone service: `POST /api/pii/analyze` returns detected entity spans and `POST /api/pii/redact` returns the sanitised text (or `400 pii_blocked`), without routing a chat request through the middleware. Events gain an `origin` (`middleware` / `proxy` / `pii_analyze` / `pii_redact`) so `/api/pii/events` can be filtered by source.
-- **July 2026**: [Model capabilities endpoint](/features/api-discovery/#model-capabilities) — `GET /v1/models/capabilities`, an additive superset of `/v1/models` that reports each model's `capabilities` plus its `input_modalities` / `output_modalities` (`text` / `image` / `audio` / `video`). Lets clients route image/audio/video attachments to a model only when it can handle them; audio/video *input* is derived from the model's multimodal limits, which no single usecase flag expresses.
 - **June 2026**: Concurrent scoring and PII NER on llama.cpp — the `Score` (router classifier) and `TokenClassify` (PII NER) primitives now ride llama.cpp's server task queue instead of locking the context, so they run concurrently with chat/completion/embedding traffic and with each other. The `known_usecases` restriction that forced dedicated scorer/NER model configs on llama-cpp is lifted, repeated scoring calls reuse the prompt KV cache across candidates, and scoring inputs are no longer capped by the physical batch size.

 ## 2024 Highlights
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -2728,22 +2728,6 @@ const docTemplate = `{
                }
            }
        },
-        "/v1/models/capabilities": {
-            "get": {
-                "tags": [
-                    "models"
-                ],
-                "summary": "List available models enriched with capabilities and input/output modalities.",
-                "responses": {
-                    "200": {
-                        "description": "Response",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelCapabilitiesResponse"
-                        }
-                    }
-                }
-            }
-        },
        "/v1/rerank": {
            "post": {
                "tags": [
@@ -5198,52 +5182,6 @@ const docTemplate = `{
                }
            }
        },
-        "schema.ModelCapabilities": {
-            "type": "object",
-            "properties": {
-                "capabilities": {
-                    "description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "id": {
-                    "type": "string"
-                },
-                "input_modalities": {
-                    "description": "InputModalities is the subset of {text,image,audio,video} the model accepts.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "object": {
-                    "type": "string"
-                },
-                "output_modalities": {
-                    "description": "OutputModalities is the subset of {text,image,audio,video} the model produces.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                }
-            }
-        },
-        "schema.ModelCapabilitiesResponse": {
-            "type": "object",
-            "properties": {
-                "data": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.ModelCapabilities"
-                    }
-                },
-                "object": {
-                    "type": "string"
-                }
-            }
-        },
        "schema.ModelLoadRequest": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -2725,22 +2725,6 @@
                }
            }
        },
-        "/v1/models/capabilities": {
-            "get": {
-                "tags": [
-                    "models"
-                ],
-                "summary": "List available models enriched with capabilities and input/output modalities.",
-                "responses": {
-                    "200": {
-                        "description": "Response",
-                        "schema": {
-                            "$ref": "#/definitions/schema.ModelCapabilitiesResponse"
-                        }
-                    }
-                }
-            }
-        },
        "/v1/rerank": {
            "post": {
                "tags": [
@@ -5195,52 +5179,6 @@
                }
            }
        },
-        "schema.ModelCapabilities": {
-            "type": "object",
-            "properties": {
-                "capabilities": {
-                    "description": "Capabilities are canonical usecase strings (e.g. chat, vision, transcript,\ntts, embeddings, image, video) plus the modifiers \"tools\" and \"thinking\".",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "id": {
-                    "type": "string"
-                },
-                "input_modalities": {
-                    "description": "InputModalities is the subset of {text,image,audio,video} the model accepts.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                },
-                "object": {
-                    "type": "string"
-                },
-                "output_modalities": {
-                    "description": "OutputModalities is the subset of {text,image,audio,video} the model produces.",
-                    "type": "array",
-                    "items": {
-                        "type": "string"
-                    }
-                }
-            }
-        },
-        "schema.ModelCapabilitiesResponse": {
-            "type": "object",
-            "properties": {
-                "data": {
-                    "type": "array",
-                    "items": {
-                        "$ref": "#/definitions/schema.ModelCapabilities"
-                    }
-                },
-                "object": {
-                    "type": "string"
-                }
-            }
-        },
        "schema.ModelLoadRequest": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -1362,41 +1362,6 @@ definitions:
          $ref: '#/definitions/schema.ToolCall'
        type: array
    type: object
-  schema.ModelCapabilities:
-    properties:
-      capabilities:
-        description: |-
-          Capabilities are canonical usecase strings (e.g. chat, vision, transcript,
-          tts, embeddings, image, video) plus the modifiers "tools" and "thinking".
-        items:
-          type: string
-        type: array
-      id:
-        type: string
-      input_modalities:
-        description: InputModalities is the subset of {text,image,audio,video} the
-          model accepts.
-        items:
-          type: string
-        type: array
-      object:
-        type: string
-      output_modalities:
-        description: OutputModalities is the subset of {text,image,audio,video} the
-          model produces.
-        items:
-          type: string
-        type: array
-    type: object
-  schema.ModelCapabilitiesResponse:
-    properties:
-      data:
-        items:
-          $ref: '#/definitions/schema.ModelCapabilities'
-        type: array
-      object:
-        type: string
-    type: object
  schema.ModelLoadRequest:
    properties:
      model:
@@ -4393,16 +4358,6 @@ paths:
      summary: List and describe the various models available in the API.
      tags:
      - models
-  /v1/models/capabilities:
-    get:
-      responses:
-        "200":
-          description: Response
-          schema:
-            $ref: '#/definitions/schema.ModelCapabilitiesResponse'
-      summary: List available models enriched with capabilities and input/output modalities.
-      tags:
-      - models
  /v1/rerank:
    post:
      parameters: