mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-18 21:58:58 -04:00
Squashed feat/pii-ner-tier-engine rebased onto master (was 45 commits; see backup/pii-ner-tier-engine-prerebase). Net change: - privacy-filter.cpp: standalone GGML engine for the openai-privacy-filter PII/NER token classifier, wired as a LocalAI gRPC backend (CPU/CUDA/Vulkan). TokenClassify moves off the patched llama.cpp path onto this backend. - PII filter reworked to be NER-centric (encoder/NER detection tier scanning whole conversations as one document), with a recreated bounded restricted- regex secret-matching pattern detector tier alongside it (per-model pii_detection.builtins / .patterns + core/services/routing/piipattern). - Detection labelled by source (ner vs pattern); backend trace / confidence / debug observability; analyze/redact exposed as a synchronous API. - Instance-wide default detector policy + per-usecase default-on; request filtering extended to completions, embeddings, edits & Ollama. - React UI: NER-centric PII editor, detector-models table, pattern/builtins editor, middleware default-policy UI. - Gallery: privacy-filter-multilingual token-classify model + NER install filter; token_classify known_usecase; batch sized to context for NER models. privacy-filter backend registered in the backend gallery (cpu/vulkan/cuda-13 meta + image entries with a capabilities map) matching its CI matrix jobs, and an /import-model auto-detect importer (PrivacyFilterImporter, narrow privacy-filter GGUF detection) replacing the prior pref-only registration. Reconciled against master's independent evolution: - Dropped master's PIIPatternOverrides feature (global-pattern runtime overrides + /api/pii/patterns API + runtime_settings.json persistence). The per-model NER + pattern-detector design supersedes it; it was built on the global redactor pattern set this branch replaced. 
- Reverted the llama.cpp Score carry-patch (0006-server-task-type-score): removed the patch and restored master's grpc-server.cpp Score RPC (direct llama_decode, slot-loop bypass) and LLAMA_VERSION pin, plus master's model_config validation forbidding score + chat/completion/embeddings on llama-cpp. token_classify is unaffected (it runs on the privacy-filter backend, not llama-cpp). Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Richard Palethorpe <io@richiejp.com>
625 lines
25 KiB
Go
625 lines
25 KiB
Go
package config
|
|
|
|
import (
|
|
"slices"
|
|
"strings"
|
|
)
|
|
|
|
// Usecase name constants — the canonical string values used in gallery entries,
// model configs (known_usecases), and UsecaseInfoMap keys.
//
// They are untyped string constants, so they can be used directly wherever a
// plain string is expected (map keys, slice elements, comparisons).
const (
	UsecaseChat               = "chat"
	UsecaseCompletion         = "completion"
	UsecaseEdit               = "edit"
	UsecaseVision             = "vision"
	UsecaseEmbeddings         = "embeddings"
	UsecaseTokenize           = "tokenize"
	UsecaseImage              = "image"
	UsecaseVideo              = "video"
	UsecaseTranscript         = "transcript"
	UsecaseTTS                = "tts"
	UsecaseSoundGeneration    = "sound_generation"
	UsecaseRerank             = "rerank"
	UsecaseDetection          = "detection"
	UsecaseDepth              = "depth"
	UsecaseVAD                = "vad"
	UsecaseAudioTransform     = "audio_transform"
	UsecaseDiarization        = "diarization"
	UsecaseRealtimeAudio      = "realtime_audio"
	UsecaseFaceRecognition    = "face_recognition"
	UsecaseSpeakerRecognition = "speaker_recognition"
	UsecaseTokenClassify      = "token_classify"
)
|
|
|
|
// GRPCMethod identifies a Backend service RPC from backend.proto.
type GRPCMethod string

// Known Backend service RPC names. Each constant's value is the RPC name
// spelled exactly as the constant's suffix.
const (
	MethodPredict            GRPCMethod = "Predict"
	MethodPredictStream      GRPCMethod = "PredictStream"
	MethodEmbedding          GRPCMethod = "Embedding"
	MethodGenerateImage      GRPCMethod = "GenerateImage"
	MethodGenerateVideo      GRPCMethod = "GenerateVideo"
	MethodAudioTranscription GRPCMethod = "AudioTranscription"
	MethodTTS                GRPCMethod = "TTS"
	MethodTTSStream          GRPCMethod = "TTSStream"
	MethodSoundGeneration    GRPCMethod = "SoundGeneration"
	MethodTokenizeString     GRPCMethod = "TokenizeString"
	MethodDetect             GRPCMethod = "Detect"
	MethodDepth              GRPCMethod = "Depth"
	MethodRerank             GRPCMethod = "Rerank"
	MethodVAD                GRPCMethod = "VAD"
	MethodAudioTransform     GRPCMethod = "AudioTransform"
	MethodDiarize            GRPCMethod = "Diarize"
	MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
	MethodFaceVerify         GRPCMethod = "FaceVerify"
	MethodFaceAnalyze        GRPCMethod = "FaceAnalyze"
	MethodVoiceVerify        GRPCMethod = "VoiceVerify"
	MethodVoiceEmbed         GRPCMethod = "VoiceEmbed"
	MethodVoiceAnalyze       GRPCMethod = "VoiceAnalyze"
	MethodTokenClassify      GRPCMethod = "TokenClassify"
)
|
|
|
|
// UsecaseInfo describes a single known_usecase value and how it maps
|
|
// to the gRPC backend API.
|
|
type UsecaseInfo struct {
|
|
// Flag is the ModelConfigUsecase bitmask value.
|
|
Flag ModelConfigUsecase
|
|
// GRPCMethod is the primary Backend service RPC this usecase maps to.
|
|
GRPCMethod GRPCMethod
|
|
// IsModifier is true when this usecase doesn't map to its own gRPC RPC
|
|
// but modifies how another RPC behaves (e.g., vision uses Predict with images).
|
|
IsModifier bool
|
|
// DependsOn names the usecase(s) this modifier requires (e.g., "chat").
|
|
DependsOn string
|
|
// Description is a human/LLM-readable explanation of what this usecase means.
|
|
Description string
|
|
}
|
|
|
|
// UsecaseInfoMap maps each known_usecase string to its gRPC and semantic info.
|
|
var UsecaseInfoMap = map[string]UsecaseInfo{
|
|
UsecaseChat: {
|
|
Flag: FLAG_CHAT,
|
|
GRPCMethod: MethodPredict,
|
|
Description: "Conversational/instruction-following via the Predict RPC with chat templates.",
|
|
},
|
|
UsecaseCompletion: {
|
|
Flag: FLAG_COMPLETION,
|
|
GRPCMethod: MethodPredict,
|
|
Description: "Text completion via the Predict RPC with a completion template.",
|
|
},
|
|
UsecaseEdit: {
|
|
Flag: FLAG_EDIT,
|
|
GRPCMethod: MethodPredict,
|
|
Description: "Text editing via the Predict RPC with an edit template.",
|
|
},
|
|
UsecaseVision: {
|
|
Flag: FLAG_VISION,
|
|
GRPCMethod: MethodPredict,
|
|
IsModifier: true,
|
|
DependsOn: UsecaseChat,
|
|
Description: "The model accepts images alongside text in the Predict RPC. For llama-cpp this requires an mmproj file.",
|
|
},
|
|
UsecaseEmbeddings: {
|
|
Flag: FLAG_EMBEDDINGS,
|
|
GRPCMethod: MethodEmbedding,
|
|
Description: "Vector embedding generation via the Embedding RPC.",
|
|
},
|
|
UsecaseTokenize: {
|
|
Flag: FLAG_TOKENIZE,
|
|
GRPCMethod: MethodTokenizeString,
|
|
Description: "Tokenization via the TokenizeString RPC without running inference.",
|
|
},
|
|
UsecaseImage: {
|
|
Flag: FLAG_IMAGE,
|
|
GRPCMethod: MethodGenerateImage,
|
|
Description: "Image generation via the GenerateImage RPC (Stable Diffusion, Flux, etc.).",
|
|
},
|
|
UsecaseVideo: {
|
|
Flag: FLAG_VIDEO,
|
|
GRPCMethod: MethodGenerateVideo,
|
|
Description: "Video generation via the GenerateVideo RPC.",
|
|
},
|
|
UsecaseTranscript: {
|
|
Flag: FLAG_TRANSCRIPT,
|
|
GRPCMethod: MethodAudioTranscription,
|
|
Description: "Speech-to-text via the AudioTranscription RPC.",
|
|
},
|
|
UsecaseTTS: {
|
|
Flag: FLAG_TTS,
|
|
GRPCMethod: MethodTTS,
|
|
Description: "Text-to-speech via the TTS RPC.",
|
|
},
|
|
UsecaseSoundGeneration: {
|
|
Flag: FLAG_SOUND_GENERATION,
|
|
GRPCMethod: MethodSoundGeneration,
|
|
Description: "Music/sound generation via the SoundGeneration RPC (not speech).",
|
|
},
|
|
UsecaseRerank: {
|
|
Flag: FLAG_RERANK,
|
|
GRPCMethod: MethodRerank,
|
|
Description: "Document reranking via the Rerank RPC.",
|
|
},
|
|
UsecaseDetection: {
|
|
Flag: FLAG_DETECTION,
|
|
GRPCMethod: MethodDetect,
|
|
Description: "Object detection via the Detect RPC with bounding boxes.",
|
|
},
|
|
UsecaseDepth: {
|
|
Flag: FLAG_DEPTH,
|
|
GRPCMethod: MethodDepth,
|
|
Description: "Per-pixel metric depth, camera pose and 3D point cloud via the Depth RPC (Depth Anything 3).",
|
|
},
|
|
UsecaseVAD: {
|
|
Flag: FLAG_VAD,
|
|
GRPCMethod: MethodVAD,
|
|
Description: "Voice activity detection via the VAD RPC.",
|
|
},
|
|
UsecaseAudioTransform: {
|
|
Flag: FLAG_AUDIO_TRANSFORM,
|
|
GRPCMethod: MethodAudioTransform,
|
|
Description: "Audio-in / audio-out transformations (echo cancellation, noise suppression, dereverberation, voice conversion) via the AudioTransform RPC.",
|
|
},
|
|
UsecaseDiarization: {
|
|
Flag: FLAG_DIARIZATION,
|
|
GRPCMethod: MethodDiarize,
|
|
Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
|
|
},
|
|
UsecaseRealtimeAudio: {
|
|
Flag: FLAG_REALTIME_AUDIO,
|
|
GRPCMethod: MethodAudioToAudioStream,
|
|
Description: "Self-contained any-to-any audio model for the Realtime API — accepts microphone audio and emits speech + transcript (+ optional function calls) from a single backend via the AudioToAudioStream RPC.",
|
|
},
|
|
UsecaseFaceRecognition: {
|
|
Flag: FLAG_FACE_RECOGNITION,
|
|
GRPCMethod: MethodFaceVerify,
|
|
Description: "Face recognition — verify identity, analyze attributes (age/gender/emotion) via FaceVerify and FaceAnalyze RPCs.",
|
|
},
|
|
UsecaseSpeakerRecognition: {
|
|
Flag: FLAG_SPEAKER_RECOGNITION,
|
|
GRPCMethod: MethodVoiceVerify,
|
|
Description: "Speaker recognition — verify identity, embed and analyze voice via VoiceVerify, VoiceEmbed and VoiceAnalyze RPCs.",
|
|
},
|
|
UsecaseTokenClassify: {
|
|
Flag: FLAG_TOKEN_CLASSIFY,
|
|
GRPCMethod: MethodTokenClassify,
|
|
Description: "Per-token classification (NER) via the TokenClassify RPC — the PII detector tier. Declared explicitly via known_usecases; never auto-guessed, since the token-classification head is not useful as general generation or embeddings.",
|
|
},
|
|
}
|
|
|
|
// BackendCapability describes which gRPC methods and usecases a backend supports.
|
|
// Derived from reviewing actual implementations in backend/go/ and backend/python/.
|
|
type BackendCapability struct {
|
|
// GRPCMethods lists the Backend service RPCs this backend implements.
|
|
GRPCMethods []GRPCMethod
|
|
// PossibleUsecases lists all usecase strings this backend can support.
|
|
PossibleUsecases []string
|
|
// DefaultUsecases lists the conservative safe defaults.
|
|
DefaultUsecases []string
|
|
// AcceptsImages indicates multimodal image input in Predict.
|
|
AcceptsImages bool
|
|
// AcceptsVideos indicates multimodal video input in Predict.
|
|
AcceptsVideos bool
|
|
// AcceptsAudios indicates multimodal audio input in Predict.
|
|
AcceptsAudios bool
|
|
// Description is a human-readable summary of the backend.
|
|
Description string
|
|
}
|
|
|
|
// BackendCapabilities maps each backend name (as used in model configs and gallery
|
|
// entries) to its verified capabilities. This is the single source of truth for
|
|
// what each backend supports.
|
|
//
|
|
// Backend names use hyphens (e.g., "llama-cpp") matching the gallery convention.
|
|
// Use NormalizeBackendName() for names with dots (e.g., "llama.cpp").
|
|
var BackendCapabilities = map[string]BackendCapability{
|
|
// --- LLM / text generation backends ---
|
|
"llama-cpp": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding, MethodTokenizeString},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEdit, UsecaseEmbeddings, UsecaseTokenize, UsecaseVision},
|
|
DefaultUsecases: []string{UsecaseChat},
|
|
AcceptsImages: true, // requires mmproj
|
|
Description: "llama.cpp GGUF models — LLM inference with optional vision via mmproj",
|
|
},
|
|
// privacy-filter is the standalone GGML engine (backend/cpp/privacy-filter,
|
|
// wrapping privacy-filter.cpp) for the openai-privacy-filter PII/NER token
|
|
// classifier — the dedicated TokenClassify path that replaces the
|
|
// patched-llama.cpp route. Never auto-guessed; declared explicitly via
|
|
// known_usecases: [token_classify].
|
|
"privacy-filter": {
|
|
GRPCMethods: []GRPCMethod{MethodTokenClassify},
|
|
PossibleUsecases: []string{UsecaseTokenClassify},
|
|
DefaultUsecases: []string{UsecaseTokenClassify},
|
|
Description: "privacy-filter.cpp — standalone GGML backend for openai-privacy-filter PII/NER token classification",
|
|
},
|
|
"vllm": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings, UsecaseVision},
|
|
DefaultUsecases: []string{UsecaseChat},
|
|
AcceptsImages: true,
|
|
AcceptsVideos: true,
|
|
Description: "vLLM engine — high-throughput LLM serving with optional multimodal",
|
|
},
|
|
"sglang": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodTokenizeString},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseTokenize, UsecaseVision},
|
|
DefaultUsecases: []string{UsecaseChat},
|
|
AcceptsImages: true,
|
|
Description: "SGLang — fast LLM inference with structured generation and optional vision",
|
|
},
|
|
"vllm-omni": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodGenerateImage, MethodGenerateVideo, MethodTTS},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseImage, UsecaseVideo, UsecaseTTS, UsecaseVision},
|
|
DefaultUsecases: []string{UsecaseChat},
|
|
AcceptsImages: true,
|
|
AcceptsVideos: true,
|
|
AcceptsAudios: true,
|
|
Description: "vLLM omni-modal — supports text, image, video generation and TTS",
|
|
},
|
|
"transformers": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding, MethodTTS, MethodSoundGeneration},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings, UsecaseTTS, UsecaseSoundGeneration},
|
|
DefaultUsecases: []string{UsecaseChat},
|
|
Description: "HuggingFace transformers — general-purpose Python inference",
|
|
},
|
|
"mlx": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings},
|
|
DefaultUsecases: []string{UsecaseChat},
|
|
Description: "Apple MLX framework — optimized for Apple Silicon",
|
|
},
|
|
"mlx-distributed": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings},
|
|
DefaultUsecases: []string{UsecaseChat},
|
|
Description: "MLX distributed inference across multiple Apple Silicon devices",
|
|
},
|
|
"mlx-vlm": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodEmbedding},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseEmbeddings, UsecaseVision},
|
|
DefaultUsecases: []string{UsecaseChat, UsecaseVision},
|
|
AcceptsImages: true,
|
|
AcceptsAudios: true,
|
|
Description: "MLX vision-language models with multimodal input",
|
|
},
|
|
"mlx-audio": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodTTS},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseChat},
|
|
Description: "MLX audio models — text generation and TTS",
|
|
},
|
|
|
|
// --- Image/video generation backends ---
|
|
"diffusers": {
|
|
GRPCMethods: []GRPCMethod{MethodGenerateImage, MethodGenerateVideo},
|
|
PossibleUsecases: []string{UsecaseImage, UsecaseVideo},
|
|
DefaultUsecases: []string{UsecaseImage},
|
|
Description: "HuggingFace diffusers — Stable Diffusion, Flux, video generation",
|
|
},
|
|
"stablediffusion": {
|
|
GRPCMethods: []GRPCMethod{MethodGenerateImage},
|
|
PossibleUsecases: []string{UsecaseImage},
|
|
DefaultUsecases: []string{UsecaseImage},
|
|
Description: "Stable Diffusion native backend",
|
|
},
|
|
"stablediffusion-ggml": {
|
|
GRPCMethods: []GRPCMethod{MethodGenerateImage},
|
|
PossibleUsecases: []string{UsecaseImage},
|
|
DefaultUsecases: []string{UsecaseImage},
|
|
Description: "Stable Diffusion via GGML quantized models",
|
|
},
|
|
|
|
// --- Speech-to-text backends ---
|
|
"whisper": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription, MethodVAD},
|
|
PossibleUsecases: []string{UsecaseTranscript, UsecaseVAD},
|
|
DefaultUsecases: []string{UsecaseTranscript},
|
|
Description: "OpenAI Whisper — speech recognition and voice activity detection",
|
|
},
|
|
"faster-whisper": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
|
|
PossibleUsecases: []string{UsecaseTranscript},
|
|
DefaultUsecases: []string{UsecaseTranscript},
|
|
Description: "CTranslate2-accelerated Whisper for faster transcription",
|
|
},
|
|
"whisperx": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
|
|
PossibleUsecases: []string{UsecaseTranscript},
|
|
DefaultUsecases: []string{UsecaseTranscript},
|
|
Description: "WhisperX — Whisper with word-level timestamps and speaker diarization",
|
|
},
|
|
"moonshine": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
|
|
PossibleUsecases: []string{UsecaseTranscript},
|
|
DefaultUsecases: []string{UsecaseTranscript},
|
|
Description: "Moonshine speech recognition",
|
|
},
|
|
"nemo": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
|
|
PossibleUsecases: []string{UsecaseTranscript},
|
|
DefaultUsecases: []string{UsecaseTranscript},
|
|
Description: "NVIDIA NeMo speech recognition",
|
|
},
|
|
"parakeet-cpp": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
|
|
PossibleUsecases: []string{UsecaseTranscript},
|
|
DefaultUsecases: []string{UsecaseTranscript},
|
|
Description: "NVIDIA NeMo Parakeet ASR (parakeet.cpp)",
|
|
},
|
|
"qwen-asr": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
|
|
PossibleUsecases: []string{UsecaseTranscript},
|
|
DefaultUsecases: []string{UsecaseTranscript},
|
|
Description: "Qwen automatic speech recognition",
|
|
},
|
|
"voxtral": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription},
|
|
PossibleUsecases: []string{UsecaseTranscript},
|
|
DefaultUsecases: []string{UsecaseTranscript},
|
|
Description: "Voxtral speech recognition",
|
|
},
|
|
"vibevoice": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription, MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTranscript, UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTranscript, UsecaseTTS},
|
|
Description: "VibeVoice — bidirectional speech (transcription and synthesis)",
|
|
},
|
|
"vibevoice-cpp": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription, MethodTTS, MethodTTSStream},
|
|
PossibleUsecases: []string{UsecaseTranscript, UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTranscript, UsecaseTTS},
|
|
Description: "VibeVoice C++ — bidirectional speech, C++ backend with streaming TTS",
|
|
},
|
|
"sherpa-onnx": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTranscription, MethodTTS, MethodTTSStream, MethodVAD},
|
|
PossibleUsecases: []string{UsecaseTranscript, UsecaseTTS, UsecaseVAD},
|
|
DefaultUsecases: []string{UsecaseTranscript},
|
|
Description: "Sherpa-ONNX — multi-model speech toolkit (ASR, TTS, VAD)",
|
|
},
|
|
|
|
// --- TTS backends ---
|
|
"piper": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Piper — fast neural TTS optimized for Raspberry Pi",
|
|
},
|
|
"kokoro": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Kokoro TTS",
|
|
},
|
|
"coqui": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Coqui TTS — multi-speaker neural synthesis",
|
|
},
|
|
"kitten-tts": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Kitten TTS",
|
|
},
|
|
"outetts": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "OuteTTS",
|
|
},
|
|
"pocket-tts": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Pocket TTS — lightweight text-to-speech",
|
|
},
|
|
"qwen-tts": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Qwen TTS",
|
|
},
|
|
"qwen3-tts-cpp": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS, MethodTTSStream},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Qwen3 TTS C++ - text-to-speech with streaming, named speakers, voice design and cloning (qwentts.cpp / GGML)",
|
|
},
|
|
"faster-qwen3-tts": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Faster Qwen3 TTS — accelerated Qwen TTS",
|
|
},
|
|
"fish-speech": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Fish Speech TTS",
|
|
},
|
|
"neutts": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "NeuTTS — neural text-to-speech",
|
|
},
|
|
"chatterbox": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "Chatterbox TTS",
|
|
},
|
|
"voxcpm": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS, MethodTTSStream},
|
|
PossibleUsecases: []string{UsecaseTTS},
|
|
DefaultUsecases: []string{UsecaseTTS},
|
|
Description: "VoxCPM TTS with streaming support",
|
|
},
|
|
|
|
// --- Sound generation backends ---
|
|
"ace-step": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS, MethodSoundGeneration},
|
|
PossibleUsecases: []string{UsecaseTTS, UsecaseSoundGeneration},
|
|
DefaultUsecases: []string{UsecaseSoundGeneration},
|
|
Description: "ACE-Step — music and sound generation",
|
|
},
|
|
"acestep-cpp": {
|
|
GRPCMethods: []GRPCMethod{MethodSoundGeneration},
|
|
PossibleUsecases: []string{UsecaseSoundGeneration},
|
|
DefaultUsecases: []string{UsecaseSoundGeneration},
|
|
Description: "ACE-Step C++ — native sound generation",
|
|
},
|
|
"transformers-musicgen": {
|
|
GRPCMethods: []GRPCMethod{MethodTTS, MethodSoundGeneration},
|
|
PossibleUsecases: []string{UsecaseTTS, UsecaseSoundGeneration},
|
|
DefaultUsecases: []string{UsecaseSoundGeneration},
|
|
Description: "Meta MusicGen via transformers — music generation from text",
|
|
},
|
|
|
|
// --- Any-to-any audio backends ---
|
|
"liquid-audio": {
|
|
GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodAudioTranscription, MethodTTS, MethodAudioToAudioStream, MethodVAD},
|
|
PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseTranscript, UsecaseTTS, UsecaseRealtimeAudio, UsecaseVAD},
|
|
DefaultUsecases: []string{UsecaseRealtimeAudio, UsecaseChat, UsecaseTranscript, UsecaseTTS, UsecaseVAD},
|
|
AcceptsAudios: true,
|
|
Description: "LFM2 / LFM2.5-Audio — self-contained any-to-any audio model for the Realtime API; also exposes chat, transcription, TTS and a stub energy-based VAD endpoint",
|
|
},
|
|
|
|
// --- Audio transform backends ---
|
|
"localvqe": {
|
|
GRPCMethods: []GRPCMethod{MethodAudioTransform},
|
|
PossibleUsecases: []string{UsecaseAudioTransform},
|
|
DefaultUsecases: []string{UsecaseAudioTransform},
|
|
Description: "LocalVQE — joint AEC, noise suppression, and dereverberation for 16 kHz mono speech",
|
|
},
|
|
|
|
// --- Utility backends ---
|
|
"rerankers": {
|
|
GRPCMethods: []GRPCMethod{MethodRerank},
|
|
PossibleUsecases: []string{UsecaseRerank},
|
|
DefaultUsecases: []string{UsecaseRerank},
|
|
Description: "Cross-encoder reranking models",
|
|
},
|
|
"rfdetr": {
|
|
GRPCMethods: []GRPCMethod{MethodDetect},
|
|
PossibleUsecases: []string{UsecaseDetection},
|
|
DefaultUsecases: []string{UsecaseDetection},
|
|
Description: "RF-DETR object detection",
|
|
},
|
|
"rfdetr-cpp": {
|
|
GRPCMethods: []GRPCMethod{MethodDetect},
|
|
PossibleUsecases: []string{UsecaseDetection},
|
|
DefaultUsecases: []string{UsecaseDetection},
|
|
Description: "RF-DETR C++ object detection",
|
|
},
|
|
"depth-anything": {
|
|
GRPCMethods: []GRPCMethod{MethodDepth, MethodPredict, MethodGenerateImage},
|
|
PossibleUsecases: []string{UsecaseDepth},
|
|
DefaultUsecases: []string{UsecaseDepth},
|
|
AcceptsImages: true,
|
|
Description: "Depth Anything 3 C++ — per-pixel metric depth, camera pose and 3D point cloud",
|
|
},
|
|
|
|
// --- Face and speaker recognition backends ---
|
|
"insightface": {
|
|
GRPCMethods: []GRPCMethod{MethodEmbedding, MethodDetect, MethodFaceVerify, MethodFaceAnalyze},
|
|
PossibleUsecases: []string{UsecaseEmbeddings, UsecaseDetection, UsecaseFaceRecognition},
|
|
DefaultUsecases: []string{UsecaseFaceRecognition},
|
|
AcceptsImages: true,
|
|
Description: "InsightFace — face detection, embedding, verification and attribute analysis",
|
|
},
|
|
"speaker-recognition": {
|
|
GRPCMethods: []GRPCMethod{MethodVoiceVerify, MethodVoiceEmbed, MethodVoiceAnalyze},
|
|
PossibleUsecases: []string{UsecaseSpeakerRecognition},
|
|
DefaultUsecases: []string{UsecaseSpeakerRecognition},
|
|
Description: "Speaker recognition — voice identity verification and analysis",
|
|
},
|
|
"silero-vad": {
|
|
GRPCMethods: []GRPCMethod{MethodVAD},
|
|
PossibleUsecases: []string{UsecaseVAD},
|
|
DefaultUsecases: []string{UsecaseVAD},
|
|
Description: "Silero VAD — voice activity detection",
|
|
},
|
|
}
|
|
|
|
// NormalizeBackendName converts backend names to the canonical hyphenated form
// used in gallery entries (e.g., "llama.cpp" → "llama-cpp").
//
// Every "." in backend is replaced with "-"; all other characters, including
// letter case and surrounding whitespace, are returned unchanged.
func NormalizeBackendName(backend string) string {
	return strings.ReplaceAll(backend, ".", "-")
}
|
|
|
|
// nonLlamaSamplerBackends lists backends whose native sampler defaults differ
// from llama.cpp's, so LocalAI must NOT inject llama.cpp's top_k=40 default for
// them (issue #6632). mlx_lm's intended default is top_k=0 (disabled) and mlx
// does not remap 0->40, so shipping 40 silently changes sampling for clients
// that omit top_k. Leaving TopK nil lets the wire value default to 0.
//
// This is intentionally a small allow-list of KNOWN non-llama backends: empty
// and unknown backends fall through to the llama.cpp default to preserve the
// GGUF auto-detect path's behavior.
//
// Keys are in the canonical hyphenated form; the empty struct values make the
// map a plain set.
var nonLlamaSamplerBackends = map[string]struct{}{
	"mlx":             {},
	"mlx-vlm":         {},
	"mlx-distributed": {},
}
|
|
|
|
// UsesLlamaSamplerDefaults reports whether a backend should receive llama.cpp's
|
|
// sampler defaults (e.g. top_k=40). Empty/unknown backends return true so the
|
|
// GGUF auto-detect path (which resolves to llama.cpp) keeps today's behavior;
|
|
// only the known non-llama backends in nonLlamaSamplerBackends return false.
|
|
func UsesLlamaSamplerDefaults(backend string) bool {
|
|
if backend == "" {
|
|
return true
|
|
}
|
|
_, isNonLlama := nonLlamaSamplerBackends[NormalizeBackendName(backend)]
|
|
return !isNonLlama
|
|
}
|
|
|
|
// GetBackendCapability returns the capability info for a backend, or nil if unknown.
|
|
// Handles backend name normalization.
|
|
func GetBackendCapability(backend string) *BackendCapability {
|
|
if cap, ok := BackendCapabilities[NormalizeBackendName(backend)]; ok {
|
|
return &cap
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// PossibleUsecasesForBackend returns all usecases a backend can support.
|
|
// Returns nil if the backend is unknown.
|
|
func PossibleUsecasesForBackend(backend string) []string {
|
|
if cap := GetBackendCapability(backend); cap != nil {
|
|
return cap.PossibleUsecases
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// DefaultUsecasesForBackend returns the conservative default usecases.
|
|
// Returns nil if the backend is unknown.
|
|
func DefaultUsecasesForBackendCap(backend string) []string {
|
|
if cap := GetBackendCapability(backend); cap != nil {
|
|
return cap.DefaultUsecases
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// IsValidUsecaseForBackend checks whether a usecase is in a backend's possible set.
|
|
// Returns true for unknown backends (permissive fallback).
|
|
func IsValidUsecaseForBackend(backend, usecase string) bool {
|
|
cap := GetBackendCapability(backend)
|
|
if cap == nil {
|
|
return true // unknown backend — don't restrict
|
|
}
|
|
return slices.Contains(cap.PossibleUsecases, usecase)
|
|
}
|
|
|
|
// AllBackendNames returns a sorted list of all known backend names.
|
|
func AllBackendNames() []string {
|
|
names := make([]string, 0, len(BackendCapabilities))
|
|
for name := range BackendCapabilities {
|
|
names = append(names, name)
|
|
}
|
|
slices.Sort(names)
|
|
return names
|
|
}
|