Compare commits

..

1 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
4b0dbe8ae0 Initial plan 2026-02-02 21:21:41 +00:00
92 changed files with 212 additions and 2914 deletions

View File

@@ -104,58 +104,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-ace-step'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'true'
backend: "ace-step"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-mlx'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'true'
backend: "mlx"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-mlx-vlm'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'true'
backend: "mlx-vlm"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-mlx-audio'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'true'
backend: "mlx-audio"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
# CUDA 12 builds
- build-type: 'cublas'
cuda-major-version: "12"
@@ -300,19 +248,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-ace-step'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "ace-step"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
@@ -365,19 +300,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-outetts'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "outetts"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
@@ -404,45 +326,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-mlx'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "mlx"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-mlx-vlm'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "mlx-vlm"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-mlx-audio'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "mlx-audio"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
@@ -626,19 +509,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-ace-step'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "ace-step"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'l4t'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -704,45 +574,6 @@ jobs:
backend: "diffusers"
dockerfile: "./backend/Dockerfile.python"
context: "./"
- build-type: 'l4t'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-mlx'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
ubuntu-version: '2404'
backend: "mlx"
dockerfile: "./backend/Dockerfile.python"
context: "./"
- build-type: 'l4t'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-mlx-vlm'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
ubuntu-version: '2404'
backend: "mlx-vlm"
dockerfile: "./backend/Dockerfile.python"
context: "./"
- build-type: 'l4t'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-mlx-audio'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
ubuntu-version: '2404'
backend: "mlx-audio"
dockerfile: "./backend/Dockerfile.python"
context: "./"
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -808,45 +639,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-mlx'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "mlx"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-mlx-vlm'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "mlx-vlm"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-mlx-audio'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "mlx-audio"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -991,19 +783,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-ace-step'
runs-on: 'arc-runner-set'
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
skip-drivers: 'false'
backend: "ace-step"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
# ROCm additional backends
- build-type: 'hipblas'
cuda-major-version: ""
@@ -1201,19 +980,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'intel'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-intel-ace-step'
runs-on: 'ubuntu-latest'
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
skip-drivers: 'false'
backend: "ace-step"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: 'l4t'
cuda-major-version: "12"
cuda-minor-version: "0"
@@ -1279,45 +1045,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2204'
- build-type: 'l4t'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-mlx'
runs-on: 'ubuntu-24.04-arm'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
skip-drivers: 'true'
backend: "mlx"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2204'
- build-type: 'l4t'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-mlx-vlm'
runs-on: 'ubuntu-24.04-arm'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
skip-drivers: 'true'
backend: "mlx-vlm"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2204'
- build-type: 'l4t'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-mlx-audio'
runs-on: 'ubuntu-24.04-arm'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
skip-drivers: 'true'
backend: "mlx-audio"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2204'
# SYCL additional backends
- build-type: 'intel'
cuda-major-version: ""
@@ -1835,19 +1562,6 @@ jobs:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-outetts'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'true'
backend: "outetts"
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
backend-jobs-darwin:
uses: ./.github/workflows/backend_build_darwin.yml
strategy:
@@ -1856,9 +1570,6 @@ jobs:
- backend: "diffusers"
tag-suffix: "-metal-darwin-arm64-diffusers"
build-type: "mps"
- backend: "ace-step"
tag-suffix: "-metal-darwin-arm64-ace-step"
build-type: "mps"
- backend: "mlx"
tag-suffix: "-metal-darwin-arm64-mlx"
build-type: "mps"
@@ -1879,64 +1590,6 @@ jobs:
tag-suffix: "-metal-darwin-arm64-whisper"
build-type: "metal"
lang: "go"
- backend: "vibevoice"
tag-suffix: "-metal-darwin-arm64-vibevoice"
build-type: "mps"
- backend: "qwen-asr"
tag-suffix: "-metal-darwin-arm64-qwen-asr"
build-type: "mps"
- backend: "qwen-tts"
tag-suffix: "-metal-darwin-arm64-qwen-tts"
build-type: "mps"
- backend: "voxcpm"
tag-suffix: "-metal-darwin-arm64-voxcpm"
build-type: "mps"
- backend: "pocket-tts"
tag-suffix: "-metal-darwin-arm64-pocket-tts"
build-type: "mps"
- backend: "moonshine"
tag-suffix: "-metal-darwin-arm64-moonshine"
build-type: "mps"
- backend: "whisperx"
tag-suffix: "-metal-darwin-arm64-whisperx"
build-type: "mps"
- backend: "rerankers"
tag-suffix: "-metal-darwin-arm64-rerankers"
build-type: "mps"
- backend: "transformers"
tag-suffix: "-metal-darwin-arm64-transformers"
build-type: "mps"
- backend: "kokoro"
tag-suffix: "-metal-darwin-arm64-kokoro"
build-type: "mps"
- backend: "faster-whisper"
tag-suffix: "-metal-darwin-arm64-faster-whisper"
build-type: "mps"
- backend: "coqui"
tag-suffix: "-metal-darwin-arm64-coqui"
build-type: "mps"
- backend: "rfdetr"
tag-suffix: "-metal-darwin-arm64-rfdetr"
build-type: "mps"
- backend: "kitten-tts"
tag-suffix: "-metal-darwin-arm64-kitten-tts"
build-type: "mps"
- backend: "piper"
tag-suffix: "-metal-darwin-arm64-piper"
build-type: "metal"
lang: "go"
- backend: "silero-vad"
tag-suffix: "-metal-darwin-arm64-silero-vad"
build-type: "metal"
lang: "go"
- backend: "local-store"
tag-suffix: "-metal-darwin-arm64-local-store"
build-type: "metal"
lang: "go"
- backend: "huggingface"
tag-suffix: "-metal-darwin-arm64-huggingface"
build-type: "metal"
lang: "go"
with:
backend: ${{ matrix.backend }}
build-type: ${{ matrix.build-type }}

View File

@@ -33,7 +33,7 @@ jobs:
run: |
CGO_ENABLED=0 make build
- name: rm
uses: appleboy/ssh-action@v1.2.5
uses: appleboy/ssh-action@v1.2.4
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
rm: true
target: ./local-ai
- name: restarting
uses: appleboy/ssh-action@v1.2.5
uses: appleboy/ssh-action@v1.2.4
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}

View File

@@ -1,5 +1,5 @@
# Disable parallel execution for backend builds
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/qwen-asr backends/voxcpm backends/whisperx backends/ace-step
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/qwen-asr backends/voxcpm backends/whisperx
GOCMD=go
GOTEST=$(GOCMD) test
@@ -308,7 +308,6 @@ protogen-go-clean:
prepare-test-extra: protogen-python
$(MAKE) -C backend/python/transformers
$(MAKE) -C backend/python/outetts
$(MAKE) -C backend/python/diffusers
$(MAKE) -C backend/python/chatterbox
$(MAKE) -C backend/python/vllm
@@ -320,11 +319,9 @@ prepare-test-extra: protogen-python
$(MAKE) -C backend/python/qwen-asr
$(MAKE) -C backend/python/voxcpm
$(MAKE) -C backend/python/whisperx
$(MAKE) -C backend/python/ace-step
test-extra: prepare-test-extra
$(MAKE) -C backend/python/transformers test
$(MAKE) -C backend/python/outetts test
$(MAKE) -C backend/python/diffusers test
$(MAKE) -C backend/python/chatterbox test
$(MAKE) -C backend/python/vllm test
@@ -336,7 +333,6 @@ test-extra: prepare-test-extra
$(MAKE) -C backend/python/qwen-asr test
$(MAKE) -C backend/python/voxcpm test
$(MAKE) -C backend/python/whisperx test
$(MAKE) -C backend/python/ace-step test
DOCKER_IMAGE?=local-ai
DOCKER_AIO_IMAGE?=local-ai-aio
@@ -455,7 +451,6 @@ BACKEND_WHISPER = whisper|golang|.|false|true
# Python backends with root context
BACKEND_RERANKERS = rerankers|python|.|false|true
BACKEND_TRANSFORMERS = transformers|python|.|false|true
BACKEND_OUTETTS = outetts|python|.|false|true
BACKEND_FASTER_WHISPER = faster-whisper|python|.|false|true
BACKEND_COQUI = coqui|python|.|false|true
BACKEND_RFDETR = rfdetr|python|.|false|true
@@ -473,7 +468,6 @@ BACKEND_QWEN_TTS = qwen-tts|python|.|false|true
BACKEND_QWEN_ASR = qwen-asr|python|.|false|true
BACKEND_VOXCPM = voxcpm|python|.|false|true
BACKEND_WHISPERX = whisperx|python|.|false|true
BACKEND_ACE_STEP = ace-step|python|.|false|true
# Helper function to build docker image for a backend
# Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG)
@@ -505,7 +499,6 @@ $(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML)))
$(eval $(call generate-docker-build-target,$(BACKEND_WHISPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_RERANKERS)))
$(eval $(call generate-docker-build-target,$(BACKEND_TRANSFORMERS)))
$(eval $(call generate-docker-build-target,$(BACKEND_OUTETTS)))
$(eval $(call generate-docker-build-target,$(BACKEND_FASTER_WHISPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_COQUI)))
$(eval $(call generate-docker-build-target,$(BACKEND_RFDETR)))
@@ -523,13 +516,12 @@ $(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS)))
$(eval $(call generate-docker-build-target,$(BACKEND_QWEN_ASR)))
$(eval $(call generate-docker-build-target,$(BACKEND_VOXCPM)))
$(eval $(call generate-docker-build-target,$(BACKEND_WHISPERX)))
$(eval $(call generate-docker-build-target,$(BACKEND_ACE_STEP)))
# Pattern rule for docker-save targets
docker-save-%: backend-images
docker save local-ai-backend:$* -o backend-images/$*.tar
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-qwen-asr docker-build-voxcpm docker-build-whisperx docker-build-ace-step
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-qwen-asr docker-build-voxcpm docker-build-whisperx
########################################################
### Mock Backend for E2E Tests

View File

@@ -203,8 +203,7 @@ local-ai run oci://localai/phi-2:latest
For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html), if you are interested in our roadmap items and future enhancements, you can see the [Issues labeled as Roadmap here](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
## 📰 Latest project news
- February 2026: [Realtime API for audio-to-audio with tool calling](https://github.com/mudler/LocalAI/pull/6245), [ACE-Step 1.5 support](https://github.com/mudler/LocalAI/pull/8396)
- January 2026: **LocalAI 3.10.0** - Major release with Anthropic API support, Open Responses API for stateful agents, video & image generation suite (LTX-2), unified GPU backends, tool streaming & XML parsing, system-aware backend gallery, crash fixes for AVX-only CPUs and AMD VRAM reporting, request tracing, and new backends: **Moonshine** (ultra-fast transcription), **Pocket-TTS** (lightweight TTS). Vulkan arm64 builds now available. [Release notes](https://github.com/mudler/LocalAI/releases/tag/v3.10.0).
- December 2025: [Dynamic Memory Resource reclaimer](https://github.com/mudler/LocalAI/pull/7583), [Automatic fitting of models to multiple GPUS(llama.cpp)](https://github.com/mudler/LocalAI/pull/7584), [Added Vibevoice backend](https://github.com/mudler/LocalAI/pull/7494)
- November 2025: Major improvements to the UX. Among these: [Import models via URL](https://github.com/mudler/LocalAI/pull/7245) and [Multiple chats and history](https://github.com/mudler/LocalAI/pull/7325)
- October 2025: 🔌 [Model Context Protocol (MCP)](https://localai.io/docs/features/mcp/) support added for agentic capabilities with external tools
@@ -270,7 +269,6 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration
|---------|-------------|---------------------|
| **whisper.cpp** | OpenAI Whisper in C/C++ | CUDA 12/13, ROCm, Intel SYCL, Vulkan, CPU |
| **faster-whisper** | Fast Whisper with CTranslate2 | CUDA 12/13, ROCm, Intel, CPU |
| **moonshine** | Ultra-fast transcription engine for low-end devices | CUDA 12/13, Metal, CPU |
| **coqui** | Advanced TTS with 1100+ languages | CUDA 12/13, ROCm, Intel, CPU |
| **kokoro** | Lightweight TTS model | CUDA 12/13, ROCm, Intel, CPU |
| **chatterbox** | Production-grade TTS | CUDA 12/13, CPU |
@@ -281,7 +279,6 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration
| **vibevoice** | Real-time TTS with voice cloning | CUDA 12/13, ROCm, Intel, CPU |
| **pocket-tts** | Lightweight CPU-based TTS | CUDA 12/13, ROCm, Intel, CPU |
| **qwen-tts** | High-quality TTS with custom voice, voice design, and voice cloning | CUDA 12/13, ROCm, Intel, CPU |
| **ace-step** | Music generation from text descriptions, lyrics, or audio samples | CUDA 12/13, ROCm, Intel, Metal, CPU |
### Image & Video Generation
| Backend | Description | Acceleration Support |
@@ -303,11 +300,11 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration
|-------------------|-------------------|------------------|
| **NVIDIA CUDA 12** | All CUDA-compatible backends | Nvidia hardware |
| **NVIDIA CUDA 13** | All CUDA-compatible backends | Nvidia hardware |
| **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, neutts, vibevoice, pocket-tts, qwen-tts, ace-step | AMD Graphics |
| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, coqui, kokoro, vibevoice, pocket-tts, qwen-tts, ace-step | Intel Arc, Intel iGPUs |
| **Apple Metal** | llama.cpp, whisper, diffusers, MLX, MLX-VLM, moonshine, ace-step | Apple M1/M2/M3+ |
| **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, neutts, vibevoice, pocket-tts, qwen-tts | AMD Graphics |
| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, coqui, kokoro, vibevoice, pocket-tts, qwen-tts | Intel Arc, Intel iGPUs |
| **Apple Metal** | llama.cpp, whisper, diffusers, MLX, MLX-VLM | Apple M1/M2/M3+ |
| **Vulkan** | llama.cpp, whisper, stablediffusion | Cross-platform GPUs |
| **NVIDIA Jetson (CUDA 12)** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr, ace-step | ARM64 embedded AI (AGX Orin, etc.) |
| **NVIDIA Jetson (CUDA 12)** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI (AGX Orin, etc.) |
| **NVIDIA Jetson (CUDA 13)** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI (DGX Spark) |
| **CPU Optimized** | All backends | AVX/AVX2/AVX512, quantization support |

View File

@@ -365,14 +365,6 @@ message SoundGenerationRequest {
optional bool sample = 6;
optional string src = 7;
optional int32 src_divisor = 8;
optional bool think = 9;
optional string caption = 10;
optional string lyrics = 11;
optional int32 bpm = 12;
optional string keyscale = 13;
optional string language = 14;
optional string timesignature = 15;
optional bool instrumental = 17;
}
message TokenizationResponse {

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=b536eb023368701fe3564210440e2df6151c3e65
LLAMA_VERSION?=2634ed207a17db1a54bd8df0555bd8499a6ab691
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
WHISPER_CPP_VERSION?=941bdabbe4561bc6de68981aea01bc5ab05781c5
WHISPER_CPP_VERSION?=aa1bc0d1a6dfd70dbb9f60c11df12441e03a9075
SO_TARGET?=libgowhisper.so
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

View File

@@ -105,7 +105,6 @@
intel: "intel-rfdetr"
#amd: "rocm-rfdetr"
nvidia-l4t: "nvidia-l4t-arm64-rfdetr"
metal: "metal-rfdetr"
default: "cpu-rfdetr"
nvidia-cuda-13: "cuda13-rfdetr"
nvidia-cuda-12: "cuda12-rfdetr"
@@ -183,15 +182,6 @@
- text-to-text
- LLM
- MLX
capabilities:
default: "cpu-mlx"
nvidia: "cuda12-mlx"
metal: "metal-mlx"
nvidia-cuda-12: "cuda12-mlx"
nvidia-cuda-13: "cuda13-mlx"
nvidia-l4t: "nvidia-l4t-mlx"
nvidia-l4t-cuda-12: "nvidia-l4t-mlx"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx"
- &mlx-vlm
name: "mlx-vlm"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-vlm"
@@ -209,15 +199,6 @@
- vision-language
- LLM
- MLX
capabilities:
default: "cpu-mlx-vlm"
nvidia: "cuda12-mlx-vlm"
metal: "metal-mlx-vlm"
nvidia-cuda-12: "cuda12-mlx-vlm"
nvidia-cuda-13: "cuda13-mlx-vlm"
nvidia-l4t: "nvidia-l4t-mlx-vlm"
nvidia-l4t-cuda-12: "nvidia-l4t-mlx-vlm"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-vlm"
- &mlx-audio
name: "mlx-audio"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-mlx-audio"
@@ -235,15 +216,6 @@
- text-to-audio
- LLM
- MLX
capabilities:
default: "cpu-mlx-audio"
nvidia: "cuda12-mlx-audio"
metal: "metal-mlx-audio"
nvidia-cuda-12: "cuda12-mlx-audio"
nvidia-cuda-13: "cuda13-mlx-audio"
nvidia-l4t: "nvidia-l4t-mlx-audio"
nvidia-l4t-cuda-12: "nvidia-l4t-mlx-audio"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-mlx-audio"
- &rerankers
name: "rerankers"
alias: "rerankers"
@@ -251,7 +223,6 @@
nvidia: "cuda12-rerankers"
intel: "intel-rerankers"
amd: "rocm-rerankers"
metal: "metal-rerankers"
- &transformers
name: "transformers"
icon: https://avatars.githubusercontent.com/u/25720743?s=200&v=4
@@ -269,7 +240,6 @@
nvidia: "cuda12-transformers"
intel: "intel-transformers"
amd: "rocm-transformers"
metal: "metal-transformers"
nvidia-cuda-13: "cuda13-transformers"
nvidia-cuda-12: "cuda12-transformers"
- &diffusers
@@ -296,34 +266,6 @@
nvidia-cuda-12: "cuda12-diffusers"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-diffusers"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-diffusers"
- &ace-step
name: "ace-step"
description: |
ACE-Step 1.5 is an open-source music generation model. It supports simple mode (natural language description) and advanced mode (caption, lyrics, think, bpm, keyscale, etc.). Uses in-process acestep (LLMHandler for metadata, DiT for audio).
urls:
- https://github.com/ace-step/ACE-Step-1.5
tags:
- music-generation
- sound-generation
alias: "ace-step"
capabilities:
nvidia: "cuda12-ace-step"
intel: "intel-ace-step"
amd: "rocm-ace-step"
metal: "metal-ace-step"
default: "cpu-ace-step"
nvidia-cuda-13: "cuda13-ace-step"
nvidia-cuda-12: "cuda12-ace-step"
- !!merge <<: *ace-step
name: "ace-step-development"
capabilities:
nvidia: "cuda12-ace-step-development"
intel: "intel-ace-step-development"
amd: "rocm-ace-step-development"
metal: "metal-ace-step-development"
default: "cpu-ace-step-development"
nvidia-cuda-13: "cuda13-ace-step-development"
nvidia-cuda-12: "cuda12-ace-step-development"
- &faster-whisper
icon: https://avatars.githubusercontent.com/u/1520500?s=200&v=4
description: |
@@ -340,7 +282,6 @@
nvidia: "cuda12-faster-whisper"
intel: "intel-faster-whisper"
amd: "rocm-faster-whisper"
metal: "metal-faster-whisper"
nvidia-cuda-13: "cuda13-faster-whisper"
nvidia-cuda-12: "cuda12-faster-whisper"
- &moonshine
@@ -358,7 +299,6 @@
alias: "moonshine"
capabilities:
nvidia: "cuda12-moonshine"
metal: "metal-moonshine"
default: "cpu-moonshine"
nvidia-cuda-13: "cuda13-moonshine"
nvidia-cuda-12: "cuda12-moonshine"
@@ -378,7 +318,6 @@
capabilities:
nvidia: "cuda12-whisperx"
amd: "rocm-whisperx"
metal: "metal-whisperx"
default: "cpu-whisperx"
nvidia-cuda-13: "cuda13-whisperx"
nvidia-cuda-12: "cuda12-whisperx"
@@ -401,7 +340,6 @@
intel: "intel-kokoro"
amd: "rocm-kokoro"
nvidia-l4t: "nvidia-l4t-kokoro"
metal: "metal-kokoro"
nvidia-cuda-13: "cuda13-kokoro"
nvidia-cuda-12: "cuda12-kokoro"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-kokoro"
@@ -426,25 +364,9 @@
nvidia: "cuda12-coqui"
intel: "intel-coqui"
amd: "rocm-coqui"
metal: "metal-coqui"
nvidia-cuda-13: "cuda13-coqui"
nvidia-cuda-12: "cuda12-coqui"
icon: https://avatars.githubusercontent.com/u/1338804?s=200&v=4
- &outetts
urls:
- https://github.com/OuteAI/outetts
description: |
OuteTTS is an open-weight text-to-speech model from OuteAI (OuteAI/OuteTTS-0.3-1B).
Supports custom speaker voices via audio path or default speakers.
tags:
- text-to-speech
- TTS
license: apache-2.0
name: "outetts"
alias: "outetts"
capabilities:
default: "cpu-outetts"
nvidia-cuda-12: "cuda12-outetts"
- &chatterbox
urls:
- https://github.com/resemble-ai/chatterbox
@@ -483,7 +405,6 @@
intel: "intel-vibevoice"
amd: "rocm-vibevoice"
nvidia-l4t: "nvidia-l4t-vibevoice"
metal: "metal-vibevoice"
default: "cpu-vibevoice"
nvidia-cuda-13: "cuda13-vibevoice"
nvidia-cuda-12: "cuda12-vibevoice"
@@ -506,7 +427,6 @@
intel: "intel-qwen-tts"
amd: "rocm-qwen-tts"
nvidia-l4t: "nvidia-l4t-qwen-tts"
metal: "metal-qwen-tts"
default: "cpu-qwen-tts"
nvidia-cuda-13: "cuda13-qwen-tts"
nvidia-cuda-12: "cuda12-qwen-tts"
@@ -529,7 +449,6 @@
intel: "intel-qwen-asr"
amd: "rocm-qwen-asr"
nvidia-l4t: "nvidia-l4t-qwen-asr"
metal: "metal-qwen-asr"
default: "cpu-qwen-asr"
nvidia-cuda-13: "cuda13-qwen-asr"
nvidia-cuda-12: "cuda12-qwen-asr"
@@ -551,7 +470,6 @@
nvidia: "cuda12-voxcpm"
intel: "intel-voxcpm"
amd: "rocm-voxcpm"
metal: "metal-voxcpm"
default: "cpu-voxcpm"
nvidia-cuda-13: "cuda13-voxcpm"
nvidia-cuda-12: "cuda12-voxcpm"
@@ -572,7 +490,6 @@
intel: "intel-pocket-tts"
amd: "rocm-pocket-tts"
nvidia-l4t: "nvidia-l4t-pocket-tts"
metal: "metal-pocket-tts"
default: "cpu-pocket-tts"
nvidia-cuda-13: "cuda13-pocket-tts"
nvidia-cuda-12: "cuda12-pocket-tts"
@@ -733,234 +650,31 @@
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-mlx-audio"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-mlx-audio
## mlx
- !!merge <<: *mlx
name: "cpu-mlx"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-mlx"
mirrors:
- localai/localai-backends:latest-cpu-mlx
- !!merge <<: *mlx
name: "cpu-mlx-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-mlx"
mirrors:
- localai/localai-backends:master-cpu-mlx
- !!merge <<: *mlx
name: "cuda12-mlx"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-mlx"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-mlx
- !!merge <<: *mlx
name: "cuda12-mlx-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-mlx"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-mlx
- !!merge <<: *mlx
name: "cuda13-mlx"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-mlx"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-mlx
- !!merge <<: *mlx
name: "cuda13-mlx-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-mlx"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-mlx
- !!merge <<: *mlx
name: "nvidia-l4t-mlx"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-mlx"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-mlx
- !!merge <<: *mlx
name: "nvidia-l4t-mlx-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-mlx"
mirrors:
- localai/localai-backends:master-nvidia-l4t-mlx
- !!merge <<: *mlx
name: "cuda13-nvidia-l4t-arm64-mlx"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-mlx"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-mlx
- !!merge <<: *mlx
name: "cuda13-nvidia-l4t-arm64-mlx-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-mlx"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-mlx
## mlx-vlm
- !!merge <<: *mlx-vlm
name: "cpu-mlx-vlm"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-mlx-vlm"
mirrors:
- localai/localai-backends:latest-cpu-mlx-vlm
- !!merge <<: *mlx-vlm
name: "cpu-mlx-vlm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-mlx-vlm"
mirrors:
- localai/localai-backends:master-cpu-mlx-vlm
- !!merge <<: *mlx-vlm
name: "cuda12-mlx-vlm"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-mlx-vlm"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-mlx-vlm
- !!merge <<: *mlx-vlm
name: "cuda12-mlx-vlm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-mlx-vlm"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-mlx-vlm
- !!merge <<: *mlx-vlm
name: "cuda13-mlx-vlm"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-mlx-vlm"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-mlx-vlm
- !!merge <<: *mlx-vlm
name: "cuda13-mlx-vlm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-mlx-vlm"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-mlx-vlm
- !!merge <<: *mlx-vlm
name: "nvidia-l4t-mlx-vlm"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-mlx-vlm"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-mlx-vlm
- !!merge <<: *mlx-vlm
name: "nvidia-l4t-mlx-vlm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-mlx-vlm"
mirrors:
- localai/localai-backends:master-nvidia-l4t-mlx-vlm
- !!merge <<: *mlx-vlm
name: "cuda13-nvidia-l4t-arm64-mlx-vlm"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-mlx-vlm"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-mlx-vlm
- !!merge <<: *mlx-vlm
name: "cuda13-nvidia-l4t-arm64-mlx-vlm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-mlx-vlm"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-mlx-vlm
## mlx-audio
- !!merge <<: *mlx-audio
name: "cpu-mlx-audio"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-mlx-audio"
mirrors:
- localai/localai-backends:latest-cpu-mlx-audio
- !!merge <<: *mlx-audio
name: "cpu-mlx-audio-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-mlx-audio"
mirrors:
- localai/localai-backends:master-cpu-mlx-audio
- !!merge <<: *mlx-audio
name: "cuda12-mlx-audio"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-mlx-audio"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-mlx-audio
- !!merge <<: *mlx-audio
name: "cuda12-mlx-audio-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-mlx-audio"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-mlx-audio
- !!merge <<: *mlx-audio
name: "cuda13-mlx-audio"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-mlx-audio"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-mlx-audio
- !!merge <<: *mlx-audio
name: "cuda13-mlx-audio-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-mlx-audio"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-mlx-audio
- !!merge <<: *mlx-audio
name: "nvidia-l4t-mlx-audio"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-mlx-audio"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-mlx-audio
- !!merge <<: *mlx-audio
name: "nvidia-l4t-mlx-audio-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-mlx-audio"
mirrors:
- localai/localai-backends:master-nvidia-l4t-mlx-audio
- !!merge <<: *mlx-audio
name: "cuda13-nvidia-l4t-arm64-mlx-audio"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-mlx-audio"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-mlx-audio
- !!merge <<: *mlx-audio
name: "cuda13-nvidia-l4t-arm64-mlx-audio-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-mlx-audio"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-mlx-audio
- !!merge <<: *kitten-tts
name: "kitten-tts-development"
uri: "quay.io/go-skynet/local-ai-backends:master-kitten-tts"
mirrors:
- localai/localai-backends:master-kitten-tts
- !!merge <<: *kitten-tts
name: "metal-kitten-tts"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-kitten-tts"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-kitten-tts
- !!merge <<: *kitten-tts
name: "metal-kitten-tts-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-kitten-tts"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-kitten-tts
- !!merge <<: *huggingface
name: "huggingface-development"
uri: "quay.io/go-skynet/local-ai-backends:master-huggingface"
mirrors:
- localai/localai-backends:master-huggingface
- !!merge <<: *huggingface
name: "metal-huggingface"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-huggingface"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-huggingface
- !!merge <<: *huggingface
name: "metal-huggingface-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-huggingface"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-huggingface
- !!merge <<: *local-store
name: "local-store-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-local-store"
mirrors:
- localai/localai-backends:master-cpu-local-store
- !!merge <<: *local-store
name: "metal-local-store"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-local-store"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-local-store
- !!merge <<: *local-store
name: "metal-local-store-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-local-store"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-local-store
- !!merge <<: *silero-vad
name: "silero-vad-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-silero-vad"
mirrors:
- localai/localai-backends:master-cpu-silero-vad
- !!merge <<: *silero-vad
name: "metal-silero-vad"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-silero-vad"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-silero-vad
- !!merge <<: *silero-vad
name: "metal-silero-vad-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-silero-vad"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-silero-vad
- !!merge <<: *piper
name: "piper-development"
uri: "quay.io/go-skynet/local-ai-backends:master-piper"
mirrors:
- localai/localai-backends:master-piper
- !!merge <<: *piper
name: "metal-piper"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-piper"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-piper
- !!merge <<: *piper
name: "metal-piper-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-piper"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-piper
## llama-cpp
- !!merge <<: *llamacpp
name: "nvidia-l4t-arm64-llama-cpp"
@@ -1334,7 +1048,6 @@
intel: "intel-rfdetr-development"
#amd: "rocm-rfdetr-development"
nvidia-l4t: "nvidia-l4t-arm64-rfdetr-development"
metal: "metal-rfdetr-development"
default: "cpu-rfdetr-development"
nvidia-cuda-13: "cuda13-rfdetr-development"
- !!merge <<: *rfdetr
@@ -1402,16 +1115,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-rfdetr"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-rfdetr
- !!merge <<: *rfdetr
name: "metal-rfdetr"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-rfdetr"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-rfdetr
- !!merge <<: *rfdetr
name: "metal-rfdetr-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-rfdetr"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-rfdetr
## Rerankers
- !!merge <<: *rerankers
name: "rerankers-development"
@@ -1419,7 +1122,6 @@
nvidia: "cuda12-rerankers-development"
intel: "intel-rerankers-development"
amd: "rocm-rerankers-development"
metal: "metal-rerankers-development"
nvidia-cuda-13: "cuda13-rerankers-development"
- !!merge <<: *rerankers
name: "cuda12-rerankers"
@@ -1461,16 +1163,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-rerankers"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-rerankers
- !!merge <<: *rerankers
name: "metal-rerankers"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-rerankers"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-rerankers
- !!merge <<: *rerankers
name: "metal-rerankers-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-rerankers"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-rerankers
## Transformers
- !!merge <<: *transformers
name: "transformers-development"
@@ -1478,7 +1170,6 @@
nvidia: "cuda12-transformers-development"
intel: "intel-transformers-development"
amd: "rocm-transformers-development"
metal: "metal-transformers-development"
nvidia-cuda-13: "cuda13-transformers-development"
- !!merge <<: *transformers
name: "cuda12-transformers"
@@ -1520,16 +1211,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-transformers"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-transformers
- !!merge <<: *transformers
name: "metal-transformers"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-transformers"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-transformers
- !!merge <<: *transformers
name: "metal-transformers-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-transformers"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-transformers
## Diffusers
- !!merge <<: *diffusers
name: "diffusers-development"
@@ -1621,67 +1302,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-diffusers"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-diffusers
## ace-step
- !!merge <<: *ace-step
name: "cpu-ace-step"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-ace-step"
mirrors:
- localai/localai-backends:latest-cpu-ace-step
- !!merge <<: *ace-step
name: "cpu-ace-step-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ace-step"
mirrors:
- localai/localai-backends:master-cpu-ace-step
- !!merge <<: *ace-step
name: "cuda12-ace-step"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-ace-step"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-ace-step
- !!merge <<: *ace-step
name: "cuda12-ace-step-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-ace-step"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-ace-step
- !!merge <<: *ace-step
name: "cuda13-ace-step"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ace-step"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-ace-step
- !!merge <<: *ace-step
name: "cuda13-ace-step-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ace-step"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-ace-step
- !!merge <<: *ace-step
name: "rocm-ace-step"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-ace-step"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-ace-step
- !!merge <<: *ace-step
name: "rocm-ace-step-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-ace-step"
mirrors:
- localai/localai-backends:master-gpu-rocm-hipblas-ace-step
- !!merge <<: *ace-step
name: "intel-ace-step"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-ace-step"
mirrors:
- localai/localai-backends:latest-gpu-intel-ace-step
- !!merge <<: *ace-step
name: "intel-ace-step-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-ace-step"
mirrors:
- localai/localai-backends:master-gpu-intel-ace-step
- !!merge <<: *ace-step
name: "metal-ace-step"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ace-step"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-ace-step
- !!merge <<: *ace-step
name: "metal-ace-step-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ace-step"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-ace-step
## kokoro
- !!merge <<: *kokoro
name: "kokoro-development"
@@ -1690,7 +1310,6 @@
intel: "intel-kokoro-development"
amd: "rocm-kokoro-development"
nvidia-l4t: "nvidia-l4t-kokoro-development"
metal: "metal-kokoro-development"
- !!merge <<: *kokoro
name: "cuda12-kokoro-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-kokoro"
@@ -1741,16 +1360,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-kokoro"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-kokoro
- !!merge <<: *kokoro
name: "metal-kokoro"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-kokoro"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-kokoro
- !!merge <<: *kokoro
name: "metal-kokoro-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-kokoro"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-kokoro
## faster-whisper
- !!merge <<: *faster-whisper
name: "faster-whisper-development"
@@ -1758,7 +1367,6 @@
nvidia: "cuda12-faster-whisper-development"
intel: "intel-faster-whisper-development"
amd: "rocm-faster-whisper-development"
metal: "metal-faster-whisper-development"
nvidia-cuda-13: "cuda13-faster-whisper-development"
- !!merge <<: *faster-whisper
name: "cuda12-faster-whisper-development"
@@ -1790,16 +1398,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-faster-whisper"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-faster-whisper
- !!merge <<: *faster-whisper
name: "metal-faster-whisper"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-faster-whisper"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-faster-whisper
- !!merge <<: *faster-whisper
name: "metal-faster-whisper-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-faster-whisper"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-faster-whisper
## moonshine
- !!merge <<: *moonshine
name: "moonshine-development"
@@ -1838,23 +1436,12 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-moonshine"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-moonshine
- !!merge <<: *moonshine
name: "metal-moonshine"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-moonshine"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-moonshine
- !!merge <<: *moonshine
name: "metal-moonshine-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-moonshine"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-moonshine
## whisperx
- !!merge <<: *whisperx
name: "whisperx-development"
capabilities:
nvidia: "cuda12-whisperx-development"
amd: "rocm-whisperx-development"
metal: "metal-whisperx-development"
default: "cpu-whisperx-development"
nvidia-cuda-13: "cuda13-whisperx-development"
nvidia-cuda-12: "cuda12-whisperx-development"
@@ -1898,16 +1485,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-whisperx"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-whisperx
- !!merge <<: *whisperx
name: "metal-whisperx"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-whisperx"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-whisperx
- !!merge <<: *whisperx
name: "metal-whisperx-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-whisperx"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-whisperx
## coqui
- !!merge <<: *coqui
@@ -1916,7 +1493,6 @@
nvidia: "cuda12-coqui-development"
intel: "intel-coqui-development"
amd: "rocm-coqui-development"
metal: "metal-coqui-development"
- !!merge <<: *coqui
name: "cuda12-coqui"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-coqui"
@@ -1947,42 +1523,6 @@
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-coqui"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-coqui
- !!merge <<: *coqui
name: "metal-coqui"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-coqui"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-coqui
- !!merge <<: *coqui
name: "metal-coqui-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-coqui"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-coqui
## outetts
- !!merge <<: *outetts
name: "outetts-development"
capabilities:
default: "cpu-outetts-development"
nvidia-cuda-12: "cuda12-outetts-development"
- !!merge <<: *outetts
name: "cpu-outetts"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-outetts"
mirrors:
- localai/localai-backends:latest-cpu-outetts
- !!merge <<: *outetts
name: "cpu-outetts-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-outetts"
mirrors:
- localai/localai-backends:master-cpu-outetts
- !!merge <<: *outetts
name: "cuda12-outetts"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-outetts"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-outetts
- !!merge <<: *outetts
name: "cuda12-outetts-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-outetts"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-outetts
## chatterbox
- !!merge <<: *chatterbox
name: "chatterbox-development"
@@ -2063,7 +1603,6 @@
intel: "intel-vibevoice-development"
amd: "rocm-vibevoice-development"
nvidia-l4t: "nvidia-l4t-vibevoice-development"
metal: "metal-vibevoice-development"
default: "cpu-vibevoice-development"
nvidia-cuda-13: "cuda13-vibevoice-development"
nvidia-cuda-12: "cuda12-vibevoice-development"
@@ -2139,16 +1678,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice
- !!merge <<: *vibevoice
name: "metal-vibevoice"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-vibevoice"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-vibevoice
- !!merge <<: *vibevoice
name: "metal-vibevoice-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vibevoice"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-vibevoice
## qwen-tts
- !!merge <<: *qwen-tts
name: "qwen-tts-development"
@@ -2157,7 +1686,6 @@
intel: "intel-qwen-tts-development"
amd: "rocm-qwen-tts-development"
nvidia-l4t: "nvidia-l4t-qwen-tts-development"
metal: "metal-qwen-tts-development"
default: "cpu-qwen-tts-development"
nvidia-cuda-13: "cuda13-qwen-tts-development"
nvidia-cuda-12: "cuda12-qwen-tts-development"
@@ -2233,16 +1761,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-tts
- !!merge <<: *qwen-tts
name: "metal-qwen-tts"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-qwen-tts"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-qwen-tts
- !!merge <<: *qwen-tts
name: "metal-qwen-tts-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-qwen-tts"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-qwen-tts
## qwen-asr
- !!merge <<: *qwen-asr
name: "qwen-asr-development"
@@ -2251,7 +1769,6 @@
intel: "intel-qwen-asr-development"
amd: "rocm-qwen-asr-development"
nvidia-l4t: "nvidia-l4t-qwen-asr-development"
metal: "metal-qwen-asr-development"
default: "cpu-qwen-asr-development"
nvidia-cuda-13: "cuda13-qwen-asr-development"
nvidia-cuda-12: "cuda12-qwen-asr-development"
@@ -2327,16 +1844,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-asr"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-qwen-asr
- !!merge <<: *qwen-asr
name: "metal-qwen-asr"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-qwen-asr"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-qwen-asr
- !!merge <<: *qwen-asr
name: "metal-qwen-asr-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-qwen-asr"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-qwen-asr
## voxcpm
- !!merge <<: *voxcpm
name: "voxcpm-development"
@@ -2344,7 +1851,6 @@
nvidia: "cuda12-voxcpm-development"
intel: "intel-voxcpm-development"
amd: "rocm-voxcpm-development"
metal: "metal-voxcpm-development"
default: "cpu-voxcpm-development"
nvidia-cuda-13: "cuda13-voxcpm-development"
nvidia-cuda-12: "cuda12-voxcpm-development"
@@ -2398,16 +1904,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voxcpm"
mirrors:
- localai/localai-backends:master-gpu-rocm-hipblas-voxcpm
- !!merge <<: *voxcpm
name: "metal-voxcpm"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voxcpm"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-voxcpm
- !!merge <<: *voxcpm
name: "metal-voxcpm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voxcpm"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-voxcpm
## pocket-tts
- !!merge <<: *pocket-tts
name: "pocket-tts-development"
@@ -2416,7 +1912,6 @@
intel: "intel-pocket-tts-development"
amd: "rocm-pocket-tts-development"
nvidia-l4t: "nvidia-l4t-pocket-tts-development"
metal: "metal-pocket-tts-development"
default: "cpu-pocket-tts-development"
nvidia-cuda-13: "cuda13-pocket-tts-development"
nvidia-cuda-12: "cuda12-pocket-tts-development"
@@ -2492,13 +1987,3 @@
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-pocket-tts"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-pocket-tts
- !!merge <<: *pocket-tts
name: "metal-pocket-tts"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-pocket-tts"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-pocket-tts
- !!merge <<: *pocket-tts
name: "metal-pocket-tts-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-pocket-tts"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-pocket-tts

View File

@@ -1,16 +0,0 @@
# Running plain `make` installs the backend.
.DEFAULT_GOAL := install

.PHONY: install
install:
	bash install.sh

# Remove generated gRPC protobuf stubs.
.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

# Full cleanup: stubs plus the virtualenv and bytecode caches.
.PHONY: clean
clean: protogen-clean
	rm -rf venv __pycache__

# Declared .PHONY so a stray file/dir named "test" can't shadow the target.
.PHONY: test
test: install
	bash test.sh

View File

@@ -1,472 +0,0 @@
#!/usr/bin/env python3
"""
LocalAI ACE-Step Backend
gRPC backend for ACE-Step 1.5 music generation. Aligns with upstream acestep API:
- LoadModel: initializes AceStepHandler (DiT) and LLMHandler, parses Options.
- SoundGeneration: uses create_sample (simple mode), format_sample (optional), then
generate_music from acestep.inference. Writes first output to request.dst.
- Fail hard: no fallback WAV on error; exceptions propagate to gRPC.
"""
from concurrent import futures
import argparse
import shutil
import signal
import sys
import os
import tempfile
import backend_pb2
import backend_pb2_grpc
import grpc
from acestep.inference import (
GenerationParams,
GenerationConfig,
generate_music,
create_sample,
format_sample,
)
from acestep.handler import AceStepHandler
from acestep.llm_inference import LLMHandler
from acestep.model_downloader import ensure_lm_model
# Sleep interval used to keep the serving process alive — presumably consumed by
# the gRPC serve loop defined later in this file; TODO confirm.
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# Thread-pool size for the gRPC server; overridable via env, defaults to 1.
MAX_WORKERS = int(os.environ.get("PYTHON_GRPC_MAX_WORKERS", "1"))
# Model name -> HuggingFace/ModelScope repo (from upstream api_server.py)
MODEL_REPO_MAPPING = {
    "acestep-v15-turbo": "ACE-Step/Ace-Step1.5",
    "acestep-5Hz-lm-0.6B": "ACE-Step/Ace-Step1.5",
    "acestep-5Hz-lm-1.7B": "ACE-Step/Ace-Step1.5",
    "vae": "ACE-Step/Ace-Step1.5",
    "Qwen3-Embedding-0.6B": "ACE-Step/Ace-Step1.5",
    "acestep-v15-base": "ACE-Step/acestep-v15-base",
    "acestep-v15-sft": "ACE-Step/acestep-v15-sft",
    "acestep-v15-turbo-shift3": "ACE-Step/acestep-v15-turbo-shift3",
    "acestep-5Hz-lm-4B": "ACE-Step/acestep-5Hz-lm-4B",
}
# Fallback repo for model names not present in MODEL_REPO_MAPPING —
# NOTE(review): neither constant is referenced in the visible portion of this
# file; verify they are used by the download path further down.
DEFAULT_REPO_ID = "ACE-Step/Ace-Step1.5"
def _is_float(s):
try:
float(s)
return True
except (ValueError, TypeError):
return False
def _is_int(s):
try:
int(s)
return True
except (ValueError, TypeError):
return False
def _parse_timesteps(s):
if s is None or (isinstance(s, str) and not s.strip()):
return None
if isinstance(s, (list, tuple)):
return [float(x) for x in s]
try:
return [float(x.strip()) for x in str(s).split(",") if x.strip()]
except (ValueError, TypeError):
return None
def _parse_options(opts_list):
    """Parse repeated 'key:value' options into a dict. Coerce numeric and bool."""
    parsed = {}
    for entry in opts_list or []:
        key, sep, raw = entry.partition(":")
        if not sep:
            # No ':' delimiter -> not a key:value pair; silently skipped.
            continue
        key = key.strip()
        raw = raw.strip()
        # Coercion order matters: int before float so "3" stays an int.
        if _is_int(raw):
            parsed[key] = int(raw)
        elif _is_float(raw):
            parsed[key] = float(raw)
        elif raw.lower() in ("true", "false"):
            parsed[key] = raw.lower() == "true"
        else:
            parsed[key] = raw
    return parsed
def _generate_audio_sync(servicer, payload, dst_path):
    """
    Run full ACE-Step pipeline using acestep.inference:
    - If sample_mode/sample_query: create_sample() for caption/lyrics/metadata.
    - If use_format and caption/lyrics: format_sample().
    - Build GenerationParams and GenerationConfig, then generate_music().
    Writes the first generated audio to dst_path. Raises on failure.

    Args:
        servicer: BackendServicer carrying parsed options and the two handlers.
        payload: per-request dict of generation settings; mutated in place below
            (backend options are merged in as defaults).
        dst_path: destination file path for the first generated audio.
    """
    opts = servicer.options
    dit_handler = servicer.dit_handler
    llm_handler = servicer.llm_handler
    # Backend-level Options act as defaults: request payload keys take priority.
    for key, value in opts.items():
        if key not in payload:
            payload[key] = value

    def _opt(name, default):
        # Option lookup with fallback (options only, not the merged payload).
        return opts.get(name, default)

    lm_temperature = _opt("temperature", 0.85)
    # lm_cfg_scale falls back to a generic cfg_scale option, then 2.0.
    lm_cfg_scale = _opt("lm_cfg_scale", _opt("cfg_scale", 2.0))
    lm_top_k = opts.get("top_k")
    lm_top_p = _opt("top_p", 0.9)
    # top_p >= 1.0 means "disabled" upstream — normalize to None here.
    if lm_top_p is not None and lm_top_p >= 1.0:
        lm_top_p = None
    inference_steps = _opt("inference_steps", 8)
    guidance_scale = _opt("guidance_scale", 7.0)
    batch_size = max(1, int(_opt("batch_size", 1)))
    # "Simple mode": a free-text query is present; sample_mode additionally
    # requires thinking/sample_mode so the LM is asked to invent the song spec.
    use_simple = bool(payload.get("sample_query") or payload.get("text"))
    sample_mode = use_simple and (payload.get("thinking") or payload.get("sample_mode"))
    sample_query = (payload.get("sample_query") or payload.get("text") or "").strip()
    use_format = bool(payload.get("use_format"))
    caption = (payload.get("prompt") or payload.get("caption") or "").strip()
    lyrics = (payload.get("lyrics") or "").strip()
    vocal_language = (payload.get("vocal_language") or "en").strip()
    instrumental = bool(payload.get("instrumental"))
    bpm = payload.get("bpm")
    key_scale = (payload.get("key_scale") or "").strip()
    time_signature = (payload.get("time_signature") or "").strip()
    audio_duration = payload.get("audio_duration")
    if audio_duration is not None:
        try:
            audio_duration = float(audio_duration)
        except (TypeError, ValueError):
            # Unparseable duration degrades to "unspecified" rather than failing.
            audio_duration = None
    # Stage 1 (optional): let the LM draft caption/lyrics/metadata from the query.
    if sample_mode and llm_handler and getattr(llm_handler, "llm_initialized", False):
        parsed_language = None
        if sample_query:
            # Crude substring sniff of the query for a language hint — only the
            # "english"/"en" hints collapse to "en"; others pass through as-is.
            for hint in ("english", "en", "chinese", "zh", "japanese", "ja"):
                if hint in sample_query.lower():
                    parsed_language = "en" if hint == "english" or hint == "en" else hint
                    break
        vocal_lang = vocal_language if vocal_language and vocal_language != "unknown" else parsed_language
        sample_result = create_sample(
            llm_handler=llm_handler,
            query=sample_query or "NO USER INPUT",
            instrumental=instrumental,
            vocal_language=vocal_lang,
            temperature=lm_temperature,
            top_k=lm_top_k,
            top_p=lm_top_p,
            use_constrained_decoding=True,
        )
        if not sample_result.success:
            raise RuntimeError(f"create_sample failed: {sample_result.error or sample_result.status_message}")
        # LM output overrides request values where present (bpm unconditionally).
        caption = sample_result.caption or caption
        lyrics = sample_result.lyrics or lyrics
        bpm = sample_result.bpm
        key_scale = sample_result.keyscale or key_scale
        time_signature = sample_result.timesignature or time_signature
        if sample_result.duration is not None:
            audio_duration = sample_result.duration
        if getattr(sample_result, "language", None):
            vocal_language = sample_result.language
    # Stage 2 (optional): have the LM reformat/complete caption+lyrics+metadata.
    if use_format and (caption or lyrics) and llm_handler and getattr(llm_handler, "llm_initialized", False):
        user_metadata = {}
        if bpm is not None:
            user_metadata["bpm"] = bpm
        if audio_duration is not None and float(audio_duration) > 0:
            user_metadata["duration"] = int(audio_duration)
        if key_scale:
            user_metadata["keyscale"] = key_scale
        if time_signature:
            user_metadata["timesignature"] = time_signature
        if vocal_language and vocal_language != "unknown":
            user_metadata["language"] = vocal_language
        format_result = format_sample(
            llm_handler=llm_handler,
            caption=caption,
            lyrics=lyrics,
            user_metadata=user_metadata if user_metadata else None,
            temperature=lm_temperature,
            top_k=lm_top_k,
            top_p=lm_top_p,
            use_constrained_decoding=True,
        )
        # Formatting is best-effort: failures are ignored and the prior values kept.
        if format_result.success:
            caption = format_result.caption or caption
            lyrics = format_result.lyrics or lyrics
            if format_result.duration is not None:
                audio_duration = format_result.duration
            if format_result.bpm is not None:
                bpm = format_result.bpm
            if format_result.keyscale:
                key_scale = format_result.keyscale
            if format_result.timesignature:
                time_signature = format_result.timesignature
            if getattr(format_result, "language", None):
                vocal_language = format_result.language
    thinking = bool(payload.get("thinking"))
    # When sample_mode already produced metadata, skip chain-of-thought metas.
    use_cot_metas = not sample_mode
    # Stage 3: build the generation request for the diffusion (DiT) pipeline.
    params = GenerationParams(
        task_type=payload.get("task_type", "text2music"),
        instruction=payload.get("instruction", "Fill the audio semantic mask based on the given conditions:"),
        reference_audio=payload.get("reference_audio_path"),
        src_audio=payload.get("src_audio_path"),
        audio_codes=payload.get("audio_code_string", ""),
        caption=caption,
        lyrics=lyrics,
        # Empty lyrics or the literal [inst]/[instrumental] markers imply instrumental.
        instrumental=instrumental or (not lyrics or str(lyrics).strip().lower() in ("[inst]", "[instrumental]")),
        vocal_language=vocal_language or "unknown",
        bpm=bpm,
        keyscale=key_scale,
        timesignature=time_signature,
        # -1.0 signals "model decides the duration".
        duration=float(audio_duration) if audio_duration and float(audio_duration) > 0 else -1.0,
        inference_steps=inference_steps,
        seed=int(payload.get("seed", -1)),
        guidance_scale=guidance_scale,
        use_adg=bool(payload.get("use_adg")),
        cfg_interval_start=float(payload.get("cfg_interval_start", 0.0)),
        cfg_interval_end=float(payload.get("cfg_interval_end", 1.0)),
        shift=float(payload.get("shift", 1.0)),
        infer_method=(payload.get("infer_method") or "ode").strip(),
        timesteps=_parse_timesteps(payload.get("timesteps")),
        repainting_start=float(payload.get("repainting_start", 0.0)),
        repainting_end=float(payload.get("repainting_end", -1)) if payload.get("repainting_end") is not None else -1,
        audio_cover_strength=float(payload.get("audio_cover_strength", 1.0)),
        thinking=thinking,
        lm_temperature=lm_temperature,
        lm_cfg_scale=lm_cfg_scale,
        lm_top_k=lm_top_k or 0,
        # Re-apply upstream's 0.9 default since lm_top_p may have been nulled above.
        lm_top_p=lm_top_p if lm_top_p is not None and lm_top_p < 1.0 else 0.9,
        lm_negative_prompt=payload.get("lm_negative_prompt", "NO USER INPUT"),
        use_cot_metas=use_cot_metas,
        use_cot_caption=bool(payload.get("use_cot_caption", True)),
        use_cot_language=bool(payload.get("use_cot_language", True)),
        use_constrained_decoding=True,
    )
    config = GenerationConfig(
        batch_size=batch_size,
        allow_lm_batch=bool(payload.get("allow_lm_batch", False)),
        use_random_seed=bool(payload.get("use_random_seed", True)),
        seeds=payload.get("seeds"),
        lm_batch_chunk_size=max(1, int(payload.get("lm_batch_chunk_size", 8))),
        constrained_decoding_debug=bool(payload.get("constrained_decoding_debug")),
        audio_format=(payload.get("audio_format") or "flac").strip() or "flac",
    )
    # Generate into a scratch dir, copy the first result out, then clean up.
    save_dir = tempfile.mkdtemp(prefix="ace_step_")
    try:
        result = generate_music(
            dit_handler=dit_handler,
            llm_handler=llm_handler if (llm_handler and getattr(llm_handler, "llm_initialized", False)) else None,
            params=params,
            config=config,
            save_dir=save_dir,
            progress=None,
        )
        if not result.success:
            raise RuntimeError(result.error or result.status_message or "generate_music failed")
        audios = result.audios or []
        if not audios:
            raise RuntimeError("generate_music returned no audio")
        # Only the first generated clip is returned to the caller (request.dst).
        first_path = audios[0].get("path") or ""
        if not first_path or not os.path.isfile(first_path):
            raise RuntimeError("first generated audio path missing or not a file")
        shutil.copy2(first_path, dst_path)
    finally:
        # Best-effort cleanup; never mask the real error from the try block.
        try:
            shutil.rmtree(save_dir, ignore_errors=True)
        except Exception:
            pass
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """gRPC servicer exposing ACE-Step music generation to LocalAI.

    Implements Health, LoadModel, SoundGeneration and TTS. TTS is a
    music-generation fallback: ACE-Step has no dedicated speech model,
    so TTS requests are routed through the same music pipeline.
    """

    def __init__(self):
        # All state is populated by LoadModel; None/empty until then.
        self.model_path = None
        self.model_dir = None
        self.checkpoint_dir = None
        self.project_root = None
        self.options = {}
        self.dit_handler = None
        self.llm_handler = None

    def Health(self, request, context):
        # Liveness probe used by LocalAI to detect a ready backend.
        return backend_pb2.Reply(message=b"OK")

    def LoadModel(self, request, context):
        """Initialize the DiT handler and, optionally, the LM handler.

        Never raises: all failures are reported via Result(success=False)
        so the gRPC caller gets a structured error instead of an exception.
        """
        try:
            self.options = _parse_options(list(getattr(request, "Options", []) or []))
            model_path = getattr(request, "ModelPath", None) or ""
            model_name = (request.Model or "").strip()
            model_file = (getattr(request, "ModelFile", None) or "").strip()
            # Model dir: where we store checkpoints (always under LocalAI models path, never backend dir)
            if model_path and model_name:
                model_dir = os.path.join(model_path, model_name)
            elif model_file:
                model_dir = model_file
            else:
                model_dir = os.path.abspath(model_name or ".")
            self.model_dir = model_dir
            self.checkpoint_dir = os.path.join(model_dir, "checkpoints")
            self.project_root = model_dir
            self.model_path = os.path.join(self.checkpoint_dir, model_name or os.path.basename(model_dir.rstrip("/\\")))
            config_path = model_name or os.path.basename(model_dir.rstrip("/\\"))
            os.makedirs(self.checkpoint_dir, exist_ok=True)
            self.dit_handler = AceStepHandler()
            # Patch handler so it uses our model dir instead of site-packages/checkpoints
            self.dit_handler._get_project_root = lambda: self.project_root
            device = self.options.get("device", "auto")
            # Options arrive as strings from the proto; coerce common truthy spellings.
            use_flash = self.options.get("use_flash_attention", True)
            if isinstance(use_flash, str):
                use_flash = str(use_flash).lower() in ("1", "true", "yes")
            offload = self.options.get("offload_to_cpu", False)
            if isinstance(offload, str):
                offload = str(offload).lower() in ("1", "true", "yes")
            status_msg, ok = self.dit_handler.initialize_service(
                project_root=self.project_root,
                config_path=config_path,
                device=device,
                use_flash_attention=use_flash,
                compile_model=False,
                offload_to_cpu=offload,
                offload_dit_to_cpu=bool(self.options.get("offload_dit_to_cpu", False)),
            )
            if not ok:
                return backend_pb2.Result(success=False, message=f"DiT init failed: {status_msg}")
            self.llm_handler = None
            if self.options.get("init_lm", True):
                lm_model = self.options.get("lm_model_path", "acestep-5Hz-lm-0.6B")
                # Ensure LM model is downloaded before initializing
                try:
                    from pathlib import Path
                    lm_success, lm_msg = ensure_lm_model(
                        model_name=lm_model,
                        checkpoints_dir=Path(self.checkpoint_dir),
                        prefer_source=None,  # Auto-detect HuggingFace vs ModelScope
                    )
                    if not lm_success:
                        print(f"[ace-step] Warning: LM model download failed: {lm_msg}", file=sys.stderr)
                        # Continue anyway - LLM initialization will fail gracefully
                    else:
                        print(f"[ace-step] LM model ready: {lm_msg}", file=sys.stderr)
                except Exception as e:
                    print(f"[ace-step] Warning: LM model download check failed: {e}", file=sys.stderr)
                    # Continue anyway - LLM initialization will fail gracefully
                self.llm_handler = LLMHandler()
                lm_backend = (self.options.get("lm_backend") or "vllm").strip().lower()
                if lm_backend not in ("vllm", "pt"):
                    lm_backend = "vllm"
                lm_status, lm_ok = self.llm_handler.initialize(
                    checkpoint_dir=self.checkpoint_dir,
                    lm_model_path=lm_model,
                    backend=lm_backend,
                    device=device,
                    offload_to_cpu=offload,
                    dtype=getattr(self.dit_handler, "dtype", None),
                )
                if not lm_ok:
                    # The LM is optional: fall back to DiT-only generation.
                    self.llm_handler = None
                    print(f"[ace-step] LM init failed (optional): {lm_status}", file=sys.stderr)
            print(f"[ace-step] LoadModel: model={self.model_path}, options={list(self.options.keys())}", file=sys.stderr)
            return backend_pb2.Result(success=True, message="Model loaded successfully")
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"LoadModel error: {err}")

    def SoundGeneration(self, request, context):
        """Generate music into request.dst.

        Uses "simple" mode (LM-driven, from request.text) when text is set,
        otherwise full mode built from caption/lyrics and optional musical
        metadata (bpm, key scale, time signature, duration, source audio).
        """
        if not request.dst:
            return backend_pb2.Result(success=False, message="request.dst is required")
        use_simple = bool(request.text)
        if use_simple:
            payload = {
                "sample_query": request.text or "",
                "sample_mode": True,
                "thinking": True,
                # NOTE(review): GetLanguage()/GetCaption() look like accessors from a
                # proto wrapper rather than plain proto3 fields — confirm they exist.
                "vocal_language": request.language or request.GetLanguage() or "en",
                "instrumental": request.instrumental if request.HasField("instrumental") else False,
            }
        else:
            caption = request.caption or request.GetCaption() or request.text
            payload = {
                "prompt": caption,
                # Fixed: was `request.lyrics or request.lyrics or ""` (duplicated operand).
                "lyrics": request.lyrics or "",
                "thinking": request.think if request.HasField("think") else False,
                "vocal_language": request.language or request.GetLanguage() or "en",
            }
            if request.HasField("bpm"):
                payload["bpm"] = request.bpm
            if request.HasField("keyscale") and request.keyscale:
                payload["key_scale"] = request.keyscale
            if request.HasField("timesignature") and request.timesignature:
                payload["time_signature"] = request.timesignature
            if request.HasField("duration") and request.duration:
                # request.duration is known truthy here, so no None fallback is needed.
                payload["audio_duration"] = int(request.duration)
            if request.src:
                payload["src_audio_path"] = request.src
        _generate_audio_sync(self, payload, request.dst)
        return backend_pb2.Result(success=True, message="Sound generated successfully")

    def TTS(self, request, context):
        """Compatibility TTS endpoint: renders request.text as music (no speech model)."""
        if not request.dst:
            return backend_pb2.Result(success=False, message="request.dst is required")
        payload = {
            "sample_query": request.text,
            "sample_mode": True,
            "thinking": False,
            "vocal_language": (request.language if request.language else "") or "en",
            "instrumental": False,
        }
        _generate_audio_sync(self, payload, request.dst)
        return backend_pb2.Result(success=True, message="TTS (music fallback) generated successfully")
def serve(address):
    """Start the blocking gRPC server on the given address.

    Installs SIGINT/SIGTERM handlers that stop the server, then parks the
    main thread in a sleep loop until interrupted.
    """
    server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
        options=[
            # NOTE(review): "grpc.max_message_length" is a legacy key; the two
            # send/receive keys below are the ones modern gRPC honors.
            ("grpc.max_message_length", 50 * 1024 * 1024),
            ("grpc.max_send_message_length", 50 * 1024 * 1024),
            ("grpc.max_receive_message_length", 50 * 1024 * 1024),
        ],
    )
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print(f"[ace-step] Server listening on {address}", file=sys.stderr)

    def shutdown(sig, frame):
        # Stop immediately (grace period 0) and exit the process.
        server.stop(0)
        sys.exit(0)

    signal.signal(signal.SIGINT, shutdown)
    signal.signal(signal.SIGTERM, shutdown)
    # Hoisted out of the loop: the original re-imported time on every iteration.
    import time
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
if __name__ == "__main__":
    # CLI entry point: parse the listen address and run the gRPC server.
    parser = argparse.ArgumentParser()
    parser.add_argument("--addr", default="localhost:50051", help="Listen address")
    args = parser.parse_args()
    serve(args.addr)

View File

@@ -1,26 +0,0 @@
#!/bin/bash
# Install script for the ACE-Step backend: sets up the Python environment,
# then clones and installs ACE-Step 1.5 on first run.
set -e

# Fixed: quote "$0"/"$backend_dir" so paths containing spaces do not break
# the directory test and the source commands.
backend_dir=$(dirname "$0")

# Locate the shared helper library (path differs when invoked from repo root).
if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
else
    source "$backend_dir/../common/libbackend.sh"
fi

PYTHON_VERSION="3.11"
PYTHON_PATCH="14"
PY_STANDALONE_TAG="20260203"

installRequirements

# Clone + install ACE-Step only once; a pre-existing checkout is left as-is.
if [ ! -d ACE-Step-1.5 ]; then
    git clone https://github.com/ace-step/ACE-Step-1.5
    cd ACE-Step-1.5/
    # --no-deps: dependencies are pinned by the backend's own requirements files.
    if [ "x${USE_PIP}" == "xtrue" ]; then
        pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
    else
        uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
    fi
fi

View File

@@ -1,22 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cpu
torch
torchaudio
torchvision
# Core dependencies
transformers>=4.51.0,<4.58.0
diffusers
gradio
matplotlib>=3.7.5
scipy>=1.10.1
soundfile>=0.13.1
loguru>=0.7.3
einops>=0.8.1
accelerate>=1.12.0
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
numba>=0.63.1
vector-quantize-pytorch>=1.27.15
torchcodec>=0.9.1
torchao
modelscope

View File

@@ -1,30 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu121
torch
torchaudio
torchvision
# Core dependencies
transformers>=4.51.0,<4.58.0
diffusers
gradio>=6.5.1
matplotlib>=3.7.5
scipy>=1.10.1
soundfile>=0.13.1
loguru>=0.7.3
einops>=0.8.1
accelerate>=1.12.0
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
numba>=0.63.1
vector-quantize-pytorch>=1.27.15
torchcodec>=0.9.1; sys_platform != 'darwin' or platform_machine == 'arm64'
torchao
modelscope
# LoRA Training dependencies (optional)
peft>=0.7.0
lightning>=2.0.0
triton>=3.0.0
flash-attn
xxhash

View File

@@ -1,30 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu130
torch==2.7.1
torchaudio==2.7.1
torchvision
# Core dependencies
transformers>=4.51.0,<4.58.0
diffusers
gradio>=6.5.1
matplotlib>=3.7.5
scipy>=1.10.1
soundfile>=0.13.1
loguru>=0.7.3
einops>=0.8.1
accelerate>=1.12.0
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
numba>=0.63.1
vector-quantize-pytorch>=1.27.15
torchcodec>=0.9.1; sys_platform != 'darwin' or platform_machine == 'arm64'
torchao
modelscope
# LoRA Training dependencies (optional)
peft>=0.7.0
lightning>=2.0.0
triton>=3.0.0
flash-attn
xxhash

View File

@@ -1,22 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.4
torch==2.8.0+rocm6.4
torchaudio
torchvision
# Core dependencies
transformers>=4.51.0,<4.58.0
diffusers
gradio>=6.5.1
matplotlib>=3.7.5
scipy>=1.10.1
soundfile>=0.13.1
loguru>=0.7.3
einops>=0.8.1
accelerate>=1.12.0
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
numba>=0.63.1
vector-quantize-pytorch>=1.27.15
torchcodec>=0.9.1; sys_platform != 'darwin' or platform_machine == 'arm64'
torchao
modelscope

View File

@@ -1,26 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/xpu
torch
torchaudio
torchvision
# Core dependencies
transformers>=4.51.0,<4.58.0
diffusers
gradio
matplotlib>=3.7.5
scipy>=1.10.1
soundfile>=0.13.1
loguru>=0.7.3
einops>=0.8.1
accelerate>=1.12.0
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
numba>=0.63.1
vector-quantize-pytorch>=1.27.15
torchcodec>=0.9.1
torchao
modelscope
# LoRA Training dependencies (optional)
peft>=0.7.0
lightning>=2.0.0

View File

@@ -1,29 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu130
torch
torchaudio
torchvision
# Core dependencies
transformers>=4.51.0,<4.58.0
diffusers
gradio>=6.5.1
matplotlib>=3.7.5
scipy>=1.10.1
soundfile>=0.13.1
loguru>=0.7.3
einops>=0.8.1
accelerate>=1.12.0
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
numba>=0.63.1
vector-quantize-pytorch>=1.27.15
torchcodec>=0.9.1; sys_platform != 'darwin' or platform_machine == 'arm64'
torchao
modelscope
# LoRA Training dependencies (optional)
peft>=0.7.0
lightning>=2.0.0
triton>=3.0.0
#flash-attn
xxhash

View File

@@ -1,25 +0,0 @@
torch
torchaudio
torchvision
# Core dependencies
transformers>=4.51.0,<4.58.0
diffusers
gradio
matplotlib>=3.7.5
scipy>=1.10.1
soundfile>=0.13.1
loguru>=0.7.3
einops>=0.8.1
accelerate>=1.12.0
fastapi>=0.110.0
uvicorn[standard]>=0.27.0
numba>=0.63.1
vector-quantize-pytorch>=1.27.15
torchcodec>=0.9.1
torchao
modelscope
# LoRA Training dependencies (optional)
peft>=0.7.0
lightning>=2.0.0

View File

@@ -1,4 +0,0 @@
setuptools
grpcio==1.76.0
protobuf
certifi

View File

@@ -1,9 +0,0 @@
#!/bin/bash
# Launch the backend via the shared helper library.
backend_dir=$(dirname $0)
# The helper library lives beside this script or one level up, depending
# on whether the common/ directory was vendored into the backend dir.
if [ -d $backend_dir/common ]; then
source $backend_dir/common/libbackend.sh
else
source $backend_dir/../common/libbackend.sh
fi

# Delegate to the shared startBackend entry point, forwarding all arguments.
startBackend $@

View File

@@ -1,53 +0,0 @@
"""
Tests for the ACE-Step gRPC backend.
"""
import os
import tempfile
import unittest
import backend_pb2
import backend_pb2_grpc
import grpc
class TestACEStepBackend(unittest.TestCase):
    """Test Health, LoadModel, and SoundGeneration (minimal; no real model required)."""

    @classmethod
    def setUpClass(cls):
        # The backend server is expected to already be running (started by
        # test.sh); BACKEND_PORT lets parallel test runs avoid port clashes.
        port = os.environ.get("BACKEND_PORT", "50051")
        cls.channel = grpc.insecure_channel(f"localhost:{port}")
        cls.stub = backend_pb2_grpc.BackendStub(cls.channel)

    @classmethod
    def tearDownClass(cls):
        # Release the shared channel opened in setUpClass.
        cls.channel.close()

    def test_health(self):
        # Liveness check: the servicer answers b"OK" regardless of model state.
        response = self.stub.Health(backend_pb2.HealthMessage())
        self.assertEqual(response.message, b"OK")

    def test_load_model(self):
        response = self.stub.LoadModel(backend_pb2.ModelOptions(Model="ace-step-test"))
        self.assertTrue(response.success, response.message)

    def test_sound_generation_minimal(self):
        # Generate from a plain text prompt and verify a non-empty output file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            dst = f.name
        try:
            req = backend_pb2.SoundGenerationRequest(
                text="upbeat pop song",
                model="ace-step-test",
                dst=dst,
            )
            response = self.stub.SoundGeneration(req)
            self.assertTrue(response.success, response.message)
            self.assertTrue(os.path.exists(dst), f"Output file not created: {dst}")
            self.assertGreater(os.path.getsize(dst), 0)
        finally:
            # Always clean up the temp file, even on assertion failure.
            if os.path.exists(dst):
                os.unlink(dst)
if __name__ == "__main__":
    # Allow running the tests directly: python test.py
    unittest.main()

View File

@@ -1,19 +0,0 @@
#!/bin/bash
# Test driver: start the backend in the background, run the unit tests,
# and make sure the server is killed on exit.
set -e
backend_dir=$(dirname $0)
# Locate the shared helper library (path differs when invoked from repo root).
if [ -d $backend_dir/common ]; then
source $backend_dir/common/libbackend.sh
else
source $backend_dir/../common/libbackend.sh
fi

# Start backend in background (use env to avoid port conflict in parallel tests)
export PYTHONUNBUFFERED=1
BACKEND_PORT=${BACKEND_PORT:-50051}
python backend.py --addr "localhost:${BACKEND_PORT}" &
BACKEND_PID=$!
# Ensure the background server is terminated even if the tests fail.
trap "kill $BACKEND_PID 2>/dev/null || true" EXIT
# NOTE(review): fixed sleep assumes the server binds within 3s — consider polling.
sleep 3
export BACKEND_PORT
runUnittests

View File

@@ -1,7 +0,0 @@
torch
torchaudio
accelerate
numpy>=1.24.0,<1.26.0
transformers
# https://github.com/mudler/LocalAI/pull/6240#issuecomment-3329518289
chatterbox-tts@git+https://git@github.com/mudler/chatterbox.git@faster

View File

@@ -1,4 +0,0 @@
torch==2.7.1
transformers==4.48.3
accelerate
coqui-tts

View File

@@ -1,8 +0,0 @@
torch==2.7.1
faster-whisper
opencv-python
accelerate
compel
peft
sentencepiece
optimum-quanto

View File

@@ -1,5 +0,0 @@
grpcio==1.71.0
protobuf
certifi
packaging==24.1
https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl

View File

@@ -1,5 +0,0 @@
torch==2.7.1
transformers
accelerate
kokoro
soundfile

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-audio
mlx[cpu]

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-audio
mlx[cuda12]

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-audio
mlx[cuda13]

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-audio
mlx[cuda12]

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-audio
mlx[cuda13]

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-vlm
mlx[cpu]

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-vlm
mlx[cuda12]

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-vlm
mlx[cuda13]

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-vlm
mlx[cuda12]

View File

@@ -1,2 +0,0 @@
git+https://github.com/Blaizzy/mlx-vlm
mlx[cuda13]

View File

@@ -1,2 +0,0 @@
mlx-lm
mlx[cpu]

View File

@@ -1,2 +0,0 @@
mlx-lm
mlx[cuda12]

View File

@@ -1,2 +0,0 @@
mlx-lm
mlx[cuda13]

View File

@@ -1,2 +0,0 @@
mlx-lm
mlx[cuda12]

View File

@@ -1,2 +0,0 @@
mlx-lm
mlx[cuda13]

View File

@@ -1,4 +0,0 @@
grpcio==1.71.0
protobuf
grpcio-tools
useful-moonshine-onnx@git+https://git@github.com/moonshine-ai/moonshine.git#subdirectory=moonshine-onnx

View File

@@ -1,23 +0,0 @@
# Convenience targets for the outetts backend.

.PHONY: outetts
# Install the backend's Python environment and dependencies.
outetts:
	bash install.sh

.PHONY: run
# Start the backend (installing first via the outetts dependency).
run: outetts
	@echo "Running outetts..."
	bash run.sh
	@echo "outetts run."

.PHONY: test
test: outetts
	@echo "Testing outetts..."
	bash test.sh
	@echo "outetts tested."

.PHONY: protogen-clean
# Remove the generated gRPC stub modules.
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

.PHONY: clean
# Full cleanup: generated stubs, virtualenv, and bytecode caches.
clean: protogen-clean
	rm -rf venv __pycache__

View File

@@ -1,138 +0,0 @@
#!/usr/bin/env python3
"""
gRPC server for OuteTTS (OuteAI TTS) models.
"""
from concurrent import futures
import argparse
import signal
import sys
import os
import asyncio
import backend_pb2
import backend_pb2_grpc
import grpc
import outetts
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """gRPC servicer wrapping OuteTTS (OuteAI) speech synthesis."""

    def Health(self, request, context):
        # Liveness probe used by LocalAI to detect a ready backend.
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """Load the OuteTTS model, speaker profile, and generation options.

        Options are "key:value" strings; numeric values are coerced. The
        tokenizer/version/speaker options override the OuteTTS defaults.
        """
        model_name = request.Model
        if os.path.exists(request.ModelFile):
            model_name = request.ModelFile
        self.options = {}
        for opt in request.Options:
            if ":" not in opt:
                continue
            key, value = opt.split(":", 1)
            try:
                if "." in value:
                    value = float(value)
                else:
                    value = int(value)
            except ValueError:
                pass  # keep the raw string when it is not numeric
            self.options[key] = value
        MODELNAME = "OuteAI/OuteTTS-0.3-1B"
        TOKENIZER = "OuteAI/OuteTTS-0.3-1B"
        VERSION = "0.3"
        SPEAKER = "en_male_1"
        # Fixed: the original `break` after the first match aborted the whole
        # scan, so at most one of tokenizer/version/speaker could ever apply.
        # Also use split(":", 1) so values containing ":" are preserved intact.
        for opt in request.Options:
            if opt.startswith("tokenizer:"):
                TOKENIZER = opt.split(":", 1)[1]
            elif opt.startswith("version:"):
                VERSION = opt.split(":", 1)[1]
            elif opt.startswith("speaker:"):
                SPEAKER = opt.split(":", 1)[1]
        if model_name != "":
            MODELNAME = model_name
        try:
            model_config = outetts.HFModelConfig_v2(
                model_path=MODELNAME,
                tokenizer_path=TOKENIZER
            )
            self.interface = outetts.InterfaceHF(model_version=VERSION, cfg=model_config)
            self.interface.print_default_speakers()
            if request.AudioPath:
                # A reference audio clip: clone a speaker from it. Relative
                # paths are resolved against the model directory.
                if os.path.isabs(request.AudioPath):
                    self.AudioPath = request.AudioPath
                else:
                    self.AudioPath = os.path.join(request.ModelPath, request.AudioPath)
                self.speaker = self.interface.create_speaker(audio_path=self.AudioPath)
            else:
                self.speaker = self.interface.load_default_speaker(name=SPEAKER)
            # Token budget: explicit ContextSize wins over the option default.
            if request.ContextSize > 0:
                self.max_tokens = request.ContextSize
            else:
                self.max_tokens = self.options.get("max_new_tokens", 512)
        except Exception as err:
            print("Error:", err, file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def TTS(self, request, context):
        """Synthesize request.text to request.dst using the loaded speaker."""
        try:
            text = request.text if request.text else "Speech synthesis is the artificial production of human speech."
            print("[OuteTTS] generating TTS", file=sys.stderr)
            gen_cfg = outetts.GenerationConfig(
                text=text,
                temperature=self.options.get("temperature", 0.1),
                repetition_penalty=self.options.get("repetition_penalty", 1.1),
                max_length=self.max_tokens,
                speaker=self.speaker,
            )
            output = self.interface.generate(config=gen_cfg)
            print("[OuteTTS] Generated TTS", file=sys.stderr)
            output.save(request.dst)
            print("[OuteTTS] TTS done", file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)
async def serve(address):
    """Start the asyncio gRPC server and block until it terminates.

    SIGINT/SIGTERM trigger a graceful stop with a 5-second grace period.
    """
    server = grpc.aio.server(
        migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),
            ('grpc.max_send_message_length', 50 * 1024 * 1024),
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    # Fixed: get_event_loop() is deprecated inside a coroutine; the running
    # loop is the correct one to attach signal handlers to.
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        loop.add_signal_handler(
            sig, lambda: asyncio.ensure_future(server.stop(5))
        )
    await server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    await server.wait_for_termination()
if __name__ == "__main__":
    # CLI entry point: parse the listen address and run the async server.
    parser = argparse.ArgumentParser(description="Run the OuteTTS gRPC server.")
    parser.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    args = parser.parse_args()
    asyncio.run(serve(args.addr))

View File

@@ -1,11 +0,0 @@
#!/bin/bash
set -e
backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
source $backend_dir/common/libbackend.sh
else
source $backend_dir/../common/libbackend.sh
fi
installRequirements

View File

@@ -1,7 +0,0 @@
torch==2.7.1
llvmlite==0.43.0
numba==0.60.0
accelerate
bitsandbytes
outetts
protobuf==6.33.5

View File

@@ -1,7 +0,0 @@
torch==2.7.1
accelerate
llvmlite==0.43.0
numba==0.60.0
bitsandbytes
protobuf==6.33.5
outetts

View File

@@ -1,7 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu130
torch==2.9.0
llvmlite==0.43.0
numba==0.60.0
bitsandbytes
outetts
protobuf==6.33.5

View File

@@ -1,8 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.4
torch==2.8.0+rocm6.4
accelerate
llvmlite==0.43.0
numba==0.60.0
bitsandbytes
outetts
protobuf==6.33.5

View File

@@ -1,8 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/xpu
torch
optimum[openvino]
llvmlite==0.43.0
numba==0.60.0
bitsandbytes
outetts
protobuf==6.33.5

View File

@@ -1,6 +0,0 @@
grpcio==1.76.0
protobuf==6.33.5
certifi
setuptools
scipy==1.15.1
numpy>=2.0.0

View File

@@ -1,9 +0,0 @@
#!/bin/bash
# Launch the outetts backend via the shared helper library.
backend_dir=$(dirname $0)
# The helper library lives beside this script or one level up, depending
# on whether the common/ directory was vendored into the backend dir.
if [ -d $backend_dir/common ]; then
source $backend_dir/common/libbackend.sh
else
source $backend_dir/../common/libbackend.sh
fi

# Delegate to the shared startBackend entry point, forwarding all arguments.
startBackend $@

View File

@@ -1,35 +0,0 @@
"""
Test script for the OuteTTS gRPC service.
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc
import grpc
class TestBackendServicer(unittest.TestCase):
    """Smoke test: spawn the server as a subprocess and check Health."""

    def setUp(self):
        # Start the gRPC server; the fixed sleep gives it time to bind.
        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
        time.sleep(5)

    def tearDown(self):
        # Stop the server subprocess and reap it.
        self.service.terminate()
        self.service.wait()

    def test_health(self):
        # Fixed: removed the `finally: self.tearDown()` — unittest already
        # calls tearDown after each test, so the server was torn down twice.
        try:
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            self.fail(f"Health check failed: {err}")
if __name__ == "__main__":
    # Allow running the tests directly: python test.py
    unittest.main()

View File

@@ -1,11 +0,0 @@
#!/bin/bash
set -e
backend_dir=$(dirname $0)
if [ -d $backend_dir/common ]; then
source $backend_dir/common/libbackend.sh
else
source $backend_dir/../common/libbackend.sh
fi
runUnittests

View File

@@ -1,4 +1,4 @@
torch
torchaudio
torch==2.7.1
torchaudio==0.22.1
qwen-tts
sox
sox

View File

@@ -28,7 +28,7 @@ class TestBackendServicer(unittest.TestCase):
stderr=subprocess.PIPE,
text=True
)
time.sleep(30)
time.sleep(5)
def tearDown(self) -> None:
"""

View File

@@ -1,4 +0,0 @@
torch==2.7.1
transformers
accelerate
rerankers[transformers]

View File

@@ -1,7 +0,0 @@
torch==2.7.1
rfdetr
opencv-python
accelerate
peft
inference
optimum-quanto

View File

@@ -24,6 +24,7 @@ XPU=os.environ.get("XPU", "0") == "1"
from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria, MambaConfig, MambaForCausalLM
from transformers import AutoProcessor, MusicgenForConditionalGeneration, DiaForConditionalGeneration
from scipy.io import wavfile
import outetts
from sentence_transformers import SentenceTransformer
@@ -88,6 +89,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
self.CUDA = torch.cuda.is_available()
self.OV=False
self.OuteTTS=False
self.DiaTTS=False
self.SentenceTransformer = False
@@ -237,6 +239,45 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
self.processor = self.processor.to("cuda")
print("DiaForConditionalGeneration loaded", file=sys.stderr)
self.DiaTTS = True
elif request.Type == "OuteTTS":
autoTokenizer = False
options = request.Options
MODELNAME = "OuteAI/OuteTTS-0.3-1B"
TOKENIZER = "OuteAI/OuteTTS-0.3-1B"
VERSION = "0.3"
SPEAKER = "en_male_1"
for opt in options:
if opt.startswith("tokenizer:"):
TOKENIZER = opt.split(":")[1]
break
if opt.startswith("version:"):
VERSION = opt.split(":")[1]
break
if opt.startswith("speaker:"):
SPEAKER = opt.split(":")[1]
break
if model_name != "":
MODELNAME = model_name
# Configure the model
model_config = outetts.HFModelConfig_v2(
model_path=MODELNAME,
tokenizer_path=TOKENIZER
)
# Initialize the interface
self.interface = outetts.InterfaceHF(model_version=VERSION, cfg=model_config)
self.OuteTTS = True
self.interface.print_default_speakers()
if request.AudioPath:
if os.path.isabs(request.AudioPath):
self.AudioPath = request.AudioPath
else:
self.AudioPath = os.path.join(request.ModelPath, request.AudioPath)
self.speaker = self.interface.create_speaker(audio_path=self.AudioPath)
else:
self.speaker = self.interface.load_default_speaker(name=SPEAKER)
elif request.Type == "SentenceTransformer":
autoTokenizer = False
self.model = SentenceTransformer(model_name, trust_remote_code=request.TrustRemoteCode)
@@ -547,8 +588,30 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True)
def CallOuteTTS(self, request, context):
try:
print("[OuteTTS] generating TTS", file=sys.stderr)
gen_cfg = outetts.GenerationConfig(
text="Speech synthesis is the artificial production of human speech.",
temperature=self.options.get("temperature", 0.1),
repetition_penalty=self.options.get("repetition_penalty", 1.1),
max_length=self.max_tokens,
speaker=self.speaker,
# voice_characteristics="upbeat enthusiasm, friendliness, clarity, professionalism, and trustworthiness"
)
output = self.interface.generate(config=gen_cfg)
print("[OuteTTS] Generated TTS", file=sys.stderr)
output.save(request.dst)
print("[OuteTTS] TTS done", file=sys.stderr)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(success=True)
# The TTS endpoint is older, and provides fewer features, but exists for compatibility reasons
def TTS(self, request, context):
if self.OuteTTS:
return self.CallOuteTTS(request, context)
if self.DiaTTS:
print("DiaTTS", file=sys.stderr)
return self.CallDiaTTS(request, context)

View File

@@ -4,5 +4,6 @@ numba==0.60.0
accelerate
transformers
bitsandbytes
sentence-transformers==5.2.2
protobuf==6.33.5
outetts
sentence-transformers==5.2.0
protobuf==6.33.4

View File

@@ -4,5 +4,6 @@ llvmlite==0.43.0
numba==0.60.0
transformers
bitsandbytes
sentence-transformers==5.2.2
protobuf==6.33.5
outetts
sentence-transformers==5.2.0
protobuf==6.33.4

View File

@@ -4,5 +4,6 @@ llvmlite==0.43.0
numba==0.60.0
transformers
bitsandbytes
sentence-transformers==5.2.2
protobuf==6.33.5
outetts
sentence-transformers==5.2.0
protobuf==6.33.4

View File

@@ -5,5 +5,7 @@ transformers
llvmlite==0.43.0
numba==0.60.0
bitsandbytes
sentence-transformers==5.2.2
protobuf==6.33.5
outetts
bitsandbytes
sentence-transformers==5.2.0
protobuf==6.33.4

View File

@@ -5,5 +5,6 @@ llvmlite==0.43.0
numba==0.60.0
transformers
bitsandbytes
sentence-transformers==5.2.2
protobuf==6.33.5
outetts
sentence-transformers==5.2.0
protobuf==6.33.4

View File

@@ -1,8 +0,0 @@
torch==2.7.1
llvmlite==0.43.0
numba==0.60.0
accelerate
transformers
bitsandbytes
sentence-transformers==5.2.2
protobuf==6.33.5

View File

@@ -1,5 +1,5 @@
grpcio==1.76.0
protobuf==6.33.5
protobuf==6.33.4
certifi
setuptools
scipy==1.15.1

View File

@@ -8,8 +8,6 @@ else
source $backend_dir/../common/libbackend.sh
fi
if [ "x${BUILD_PROFILE}" != "xmetal" ] && [ "x${BUILD_PROFILE}" != "xmps" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy unsafe-best-match"
fi
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy unsafe-best-match"
installRequirements

View File

@@ -1,2 +0,0 @@
torch
whisperx @ git+https://github.com/m-bain/whisperX.git

View File

@@ -19,14 +19,6 @@ func SoundGeneration(
doSample *bool,
sourceFile *string,
sourceDivisor *int32,
think *bool,
caption string,
lyrics string,
bpm *int32,
keyscale string,
language string,
timesignature string,
instrumental *bool,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
modelConfig config.ModelConfig,
@@ -53,11 +45,8 @@ func SoundGeneration(
fileName := utils.GenerateUniqueFileName(audioDir, "sound_generation", ".wav")
filePath := filepath.Join(audioDir, fileName)
if filePath, err = filepath.Abs(filePath); err != nil {
return "", nil, fmt.Errorf("failed resolving sound generation path: %w", err)
}
req := &proto.SoundGenerationRequest{
res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
Text: text,
Model: modelConfig.Model,
Dst: filePath,
@@ -66,38 +55,12 @@ func SoundGeneration(
Temperature: temperature,
Src: sourceFile,
SrcDivisor: sourceDivisor,
}
if think != nil {
req.Think = think
}
if caption != "" {
req.Caption = &caption
}
if lyrics != "" {
req.Lyrics = &lyrics
}
if bpm != nil {
req.Bpm = bpm
}
if keyscale != "" {
req.Keyscale = &keyscale
}
if language != "" {
req.Language = &language
}
if timesignature != "" {
req.Timesignature = &timesignature
}
if instrumental != nil {
req.Instrumental = instrumental
}
})
res, err := soundGenModel.SoundGeneration(context.Background(), req)
if err != nil {
return "", nil, err
}
if res != nil && !res.Success {
// return RPC error if any
if !res.Success {
return "", nil, fmt.Errorf("error during sound generation: %s", res.Message)
}
return filePath, res, nil
return filePath, res, err
}

View File

@@ -100,9 +100,7 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
filePath, _, err := backend.SoundGeneration(text,
parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
inputFile, parseToInt32Ptr(t.InputFileSampleDivisor),
nil, "", "", nil, "", "", "", nil,
ml, opts, options)
inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)
if err != nil {
return err

View File

@@ -671,8 +671,7 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
}
if (u & FLAG_SOUND_GENERATION) == FLAG_SOUND_GENERATION {
soundGenBackends := []string{"transformers-musicgen", "ace-step", "mock-backend"}
if !slices.Contains(soundGenBackends, c.Backend) {
if c.Backend != "transformers-musicgen" {
return false
}
}

View File

@@ -32,22 +32,8 @@ func SoundGenerationEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader
xlog.Debug("Sound Generation Request about to be sent to backend", "modelFile", "modelFile", "backend", cfg.Backend)
language := input.Language
if language == "" {
language = input.VocalLanguage
}
var bpm *int32
if input.BPM != nil {
b := int32(*input.BPM)
bpm = &b
}
filePath, _, err := backend.SoundGeneration(
input.Text, input.Duration, input.Temperature, input.DoSample,
nil, nil,
input.Think, input.Caption, input.Lyrics, bpm, input.Keyscale,
language, input.Timesignature,
input.Instrumental,
ml, appConfig, *cfg)
// TODO: Support uploading files?
filePath, _, err := backend.SoundGeneration(input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
if err != nil {
return err
}

View File

@@ -28,7 +28,7 @@ type wrappedModel struct {
TTSConfig *config.ModelConfig
TranscriptionConfig *config.ModelConfig
LLMConfig *config.ModelConfig
VADConfig *config.ModelConfig
VADConfig *config.ModelConfig
appConfig *config.ApplicationConfig
modelLoader *model.ModelLoader
@@ -114,35 +114,6 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
})
}
}
// Add noAction function before templating so it's included in the prompt
// Allow the user to set custom actions via config file
noActionName := "answer"
noActionDescription := "use this action to answer without performing any action"
if m.LLMConfig.FunctionsConfig.NoActionFunctionName != "" {
noActionName = m.LLMConfig.FunctionsConfig.NoActionFunctionName
}
if m.LLMConfig.FunctionsConfig.NoActionDescriptionName != "" {
noActionDescription = m.LLMConfig.FunctionsConfig.NoActionDescriptionName
}
noActionGrammar := functions.Function{
Name: noActionName,
Description: noActionDescription,
Parameters: map[string]interface{}{
"properties": map[string]interface{}{
"message": map[string]interface{}{
"type": "string",
"description": "The message to reply the user with",
},
},
},
}
if !m.LLMConfig.FunctionsConfig.DisableNoAction {
funcs = append(funcs, noActionGrammar)
}
}
predInput = m.evaluator.TemplateMessages(input, input.Messages, m.LLMConfig, funcs, len(funcs) > 0)
@@ -153,29 +124,38 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
}
}
// Handle tool_choice parameter similar to the chat endpoint
if toolChoice != nil {
if toolChoice.Mode != "" {
// String values: "auto", "required", "none"
switch toolChoice.Mode {
case types.ToolChoiceModeRequired:
m.LLMConfig.SetFunctionCallString("required")
case types.ToolChoiceModeNone:
// Don't use tools
m.LLMConfig.SetFunctionCallString("none")
case types.ToolChoiceModeAuto:
// Default behavior - let model decide
}
} else if toolChoice.Function != nil {
// Specific function specified
m.LLMConfig.SetFunctionCallString(toolChoice.Function.Name)
}
}
// Generate grammar for function calling if tools are provided and grammar generation is enabled
shouldUseFn := len(tools) > 0 && m.LLMConfig.ShouldUseFunctions()
if !m.LLMConfig.FunctionsConfig.GrammarConfig.NoGrammar && shouldUseFn {
// Allow the user to set custom actions via config file
noActionName := "answer"
noActionDescription := "use this action to answer without performing any action"
if m.LLMConfig.FunctionsConfig.NoActionFunctionName != "" {
noActionName = m.LLMConfig.FunctionsConfig.NoActionFunctionName
}
if m.LLMConfig.FunctionsConfig.NoActionDescriptionName != "" {
noActionDescription = m.LLMConfig.FunctionsConfig.NoActionDescriptionName
}
noActionGrammar := functions.Function{
Name: noActionName,
Description: noActionDescription,
Parameters: map[string]interface{}{
"properties": map[string]interface{}{
"message": map[string]interface{}{
"type": "string",
"description": "The message to reply the user with",
},
},
},
}
if !m.LLMConfig.FunctionsConfig.DisableNoAction {
funcs = append(funcs, noActionGrammar)
}
// Force picking one of the functions by the request
if m.LLMConfig.FunctionToCall() != "" {
funcs = functions.Functions(funcs).Select(m.LLMConfig.FunctionToCall())
@@ -204,7 +184,7 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im
toolChoiceJSON = string(b)
}
return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias)
return backend.ModelInference(ctx, predInput, messages, images, videos, audios, m.modelLoader, m.LLMConfig, m.confLoader, m.appConfig, tokenCallback, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias, )
}
func (m *wrappedModel) TTS(ctx context.Context, text, voice, language string) (string, *proto.Result, error) {
@@ -238,11 +218,11 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
return &transcriptOnlyModel{
TranscriptionConfig: cfgSST,
VADConfig: cfgVAD,
VADConfig: cfgVAD,
confLoader: cl,
confLoader: cl,
modelLoader: ml,
appConfig: appConfig,
appConfig: appConfig,
}, cfgSST, nil
}
@@ -317,11 +297,11 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
TTSConfig: cfgTTS,
TranscriptionConfig: cfgSST,
LLMConfig: cfgLLM,
VADConfig: cfgVAD,
VADConfig: cfgVAD,
confLoader: cl,
confLoader: cl,
modelLoader: ml,
appConfig: appConfig,
evaluator: evaluator,
appConfig: appConfig,
evaluator: evaluator,
}, nil
}

View File

@@ -318,49 +318,6 @@ func RegisterUIRoutes(app *echo.Echo,
return c.Render(200, "views/tts", summary)
})
app.GET("/sound/:model", func(c echo.Context) error {
modelConfigs := cl.GetAllModelsConfigs()
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
summary := map[string]interface{}{
"Title": "LocalAI - Generate sound with " + c.Param("model"),
"BaseURL": middleware.BaseURL(c),
"ModelsConfig": modelConfigs,
"ModelsWithoutConfig": modelsWithoutConfig,
"Model": c.Param("model"),
"Version": internal.PrintableVersion(),
}
return c.Render(200, "views/sound", summary)
})
app.GET("/sound", func(c echo.Context) error {
modelConfigs := cl.GetAllModelsConfigs()
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
if len(modelConfigs)+len(modelsWithoutConfig) == 0 {
return c.Redirect(302, middleware.BaseURL(c))
}
modelThatCanBeUsed := ""
title := "LocalAI - Generate sound"
for _, b := range modelConfigs {
if b.HasUsecases(config.FLAG_SOUND_GENERATION) {
modelThatCanBeUsed = b.Name
title = "LocalAI - Generate sound with " + modelThatCanBeUsed
break
}
}
summary := map[string]interface{}{
"Title": title,
"BaseURL": middleware.BaseURL(c),
"ModelsConfig": modelConfigs,
"ModelsWithoutConfig": modelsWithoutConfig,
"Model": modelThatCanBeUsed,
"Version": internal.PrintableVersion(),
}
return c.Render(200, "views/sound", summary)
})
app.GET("/video/:model", func(c echo.Context) error {
modelConfigs := cl.GetAllModelsConfigs()
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)

View File

@@ -1,144 +0,0 @@
// Show a transient toast in the top-right corner, replacing any toast
// already on screen. `type` picks the colour scheme ('error' => red,
// anything else => green); `message` is inserted as HTML after the
// status icon. The toast removes itself after 5 seconds.
function showNotification(type, message) {
    const previous = document.getElementById('sound-notification');
    if (previous) previous.remove();

    const toast = document.createElement('div');
    toast.id = 'sound-notification';
    toast.className = 'fixed top-24 right-4 z-50 p-4 rounded-lg shadow-lg flex items-center gap-2 transition-all duration-300';

    const isError = type === 'error';
    const palette = isError
        ? ['bg-red-900/90', 'border', 'border-red-700', 'text-red-200']
        : ['bg-green-900/90', 'border', 'border-green-700', 'text-green-200'];
    toast.classList.add(...palette);
    const icon = isError
        ? '<i class="fas fa-circle-exclamation text-red-400 mr-2"></i>'
        : '<i class="fas fa-circle-check text-green-400 mr-2"></i>';
    toast.innerHTML = icon + message;

    document.body.appendChild(toast);

    // Auto-dismiss: remove whichever element currently holds the id
    // (a newer toast will already have removed this one).
    setTimeout(function() {
        const current = document.getElementById('sound-notification');
        if (current) current.remove();
    }, 5000);
}
// Collect form values into the JSON payload for v1/sound-generation.
// Simple mode sends { model_id, text?, instrumental, vocal_language? };
// advanced mode deliberately omits `text` (its presence switches the
// backend into simple mode) and sends caption/lyrics plus tuning fields.
function buildRequestBody() {
    const val = function(id) { return document.getElementById(id).value.trim(); };
    const checked = function(id) { return document.getElementById(id).checked; };

    const body = { model_id: document.getElementById('sound-model').value };

    if (checked('mode-simple')) {
        const text = val('text');
        if (text) body.text = text;
        body.instrumental = checked('instrumental');
        const vocal = val('vocal_language');
        if (vocal) body.vocal_language = vocal;
        return body;
    }

    // Advanced mode: do NOT send 'text' field - it triggers simple mode in backend
    // Only send caption, lyrics, and other advanced fields
    const caption = val('caption');
    if (caption) body.caption = caption;
    const lyrics = val('lyrics');
    if (lyrics) body.lyrics = lyrics;
    body.think = checked('think');
    const bpm = val('bpm');
    if (bpm) body.bpm = parseInt(bpm, 10);
    const duration = val('duration_seconds');
    if (duration) body.duration_seconds = parseFloat(duration);
    const keyscale = val('keyscale');
    if (keyscale) body.keyscale = keyscale;
    const language = val('language');
    if (language) body.language = language;
    const timesignature = val('timesignature');
    if (timesignature) body.timesignature = timesignature;
    return body;
}
// Form submit handler: validates the form for the active mode, POSTs the
// payload built by buildRequestBody() to v1/sound-generation, and renders
// the returned audio (with a download link) into the #result area.
async function generateSound(event) {
    event.preventDefault();
    // Client-side validation differs by mode; bail out early with a toast
    // before touching the loader/button state.
    const isSimple = document.getElementById('mode-simple').checked;
    if (isSimple) {
        const text = document.getElementById('text').value.trim();
        if (!text) {
            showNotification('error', 'Please enter text (description)');
            return;
        }
    } else {
        // Advanced mode: only check caption and lyrics (text field is hidden and not used)
        const caption = document.getElementById('caption').value.trim();
        const lyrics = document.getElementById('lyrics').value.trim();
        if (!caption && !lyrics) {
            showNotification('error', 'Please enter at least caption or lyrics');
            return;
        }
    }
    // Busy state: show spinner, disable submit, and put a placeholder in
    // the result area; the finally block below undoes the first two.
    const loader = document.getElementById('loader');
    const resultDiv = document.getElementById('result');
    const generateBtn = document.getElementById('generate-btn');
    loader.style.display = 'block';
    generateBtn.disabled = true;
    resultDiv.innerHTML = '<p class="text-[var(--color-text-secondary)] italic">Generating sound...</p>';
    try {
        const response = await fetch('v1/sound-generation', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(buildRequestBody()),
        });
        if (!response.ok) {
            // Prefer the server-supplied message when the error body is JSON.
            let errMsg = 'Request failed';
            const ct = response.headers.get('content-type');
            if (ct && ct.indexOf('application/json') !== -1) {
                const json = await response.json();
                if (json && json.error && json.error.message) errMsg = json.error.message;
            }
            resultDiv.innerHTML = '<div class="text-red-400 flex items-center gap-2"><i class="fas fa-circle-exclamation"></i> ' + errMsg + '</div>';
            showNotification('error', 'Failed to generate sound');
            return;
        }
        // Success: expose the binary response through an object URL so the
        // <audio> element and the download link can share it.
        // NOTE(review): the object URL is never revoked — presumably acceptable
        // for a page-lifetime UI, but worth confirming.
        const blob = await response.blob();
        const audioUrl = window.URL.createObjectURL(blob);
        const wrap = document.createElement('div');
        wrap.className = 'flex flex-col items-center gap-4 w-full';
        const audio = document.createElement('audio');
        audio.controls = true;
        audio.src = audioUrl;
        audio.className = 'w-full max-w-md';
        const actions = document.createElement('div');
        actions.className = 'flex flex-wrap justify-center gap-3';
        const downloadLink = document.createElement('a');
        downloadLink.href = audioUrl;
        downloadLink.download = 'sound-' + new Date().toISOString().slice(0, 10) + '.wav';
        downloadLink.className = 'inline-flex items-center gap-2 px-4 py-2 rounded-lg bg-[var(--color-primary)] text-[var(--color-bg-primary)] hover:opacity-90 transition';
        downloadLink.innerHTML = '<i class="fas fa-download"></i> Download';
        actions.appendChild(downloadLink);
        wrap.appendChild(audio);
        wrap.appendChild(actions);
        resultDiv.innerHTML = '';
        resultDiv.appendChild(wrap);
        // Autoplay may be blocked by the browser; ignore the rejection.
        audio.play().catch(function() {});
        showNotification('success', 'Sound generated successfully');
    } catch (err) {
        // Network/parse failure path (fetch rejected or JSON decoding threw).
        console.error('Sound generation error:', err);
        resultDiv.innerHTML = '<div class="text-red-400 flex items-center gap-2"><i class="fas fa-circle-exclamation"></i> Network error</div>';
        showNotification('error', 'Network error');
    } finally {
        // Always restore the idle UI state, success or failure.
        loader.style.display = 'none';
        generateBtn.disabled = false;
    }
}
// Wire up the page once the DOM is ready: attach the submit handler to
// the sound form, then make sure the spinner starts hidden.
document.addEventListener('DOMContentLoaded', () => {
    const form = document.getElementById('sound-form');
    form.addEventListener('submit', generateSound);
    const loader = document.getElementById('loader');
    loader.style.display = 'none';
});

View File

@@ -324,11 +324,6 @@
<i class="fas fa-microphone text-[8px] mr-1"></i>TTS
</a>
{{ end }}
{{ if eq . "FLAG_SOUND_GENERATION" }}
<a href="sound/{{$backendCfg.Name}}" class="inline-flex items-center px-1.5 py-0.5 rounded text-[10px] font-medium bg-[var(--color-primary)]/10 text-[var(--color-primary)] hover:bg-[var(--color-primary)]/20 transition-colors" title="Sound">
<i class="fas fa-waveform-lines text-[8px] mr-1"></i>Sound
</a>
{{ end }}
{{ end }}
</div>
</td>

View File

@@ -34,9 +34,6 @@
<a href="tts/" class="text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] px-2 py-2 rounded-lg transition duration-300 ease-in-out hover:bg-[var(--color-bg-secondary)] flex items-center group text-sm">
<i class="fa-solid fa-music text-[var(--color-primary)] mr-1.5 text-sm group-hover:scale-110 transition-transform"></i>TTS
</a>
<a href="sound/" class="text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] px-2 py-2 rounded-lg transition duration-300 ease-in-out hover:bg-[var(--color-bg-secondary)] flex items-center group text-sm">
<i class="fas fa-volume-high text-[var(--color-primary)] mr-1.5 text-sm group-hover:scale-110 transition-transform"></i>Sound
</a>
<a href="talk/" class="text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] px-2 py-2 rounded-lg transition duration-300 ease-in-out hover:bg-[var(--color-bg-secondary)] flex items-center group text-sm">
<i class="fa-solid fa-phone text-[var(--color-primary)] mr-1.5 text-sm group-hover:scale-110 transition-transform"></i>Talk
</a>
@@ -100,9 +97,6 @@
<a href="tts/" class="block text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] hover:bg-[var(--color-bg-secondary)] px-3 py-2 rounded-lg transition duration-300 ease-in-out flex items-center text-sm">
<i class="fa-solid fa-music text-[var(--color-primary)] mr-3 w-5 text-center text-sm"></i>TTS
</a>
<a href="sound/" class="block text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] hover:bg-[var(--color-bg-secondary)] px-3 py-2 rounded-lg transition duration-300 ease-in-out flex items-center text-sm">
<i class="fas fa-volume-high text-[var(--color-primary)] mr-3 w-5 text-center text-sm"></i>Sound
</a>
<a href="talk/" class="block text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] hover:bg-[var(--color-bg-secondary)] px-3 py-2 rounded-lg transition duration-300 ease-in-out flex items-center text-sm">
<i class="fa-solid fa-phone text-[var(--color-primary)] mr-3 w-5 text-center text-sm"></i>Talk
</a>

View File

@@ -1,187 +0,0 @@
<!DOCTYPE html>
<html lang="en">
{{template "views/partials/head" .}}
<script defer src="static/sound.js"></script>
<body class="bg-[var(--color-bg-primary)] text-[var(--color-text-primary)]">
<div class="flex flex-col min-h-screen">
{{template "views/partials/navbar" .}}
<div class="container mx-auto px-4 py-8 flex-grow">
<!-- Hero Section -->
<div class="hero-section">
<div class="hero-content">
<h1 class="hero-title">
<i class="fas fa-music mr-2"></i>Sound Generation {{ if .Model }} with {{.Model}} {{ end }}
</h1>
<p class="hero-subtitle">Generate music and audio from descriptions or structured prompts</p>
</div>
</div>
<!-- Sound Generation Interface -->
<div class="max-w-3xl mx-auto">
<div class="card overflow-hidden">
<!-- Header with Model Selection -->
<div class="border-b border-[var(--color-border)] p-5">
<div class="flex flex-col sm:flex-row items-center justify-between gap-4">
<div class="flex items-center" x-data="{ link : '{{ if .Model }}sound/{{.Model}}{{ end }}' }">
<label for="model-select" class="mr-3 text-[var(--color-text-secondary)] font-medium">
<i class="fas fa-microphone-lines text-[var(--color-primary)] mr-2"></i>Model:
</label>
<select
id="model-select"
x-model="link"
@change="window.location = link"
class="input p-2.5"
>
<option value="" disabled class="text-[var(--color-text-secondary)]">Select a model</option>
{{ $model:=.Model}}
{{ range .ModelsConfig }}
{{ $cfg := . }}
{{ range .KnownUsecaseStrings }}
{{ if eq . "FLAG_SOUND_GENERATION" }}
<option value="sound/{{$cfg.Name}}" {{ if eq $cfg.Name $model }} selected {{end}} class="bg-[var(--color-bg-primary)] text-[var(--color-text-primary)]">{{$cfg.Name}}</option>
{{ end }}
{{ end }}
{{ end }}
{{ range .ModelsWithoutConfig }}
<option value="sound/{{.}}" {{ if eq . $model }} selected {{ end }} class="bg-[var(--color-bg-primary)] text-[var(--color-text-primary)]">{{.}}</option>
{{end}}
</select>
</div>
</div>
</div>
<!-- Form -->
<div class="p-6">
<div class="bg-[var(--color-primary)]/10 border border-[var(--color-primary-border)]/20 rounded-lg p-4 mb-6">
<div class="flex items-start">
<i class="fas fa-info-circle text-[var(--color-primary)] mt-1 mr-3 flex-shrink-0"></i>
<p class="text-[var(--color-text-secondary)]">
Use <strong>Simple mode</strong> (text as description + vocal language) or switch to <strong>Advanced mode</strong> for caption, lyrics, BPM, key, and more.
</p>
</div>
</div>
<input id="sound-model" type="hidden" value="{{.Model}}">
<form id="sound-form" class="space-y-4">
<!-- Mode toggle -->
<div class="flex items-center gap-4">
<span class="text-sm font-medium text-[var(--color-text-secondary)]">Mode:</span>
<label class="inline-flex items-center cursor-pointer">
<input type="radio" name="mode" value="simple" id="mode-simple" class="mr-2" checked>
<span class="text-sm">Simple</span>
</label>
<label class="inline-flex items-center cursor-pointer">
<input type="radio" name="mode" value="advanced" id="mode-advanced" class="mr-2">
<span class="text-sm">Advanced</span>
</label>
</div>
<!-- Text: used as description in simple mode, style in advanced mode -->
<div class="space-y-4">
<div id="text-field-container">
<label for="text" class="block text-sm font-medium text-[var(--color-text-secondary)] mb-1.5">
<i class="fas fa-align-left text-[var(--color-primary)] mr-1.5"></i>Text <span id="text-label-desc" class="text-[var(--color-text-secondary)]">(description in simple mode)</span>
</label>
<textarea
id="text"
name="text"
placeholder="e.g. A soft Bengali love song for a quiet evening"
rows="3"
class="input w-full p-3"
></textarea>
</div>
</div>
<!-- Simple mode -->
<div id="simple-fields" class="space-y-4">
<div class="flex flex-wrap items-center gap-4">
<label class="inline-flex items-center cursor-pointer">
<input type="checkbox" id="instrumental" name="instrumental" class="rounded mr-2">
<span class="text-sm text-[var(--color-text-secondary)]">Instrumental only</span>
</label>
<div class="flex-1 min-w-[140px]">
<label for="vocal_language" class="block text-sm font-medium text-[var(--color-text-secondary)] mb-1">Vocal language</label>
<input type="text" id="vocal_language" name="vocal_language" placeholder="e.g. bn, en" class="input w-full p-2">
</div>
</div>
</div>
<!-- Advanced mode -->
<div id="advanced-fields" class="hidden space-y-4">
<div>
<label for="caption" class="block text-sm font-medium text-[var(--color-text-secondary)] mb-1.5">Caption</label>
<textarea id="caption" name="caption" placeholder="e.g. A funky Japanese disco track" rows="2" class="input w-full p-3"></textarea>
</div>
<div>
<label for="lyrics" class="block text-sm font-medium text-[var(--color-text-secondary)] mb-1.5">Lyrics</label>
<textarea id="lyrics" name="lyrics" placeholder="[Verse 1]&#10;..." rows="4" class="input w-full p-3 font-mono text-sm"></textarea>
</div>
<div class="flex flex-wrap items-center gap-4">
<label class="inline-flex items-center cursor-pointer">
<input type="checkbox" id="think" name="think" class="rounded mr-2">
<span class="text-sm text-[var(--color-text-secondary)]">Think (reasoning)</span>
</label>
<div>
<label for="bpm" class="block text-xs text-[var(--color-text-secondary)] mb-0.5">BPM</label>
<input type="number" id="bpm" name="bpm" min="1" max="300" placeholder="120" class="input p-2 w-24">
</div>
<div>
<label for="duration_seconds" class="block text-xs text-[var(--color-text-secondary)] mb-0.5">Duration (s)</label>
<input type="number" id="duration_seconds" name="duration_seconds" min="1" placeholder="225" class="input p-2 w-24">
</div>
<div>
<label for="keyscale" class="block text-xs text-[var(--color-text-secondary)] mb-0.5">Key</label>
<input type="text" id="keyscale" name="keyscale" placeholder="e.g. Ab major" class="input p-2 w-28">
</div>
<div>
<label for="language" class="block text-xs text-[var(--color-text-secondary)] mb-0.5">Language</label>
<input type="text" id="language" name="language" placeholder="e.g. ja" class="input p-2 w-20">
</div>
<div>
<label for="timesignature" class="block text-xs text-[var(--color-text-secondary)] mb-0.5">Time sig.</label>
<input type="text" id="timesignature" name="timesignature" placeholder="4" class="input p-2 w-16">
</div>
</div>
</div>
<div class="pt-4">
<button type="submit" id="generate-btn" class="btn-primary w-full py-3 flex items-center justify-center gap-2">
<i class="fas fa-music"></i>
<span>Generate sound</span>
</button>
</div>
</form>
<!-- Loading indicator -->
<div class="flex justify-center my-6">
<div id="loader" class="animate-spin rounded-full h-10 w-10 border-t-2 border-b-2 border-[var(--color-primary)]" style="display: none;"></div>
</div>
<!-- Results Area -->
<div class="bg-[var(--color-bg-secondary)]/50 border border-[var(--color-border)] rounded-lg p-4 min-h-[100px] flex items-center justify-center">
<div id="result" class="w-full"></div>
</div>
</div>
</div>
</div>
</div>
{{template "views/partials/footer" .}}
</div>
<script>
// Toggle between the simple and advanced field groups; the free-text
// box is only visible in simple mode.
const setMode = function(simple) {
    document.getElementById('simple-fields').classList.toggle('hidden', !simple);
    document.getElementById('advanced-fields').classList.toggle('hidden', simple);
    document.getElementById('text-field-container').classList.toggle('hidden', !simple);
};
document.getElementById('mode-simple').addEventListener('change', function() { setMode(true); });
document.getElementById('mode-advanced').addEventListener('change', function() { setMode(false); });
</script>
</body>
</html>

View File

@@ -12,17 +12,6 @@ type ElevenLabsSoundGenerationRequest struct {
Duration *float32 `json:"duration_seconds,omitempty" yaml:"duration_seconds,omitempty"`
Temperature *float32 `json:"prompt_influence,omitempty" yaml:"prompt_influence,omitempty"`
DoSample *bool `json:"do_sample,omitempty" yaml:"do_sample,omitempty"`
// Advanced mode
Think *bool `json:"think,omitempty" yaml:"think,omitempty"`
Caption string `json:"caption,omitempty" yaml:"caption,omitempty"`
Lyrics string `json:"lyrics,omitempty" yaml:"lyrics,omitempty"`
BPM *int `json:"bpm,omitempty" yaml:"bpm,omitempty"`
Keyscale string `json:"keyscale,omitempty" yaml:"keyscale,omitempty"`
Language string `json:"language,omitempty" yaml:"language,omitempty"`
VocalLanguage string `json:"vocal_language,omitempty" yaml:"vocal_language,omitempty"`
Timesignature string `json:"timesignature,omitempty" yaml:"timesignature,omitempty"`
// Simple mode: use text as description; optional instrumental / vocal_language
Instrumental *bool `json:"instrumental,omitempty" yaml:"instrumental,omitempty"`
}
func (elttsr *ElevenLabsTTSRequest) ModelName(s *string) string {

View File

@@ -7,12 +7,9 @@ url = "/features/audio-to-text/"
Audio to text models are models that can generate text from an audio file.
The transcription endpoint allows you to convert audio files to text. The endpoint supports multiple backends:
- **[whisper.cpp](https://github.com/ggerganov/whisper.cpp)**: A C++ library for audio transcription (default)
- **moonshine**: Ultra-fast transcription engine optimized for low-end devices
- **faster-whisper**: Fast Whisper implementation with CTranslate2
The endpoint input supports all the audio formats supported by `ffmpeg`.
The transcription endpoint allows you to convert audio files to text. The endpoint is based
on [whisper.cpp](https://github.com/ggerganov/whisper.cpp), a C++ library for audio transcription. The endpoint input
supports all the audio formats supported by `ffmpeg`.
## Usage

View File

@@ -126,64 +126,6 @@ curl --request POST \
Future versions of LocalAI will expose additional control over audio generation beyond the text prompt.
### ACE-Step
[ACE-Step 1.5](https://github.com/ACE-Step/ACE-Step-1.5) is a music generation model that can create music from text descriptions, lyrics, or audio samples. It supports both simple text-to-music and advanced music generation with metadata like BPM, key scale, and time signature.
#### Setup
Install the `ace-step-turbo` model from the Model gallery or run `local-ai run models install ace-step-turbo`.
#### Usage
ACE-Step supports two modes: **Simple mode** (text description + vocal language) and **Advanced mode** (caption, lyrics, BPM, key, and more).
**Simple mode:**
```bash
curl http://localhost:8080/v1/audio/speech -H "Content-Type: application/json" -d '{
"model": "ace-step-turbo",
"input": "A soft Bengali love song for a quiet evening",
"vocal_language": "bn"
}' --output music.flac
```
**Advanced mode** (using the `/sound` endpoint):
```bash
curl http://localhost:8080/sound -H "Content-Type: application/json" -d '{
"model": "ace-step-turbo",
"caption": "A funky Japanese disco track",
"lyrics": "[Verse 1]\n...",
"bpm": 120,
"keyscale": "Ab major",
"language": "ja",
"duration_seconds": 225
}' --output music.flac
```
#### Configuration
You can configure ACE-Step models with various options:
```yaml
name: ace-step-turbo
backend: ace-step
parameters:
model: acestep-v15-turbo
known_usecases:
- sound_generation
- tts
options:
- "device:auto"
- "use_flash_attention:true"
- "init_lm:true" # Enable LLM for enhanced generation
- "lm_model_path:acestep-5Hz-lm-0.6B" # or acestep-5Hz-lm-4B
- "lm_backend:pt" # or vllm
- "temperature:0.85"
- "top_p:0.9"
- "inference_steps:8"
- "guidance_scale:7.0"
```
### VibeVoice
[VibeVoice-Realtime](https://github.com/microsoft/VibeVoice) is a real-time text-to-speech model that generates natural-sounding speech with voice cloning capabilities.

View File

@@ -150,7 +150,7 @@ services:
ports:
- 8080:8080
environment:
- DEBUG=false
- DEBUG=true
volumes:
- ./models:/models:cached
# For NVIDIA GPUs, uncomment:

View File

@@ -1,69 +1,4 @@
---
- name: "ace-step-turbo"
license: mit
tags:
- music
- audio
- music-generation
- tts
- sound-generation
- ace-step
- ace-step-1.5
- ace-step-1.5-turbo
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/ACE-Step/Ace-Step1.5
description: |
ACE-Step 1.5 Turbo is a music generation model that can create music from text descriptions,
lyrics, or audio samples. Supports both simple text-to-music and advanced music generation
with metadata like BPM, key scale, and time signature.
overrides:
name: ace-step-turbo
backend: ace-step
parameters:
model: acestep-v15-turbo
known_usecases:
- sound_generation
- tts
options:
- "device:auto"
- "use_flash_attention:true"
- "offload_to_cpu:false"
- "offload_dit_to_cpu:false"
- "init_lm:true"
- "lm_model_path:acestep-5Hz-lm-0.6B" # or acestep-5Hz-lm-4B
- "lm_backend:pt"
- "temperature:0.85"
- "top_p:0.9"
- "lm_cfg_scale:2.0"
- "inference_steps:8"
- "guidance_scale:7.0"
- "batch_size:1"
- name: "qwen3-coder-next-mxfp4_moe"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/noctrex/Qwen3-Coder-Next-MXFP4_MOE-GGUF
description: |
The model is a quantized version of **Qwen/Qwen3-Coder-Next** (base model) using the **MXFP4** quantization scheme. It is optimized for efficiency while retaining performance, suitable for deployment in applications requiring lightweight inference. The quantized version is tailored for specific tasks, with parameters like temperature=1.0 and top_p=0.95 recommended for generation.
overrides:
parameters:
model: llama-cpp/models/Qwen3-Coder-Next-MXFP4_MOE.gguf
name: Qwen3-Coder-Next-MXFP4_MOE-GGUF
backend: llama-cpp
template:
use_tokenizer_template: true
known_usecases:
- chat
function:
grammar:
disable: true
description: Imported from https://huggingface.co/noctrex/Qwen3-Coder-Next-MXFP4_MOE-GGUF
options:
- use_jinja:true
files:
- filename: llama-cpp/models/Qwen3-Coder-Next-MXFP4_MOE.gguf
sha256: fa356439e87010163778b7eab5f2b07e0e5b7f2cd9aac78b069139f5ae069414
uri: https://huggingface.co/noctrex/Qwen3-Coder-Next-MXFP4_MOE-GGUF/resolve/main/Qwen3-Coder-Next-MXFP4_MOE.gguf
- name: "deepseek-ai.deepseek-v3.2"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
@@ -1907,7 +1842,7 @@
- gpu
- text-to-speech
overrides:
backend: "outetts"
backend: "transformers"
name: "outetts"
description: "OuteTTS is a 1.6B parameter text to speech model created by OuteAI."
parameters:

42
go.mod
View File

@@ -9,7 +9,7 @@ require (
fyne.io/fyne/v2 v2.7.2
github.com/Masterminds/sprig/v3 v3.3.0
github.com/alecthomas/kong v1.13.0
github.com/anthropics/anthropic-sdk-go v1.20.0
github.com/anthropics/anthropic-sdk-go v1.19.0
github.com/charmbracelet/glamour v0.10.0
github.com/containerd/containerd v1.7.30
github.com/ebitengine/purego v0.9.1
@@ -37,9 +37,8 @@ require (
github.com/mudler/go-processmanager v0.1.0
github.com/mudler/memory v0.0.0-20251216220809-d1256471a6c2
github.com/mudler/xlog v0.0.5
github.com/onsi/ginkgo/v2 v2.28.0
github.com/onsi/gomega v1.39.1
github.com/openai/openai-go/v3 v3.17.0
github.com/onsi/ginkgo/v2 v2.27.5
github.com/onsi/gomega v1.39.0
github.com/otiai10/copy v1.14.1
github.com/otiai10/openaigo v1.7.0
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
@@ -55,10 +54,10 @@ require (
github.com/swaggo/swag v1.16.6
github.com/testcontainers/testcontainers-go v0.40.0
github.com/tmc/langchaingo v0.1.14
go.opentelemetry.io/otel v1.40.0
go.opentelemetry.io/otel/exporters/prometheus v0.62.0
go.opentelemetry.io/otel/metric v1.40.0
go.opentelemetry.io/otel/sdk/metric v1.40.0
go.opentelemetry.io/otel v1.39.0
go.opentelemetry.io/otel/exporters/prometheus v0.61.0
go.opentelemetry.io/otel/metric v1.39.0
go.opentelemetry.io/otel/sdk/metric v1.39.0
google.golang.org/grpc v1.78.0
gopkg.in/yaml.v2 v2.4.0
gopkg.in/yaml.v3 v3.0.1
@@ -68,13 +67,14 @@ require (
require (
github.com/ghodss/yaml v1.0.0 // indirect
github.com/labstack/gommon v0.4.2 // indirect
github.com/openai/openai-go/v3 v3.17.0 // indirect
github.com/swaggo/files/v2 v2.0.2 // indirect
github.com/tidwall/gjson v1.18.0 // indirect
github.com/tidwall/match v1.2.0 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
github.com/tidwall/sjson v1.2.5 // indirect
github.com/valyala/fasttemplate v1.2.2 // indirect
google.golang.org/protobuf v1.36.11 // indirect
google.golang.org/protobuf v1.36.10 // indirect
)
require (
@@ -155,9 +155,9 @@ require (
go.yaml.in/yaml/v2 v2.4.3
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/image v0.25.0 // indirect
golang.org/x/net v0.49.0 // indirect; indirect (for websocket)
golang.org/x/oauth2 v0.34.0 // indirect
golang.org/x/telemetry v0.0.0-20260109210033-bd525da824e2 // indirect
golang.org/x/net v0.48.0 // indirect; indirect (for websocket)
golang.org/x/oauth2 v0.33.0 // indirect
golang.org/x/telemetry v0.0.0-20251111182119-bc8e575c7b54 // indirect
golang.org/x/time v0.14.0 // indirect
)
@@ -211,7 +211,7 @@ require (
github.com/google/btree v1.1.3 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/gopacket v1.1.19 // indirect
github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 // indirect
github.com/google/pprof v0.0.0-20250630185457-6e76a2b096b5 // indirect
github.com/gorilla/css v1.0.1 // indirect
github.com/gorilla/websocket v1.5.3
github.com/hashicorp/golang-lru v1.0.2 // indirect
@@ -289,7 +289,7 @@ require (
github.com/polydawn/refmt v0.89.0 // indirect
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.5 // indirect
github.com/prometheus/common v0.67.4 // indirect
github.com/prometheus/procfs v0.19.2 // indirect
github.com/quic-go/qpack v0.5.1 // indirect
github.com/quic-go/quic-go v0.54.1 // indirect
@@ -315,20 +315,20 @@ require (
github.com/yuin/goldmark-emoji v1.0.5 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/otel/sdk v1.40.0 // indirect
go.opentelemetry.io/otel/trace v1.40.0 // indirect
go.opentelemetry.io/otel/sdk v1.39.0 // indirect
go.opentelemetry.io/otel/trace v1.39.0 // indirect
go.uber.org/dig v1.19.0 // indirect
go.uber.org/fx v1.24.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.27.0 // indirect
golang.org/x/crypto v0.47.0 // indirect
golang.org/x/crypto v0.46.0 // indirect
golang.org/x/exp v0.0.0-20250606033433-dcc06ee1d476 // indirect
golang.org/x/mod v0.32.0 // indirect
golang.org/x/mod v0.30.0 // indirect
golang.org/x/sync v0.19.0 // indirect
golang.org/x/sys v0.40.0 // indirect
golang.org/x/term v0.39.0 // indirect
golang.org/x/text v0.33.0 // indirect
golang.org/x/tools v0.41.0 // indirect
golang.org/x/term v0.38.0 // indirect
golang.org/x/text v0.32.0 // indirect
golang.org/x/tools v0.39.0 // indirect
golang.zx2c4.com/wintun v0.0.0-20230126152724-0fa3db229ce2 // indirect
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb // indirect
golang.zx2c4.com/wireguard/windows v0.5.3 // indirect

81
go.sum
View File

@@ -44,8 +44,8 @@ github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c=
github.com/anthropics/anthropic-sdk-go v1.20.0 h1:KE6gQiAT1aBHMh3Dmp1WgqnyZZLJNo2oX3ka004oDLE=
github.com/anthropics/anthropic-sdk-go v1.20.0/go.mod h1:WTz31rIUHUHqai2UslPpw5CwXrQP3geYBioRV4WOLvE=
github.com/anthropics/anthropic-sdk-go v1.19.0 h1:mO6E+ffSzLRvR/YUH9KJC0uGw0uV8GjISIuzem//3KE=
github.com/anthropics/anthropic-sdk-go v1.19.0/go.mod h1:WTz31rIUHUHqai2UslPpw5CwXrQP3geYBioRV4WOLvE=
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8=
@@ -277,8 +277,8 @@ github.com/google/jsonschema-go v0.3.0 h1:6AH2TxVNtk3IlvkkhjrtbUc4S8AvO0Xii0DxIy
github.com/google/jsonschema-go v0.3.0/go.mod h1:r5quNTdLOYEz95Ru18zA0ydNbBuYoo9tgaYcxEYhJVE=
github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs=
github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc=
github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83 h1:z2ogiKUYzX5Is6zr/vP9vJGqPwcdqsWjOt+V8J7+bTc=
github.com/google/pprof v0.0.0-20260115054156-294ebfa9ad83/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI=
github.com/google/pprof v0.0.0-20250630185457-6e76a2b096b5 h1:xhMrHhTJ6zxu3gA4enFM9MLn9AY7613teCdFnlUVbSQ=
github.com/google/pprof v0.0.0-20250630185457-6e76a2b096b5/go.mod h1:5hDyRhoBCxViHszMt12TnOpEI4VVi+U8Gm9iphldiMA=
github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
@@ -561,10 +561,10 @@ github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE=
github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU=
github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE=
github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU=
github.com/onsi/ginkgo/v2 v2.28.0 h1:Rrf+lVLmtlBIKv6KrIGJCjyY8N36vDVcutbGJkyqjJc=
github.com/onsi/ginkgo/v2 v2.28.0/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28=
github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg=
github.com/onsi/ginkgo/v2 v2.27.5 h1:ZeVgZMx2PDMdJm/+w5fE/OyG6ILo1Y3e+QX4zSR0zTE=
github.com/onsi/ginkgo/v2 v2.27.5/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
github.com/onsi/gomega v1.39.0 h1:y2ROC3hKFmQZJNFeGAMeHZKkjBL65mIZcvrLQBF9k6Q=
github.com/onsi/gomega v1.39.0/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4=
github.com/openai/openai-go/v3 v3.17.0 h1:CfTkmQoItolSyW+bHOUF190KuX5+1Zv6MC0Gb4wAwy8=
github.com/openai/openai-go/v3 v3.17.0/go.mod h1:cdufnVK14cWcT9qA1rRtrXx4FTRsgbDPW7Ia7SS5cZo=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
@@ -652,8 +652,8 @@ github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4=
github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw=
github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc=
github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI=
github.com/prometheus/otlptranslator v1.0.0 h1:s0LJW/iN9dkIH+EnhiD3BlkkP5QVIUVEoIwkU+A6qos=
github.com/prometheus/otlptranslator v1.0.0/go.mod h1:vRYWnXvI6aWGpsdY/mOT/cbeVRBlPWtBNDb7kGR3uKM=
github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
@@ -769,6 +769,7 @@ github.com/testcontainers/testcontainers-go v0.40.0/go.mod h1:FSXV5KQtX2HAMlm7U3
github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/match v1.2.0 h1:0pt8FlkOwjN2fPt4bIl4BoNxb98gGHN2ObFEDkrfZnM=
github.com/tidwall/match v1.2.0/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
@@ -833,22 +834,22 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q=
go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms=
go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g=
go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48=
go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0 h1:wpMfgF8E1rkrT1Z6meFh1NDtownE9Ii3n3X2GJYjsaU=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.33.0/go.mod h1:wAy0T/dUbs468uOlkT31xjvqQgEVXv58BRFWEgn5v/0=
go.opentelemetry.io/otel/exporters/prometheus v0.62.0 h1:krvC4JMfIOVdEuNPTtQ0ZjCiXrybhv+uOHMfHRmnvVo=
go.opentelemetry.io/otel/exporters/prometheus v0.62.0/go.mod h1:fgOE6FM/swEnsVQCqCnbOfRV4tOnWPg7bVeo4izBuhQ=
go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g=
go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc=
go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8=
go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE=
go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw=
go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg=
go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw=
go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA=
go.opentelemetry.io/otel/exporters/prometheus v0.61.0 h1:cCyZS4dr67d30uDyh8etKM2QyDsQ4zC9ds3bdbrVoD0=
go.opentelemetry.io/otel/exporters/prometheus v0.61.0/go.mod h1:iivMuj3xpR2DkUrUya3TPS/Z9h3dz7h01GxU+fQBRNg=
go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0=
go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs=
go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18=
go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE=
go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8=
go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew=
go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI=
go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA=
go.opentelemetry.io/proto/otlp v1.8.0 h1:fRAZQDcAFHySxpJ1TwlA1cJ4tvcrw7nXl9xWWC8N5CE=
go.opentelemetry.io/proto/otlp v1.8.0/go.mod h1:tIeYOeNBU4cvmPqpaji1P+KbB4Oloai8wN4rWzRrFF0=
go.uber.org/atomic v1.6.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ=
@@ -888,8 +889,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y
golang.org/x/crypto v0.8.0/go.mod h1:mRqEX+O9/h5TFCrQhkgjo2yKi0yYA+9ecGkdQoHrywE=
golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw=
golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg=
golang.org/x/crypto v0.47.0 h1:V6e3FRj+n4dbpw86FJ8Fv7XVOql7TEwpHapKoMJ/GO8=
golang.org/x/crypto v0.47.0/go.mod h1:ff3Y9VzzKbwSSEzWqJsJVBnWmRwRSHt/6Op5n9bQc4A=
golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU=
golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20250606033433-dcc06ee1d476 h1:bsqhLWFR6G6xiQcb+JoGqdKdRU6WzPWmK8E0jxTjzo4=
golang.org/x/exp v0.0.0-20250606033433-dcc06ee1d476/go.mod h1:3//PLf8L/X+8b4vuAfHzxeRUl04Adcb341+IGKfnqS8=
@@ -907,8 +908,8 @@ golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c=
golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU=
golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk=
golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc=
golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
@@ -932,14 +933,14 @@ golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.14.0/go.mod h1:PpSgVXXLK0OxS0F31C1/tv6XNguvCrnXIDrFMspZIUI=
golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY=
golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o=
golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8=
golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU=
golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20181017192945-9dcd33a902f4/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20181203162652-d668ce993890/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw=
golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo=
golang.org/x/oauth2 v0.33.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/perf v0.0.0-20180704124530-6e6d33e29852/go.mod h1:JLpeXjPJfIyPr5TlbXLkXWLhP8nz10XfvxElABhCtcw=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -983,8 +984,8 @@ golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ=
golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
golang.org/x/telemetry v0.0.0-20260109210033-bd525da824e2 h1:O1cMQHRfwNpDfDJerqRoE2oD+AFlyid87D40L/OkkJo=
golang.org/x/telemetry v0.0.0-20260109210033-bd525da824e2/go.mod h1:b7fPSJ0pKZ3ccUh8gnTONJxhn3c/PS6tyzQvyqw4iA8=
golang.org/x/telemetry v0.0.0-20251111182119-bc8e575c7b54 h1:E2/AqCUMZGgd73TQkxUMcMla25GB9i/5HOdLr+uH7Vo=
golang.org/x/telemetry v0.0.0-20251111182119-bc8e575c7b54/go.mod h1:hKdjCMrbv9skySur+Nek8Hd0uJ0GuxJIoIX2payrIdQ=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
@@ -992,8 +993,8 @@ golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.11.0/go.mod h1:zC9APTIj3jG3FdV/Ons+XE1riIZXG4aZ4GTHiPZJPIU=
golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY=
golang.org/x/term v0.39.0 h1:RclSuaJf32jOqZz74CkPA9qFuVTX7vhLlpfj/IGWlqY=
golang.org/x/term v0.39.0/go.mod h1:yxzUCTP/U+FzoxfdKmLaA0RV1WgE0VY7hXBwKtY/4ww=
golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q=
golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
@@ -1003,8 +1004,8 @@ golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE=
golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8=
golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU=
golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY=
golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI=
@@ -1026,8 +1027,8 @@ golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roY
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc=
golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg=
golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ=
golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
@@ -1078,8 +1079,8 @@ google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2
google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU=
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE=
google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=

View File

@@ -2123,30 +2123,12 @@ const docTemplate = `{
"schema.ElevenLabsSoundGenerationRequest": {
"type": "object",
"properties": {
"bpm": {
"type": "integer"
},
"caption": {
"type": "string"
},
"do_sample": {
"type": "boolean"
},
"duration_seconds": {
"type": "number"
},
"instrumental": {
"type": "boolean"
},
"keyscale": {
"type": "string"
},
"language": {
"type": "string"
},
"lyrics": {
"type": "string"
},
"model_id": {
"type": "string"
},
@@ -2155,15 +2137,6 @@ const docTemplate = `{
},
"text": {
"type": "string"
},
"think": {
"type": "boolean"
},
"timesignature": {
"type": "string"
},
"vocal_language": {
"type": "string"
}
}
},

View File

@@ -2116,30 +2116,12 @@
"schema.ElevenLabsSoundGenerationRequest": {
"type": "object",
"properties": {
"bpm": {
"type": "integer"
},
"caption": {
"type": "string"
},
"do_sample": {
"type": "boolean"
},
"duration_seconds": {
"type": "number"
},
"instrumental": {
"type": "boolean"
},
"keyscale": {
"type": "string"
},
"language": {
"type": "string"
},
"lyrics": {
"type": "string"
},
"model_id": {
"type": "string"
},
@@ -2148,15 +2130,6 @@
},
"text": {
"type": "string"
},
"think": {
"type": "boolean"
},
"timesignature": {
"type": "string"
},
"vocal_language": {
"type": "string"
}
}
},

View File

@@ -403,34 +403,16 @@ definitions:
type: object
schema.ElevenLabsSoundGenerationRequest:
properties:
bpm:
type: integer
caption:
type: string
do_sample:
type: boolean
duration_seconds:
type: number
instrumental:
type: boolean
keyscale:
type: string
language:
type: string
lyrics:
type: string
model_id:
type: string
prompt_influence:
type: number
text:
type: string
think:
type: boolean
timesignature:
type: string
vocal_language:
type: string
type: object
schema.FunctionCall:
properties:

View File

@@ -109,14 +109,11 @@ var _ = BeforeSuite(func() {
// Create application
appCtx, appCancel = context.WithCancel(context.Background())
// Create application instance (GeneratedContentDir so sound-generation/TTS can write files the handler sends)
generatedDir := filepath.Join(tmpDir, "generated")
Expect(os.MkdirAll(generatedDir, 0750)).To(Succeed())
// Create application instance
application, err := application.New(
config.WithContext(appCtx),
config.WithSystemState(systemState),
config.WithDebug(true),
config.WithGeneratedContentDir(generatedDir),
)
Expect(err).ToNot(HaveOccurred())

View File

@@ -2,14 +2,12 @@ package main
import (
"context"
"encoding/binary"
"encoding/json"
"flag"
"fmt"
"log"
"net"
"os"
"path/filepath"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/xlog"
@@ -144,64 +142,13 @@ func (m *MockBackend) TTSStream(in *pb.TTSRequest, stream pb.Backend_TTSStreamSe
}
func (m *MockBackend) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest) (*pb.Result, error) {
xlog.Debug("SoundGeneration called",
"text", in.Text,
"caption", in.GetCaption(),
"lyrics", in.GetLyrics(),
"think", in.GetThink(),
"bpm", in.GetBpm(),
"keyscale", in.GetKeyscale(),
"language", in.GetLanguage(),
"timesignature", in.GetTimesignature(),
"instrumental", in.GetInstrumental())
dst := in.GetDst()
if dst != "" {
if err := os.MkdirAll(filepath.Dir(dst), 0750); err != nil {
return &pb.Result{Message: err.Error(), Success: false}, nil
}
if err := writeMinimalWAV(dst); err != nil {
return &pb.Result{Message: err.Error(), Success: false}, nil
}
}
xlog.Debug("SoundGeneration called", "text", in.Text)
return &pb.Result{
Message: "Sound generated successfully (mocked)",
Success: true,
}, nil
}
// writeMinimalWAV writes a minimal valid WAV file (short silence) so the HTTP handler can send it.
func writeMinimalWAV(path string) error {
const sampleRate = 16000
const numChannels = 1
const bitsPerSample = 16
const numSamples = 1600 // 0.1s
dataSize := numSamples * numChannels * (bitsPerSample / 8)
const headerLen = 44
f, err := os.Create(path)
if err != nil {
return err
}
defer f.Close()
// RIFF header
_, _ = f.Write([]byte("RIFF"))
_ = binary.Write(f, binary.LittleEndian, uint32(headerLen-8+dataSize))
_, _ = f.Write([]byte("WAVE"))
// fmt chunk
_, _ = f.Write([]byte("fmt "))
_ = binary.Write(f, binary.LittleEndian, uint32(16))
_ = binary.Write(f, binary.LittleEndian, uint16(1))
_ = binary.Write(f, binary.LittleEndian, uint16(numChannels))
_ = binary.Write(f, binary.LittleEndian, uint32(sampleRate))
_ = binary.Write(f, binary.LittleEndian, uint32(sampleRate*numChannels*(bitsPerSample/8)))
_ = binary.Write(f, binary.LittleEndian, uint16(numChannels*(bitsPerSample/8)))
_ = binary.Write(f, binary.LittleEndian, uint16(bitsPerSample))
// data chunk
_, _ = f.Write([]byte("data"))
_ = binary.Write(f, binary.LittleEndian, uint32(dataSize))
_, _ = f.Write(make([]byte, dataSize))
return nil
}
func (m *MockBackend) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest) (*pb.TranscriptResult, error) {
xlog.Debug("AudioTranscription called")
return &pb.TranscriptResult{

View File

@@ -96,34 +96,6 @@ var _ = Describe("Mock Backend E2E Tests", Label("MockBackend"), func() {
})
})
Describe("Sound Generation API", func() {
It("should generate mocked sound (simple mode)", func() {
body := `{"model_id":"mock-model","text":"a soft Bengali love song for a quiet evening","instrumental":false,"vocal_language":"bn"}`
req, err := http.NewRequest("POST", apiURL+"/sound-generation", io.NopCloser(strings.NewReader(body)))
Expect(err).ToNot(HaveOccurred())
req.Header.Set("Content-Type", "application/json")
httpClient := &http.Client{Timeout: 30 * time.Second}
resp, err := httpClient.Do(req)
Expect(err).ToNot(HaveOccurred())
defer resp.Body.Close()
Expect(resp.StatusCode).To(BeNumerically("<", 500))
})
It("should generate mocked sound (advanced mode)", func() {
body := `{"model_id":"mock-model","text":"upbeat pop","caption":"A funky Japanese disco track","lyrics":"[Verse 1]\nTest lyrics","think":true,"bpm":120,"duration_seconds":225,"keyscale":"Ab major","language":"ja","timesignature":"4"}`
req, err := http.NewRequest("POST", apiURL+"/sound-generation", io.NopCloser(strings.NewReader(body)))
Expect(err).ToNot(HaveOccurred())
req.Header.Set("Content-Type", "application/json")
httpClient := &http.Client{Timeout: 30 * time.Second}
resp, err := httpClient.Do(req)
Expect(err).ToNot(HaveOccurred())
defer resp.Body.Close()
Expect(resp.StatusCode).To(BeNumerically("<", 500))
})
})
Describe("Image Generation API", func() {
It("should generate mocked image", func() {
req, err := http.NewRequest("POST", apiURL+"/images/generations", nil)