diff --git a/.agents/api-endpoints-and-auth.md b/.agents/api-endpoints-and-auth.md
index ef2857b8b..77f816849 100644
--- a/.agents/api-endpoints-and-auth.md
+++ b/.agents/api-endpoints-and-auth.md
@@ -284,7 +284,17 @@ Also bump the expected-length count in `api_instructions_test.go` and add the na
### 3. `capabilities.js` symbol (for new model-config FLAG_* flags)
-If your feature needs a new `FLAG_*` usecase flag in `core/config/model_config.go` (so users can filter gallery models by it, and so `/v1/models` surfaces it), also declare the matching symbol in `core/http/react-ui/src/utils/capabilities.js`:
+If your feature needs a new `FLAG_*` usecase flag in `core/config/model_config.go` (so users can filter gallery models by it, and so `/v1/models` surfaces it), you need to update **all** of:
+
+- `Usecase` string constant in `core/config/backend_capabilities.go`
+- `UsecaseInfoMap` entry mapping the string to its flag + gRPC method
+- `FLAG_` bitmask in `core/config/model_config.go`
+- `GetAllModelConfigUsecases()` map entry (otherwise the YAML loader silently ignores the string)
+- `ModalityGroups` membership if the flag should affect `IsMultimodal()` (e.g. realtime_audio is in both speech-input and audio-output groups, so a lone flag still reads as multimodal)
+- `GuessUsecases()` branch listing the backends that own this capability
+- `usecaseFilters` in `core/http/routes/ui_api.go` (drives the gallery filter dropdown)
+- `Models.jsx` `FILTERS` array + matching `filters.` i18n key in `core/http/react-ui/public/locales/en/models.json`
+- `core/http/react-ui/src/utils/capabilities.js`:
```js
export const CAP_MY_CAPABILITY = 'FLAG_MY_CAPABILITY'
diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 4aca4185e..44de7b62b 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -278,6 +278,19 @@ include:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
+ - build-type: 'cublas'
+ cuda-major-version: "12"
+ cuda-minor-version: "8"
+ platforms: 'linux/amd64'
+ tag-latest: 'auto'
+ tag-suffix: '-gpu-nvidia-cuda-12-liquid-audio'
+ runs-on: 'ubuntu-latest'
+ base-image: "ubuntu:24.04"
+ skip-drivers: 'false'
+ backend: "liquid-audio"
+ dockerfile: "./backend/Dockerfile.python"
+ context: "./"
+ ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
@@ -808,6 +821,19 @@ include:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
+ - build-type: 'cublas'
+ cuda-major-version: "13"
+ cuda-minor-version: "0"
+ platforms: 'linux/amd64'
+ tag-latest: 'auto'
+ tag-suffix: '-gpu-nvidia-cuda-13-liquid-audio'
+ runs-on: 'ubuntu-latest'
+ base-image: "ubuntu:24.04"
+ skip-drivers: 'false'
+ backend: "liquid-audio"
+ dockerfile: "./backend/Dockerfile.python"
+ context: "./"
+ ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -1088,6 +1114,19 @@ include:
backend: "vibevoice"
dockerfile: "./backend/Dockerfile.python"
context: "./"
+ - build-type: 'l4t'
+ cuda-major-version: "13"
+ cuda-minor-version: "0"
+ platforms: 'linux/arm64'
+ tag-latest: 'auto'
+ tag-suffix: '-nvidia-l4t-cuda-13-arm64-liquid-audio'
+ runs-on: 'ubuntu-24.04-arm'
+ base-image: "ubuntu:24.04"
+ skip-drivers: 'false'
+ ubuntu-version: '2404'
+ backend: "liquid-audio"
+ dockerfile: "./backend/Dockerfile.python"
+ context: "./"
- build-type: 'l4t'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -1729,6 +1768,19 @@ include:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
+ - build-type: 'hipblas'
+ cuda-major-version: ""
+ cuda-minor-version: ""
+ platforms: 'linux/amd64'
+ tag-latest: 'auto'
+ tag-suffix: '-gpu-rocm-hipblas-liquid-audio'
+ runs-on: 'ubuntu-latest'
+ base-image: "rocm/dev-ubuntu-24.04:7.2.1"
+ skip-drivers: 'false'
+ backend: "liquid-audio"
+ dockerfile: "./backend/Dockerfile.python"
+ context: "./"
+ ubuntu-version: '2404'
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
@@ -2177,6 +2229,19 @@ include:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
+ - build-type: 'intel'
+ cuda-major-version: ""
+ cuda-minor-version: ""
+ platforms: 'linux/amd64'
+ tag-latest: 'auto'
+ tag-suffix: '-gpu-intel-liquid-audio'
+ runs-on: 'ubuntu-latest'
+ base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+ skip-drivers: 'false'
+ backend: "liquid-audio"
+ dockerfile: "./backend/Dockerfile.python"
+ context: "./"
+ ubuntu-version: '2404'
- build-type: 'intel'
cuda-major-version: ""
cuda-minor-version: ""
@@ -3503,6 +3568,20 @@ include:
dockerfile: "./backend/Dockerfile.python"
context: "./"
ubuntu-version: '2404'
+ - build-type: ''
+ cuda-major-version: ""
+ cuda-minor-version: ""
+ platforms: 'linux/amd64'
+ platform-tag: 'amd64'
+ tag-latest: 'auto'
+ tag-suffix: '-cpu-liquid-audio'
+ runs-on: 'ubuntu-latest'
+ base-image: "ubuntu:24.04"
+ skip-drivers: 'false'
+ backend: "liquid-audio"
+ dockerfile: "./backend/Dockerfile.python"
+ context: "./"
+ ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml
index 52f33ebde..3fd836574 100644
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -28,6 +28,7 @@ jobs:
qwen-asr: ${{ steps.detect.outputs.qwen-asr }}
nemo: ${{ steps.detect.outputs.nemo }}
voxcpm: ${{ steps.detect.outputs.voxcpm }}
+ liquid-audio: ${{ steps.detect.outputs.liquid-audio }}
llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }}
llama-cpp: ${{ steps.detect.outputs.llama-cpp }}
ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
@@ -447,6 +448,32 @@ jobs:
run: |
make --jobs=5 --output-sync=target -C backend/python/voxcpm
make --jobs=5 --output-sync=target -C backend/python/voxcpm test
+ # liquid-audio: LFM2.5-Audio any-to-any backend. The CI smoke test
+ # exercises Health() and LoadModel(mode:finetune) — fine-tune mode
+ # short-circuits before pulling weights (backend.py:192), so no
+ # HuggingFace download or GPU is needed. The full-inference path is
+ # gated on LIQUID_AUDIO_MODEL_ID, which we don't set here.
+ tests-liquid-audio:
+ needs: detect-changes
+ if: needs.detect-changes.outputs.liquid-audio == 'true' || needs.detect-changes.outputs.run-all == 'true'
+ runs-on: ubuntu-latest
+ steps:
+ - name: Clone
+ uses: actions/checkout@v6
+ with:
+ submodules: true
+ - name: Dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y build-essential ffmpeg
+ sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+ # Install UV
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ pip install --user --no-cache-dir grpcio-tools==1.64.1
+ - name: Test liquid-audio
+ run: |
+ make --jobs=5 --output-sync=target -C backend/python/liquid-audio
+ make --jobs=5 --output-sync=target -C backend/python/liquid-audio test
tests-llama-cpp-quantization:
needs: detect-changes
if: needs.detect-changes.outputs.llama-cpp-quantization == 'true' || needs.detect-changes.outputs.run-all == 'true'
diff --git a/Makefile b/Makefile
index 488018b0d..ebeef4c41 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
# Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
GOCMD=go
GOTEST=$(GOCMD) test
@@ -463,6 +463,7 @@ prepare-test-extra: protogen-python
$(MAKE) -C backend/python/vllm-omni
$(MAKE) -C backend/python/sglang
$(MAKE) -C backend/python/vibevoice
+ $(MAKE) -C backend/python/liquid-audio
$(MAKE) -C backend/python/moonshine
$(MAKE) -C backend/python/pocket-tts
$(MAKE) -C backend/python/qwen-tts
@@ -488,6 +489,7 @@ test-extra: prepare-test-extra
$(MAKE) -C backend/python/vllm test
$(MAKE) -C backend/python/vllm-omni test
$(MAKE) -C backend/python/vibevoice test
+ $(MAKE) -C backend/python/liquid-audio test
$(MAKE) -C backend/python/moonshine test
$(MAKE) -C backend/python/pocket-tts test
$(MAKE) -C backend/python/qwen-tts test
@@ -1092,6 +1094,7 @@ BACKEND_SGLANG = sglang|python|.|false|true
BACKEND_DIFFUSERS = diffusers|python|.|--progress=plain|true
BACKEND_CHATTERBOX = chatterbox|python|.|false|true
BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true
+BACKEND_LIQUID_AUDIO = liquid-audio|python|.|--progress=plain|true
BACKEND_MOONSHINE = moonshine|python|.|false|true
BACKEND_POCKET_TTS = pocket-tts|python|.|false|true
BACKEND_QWEN_TTS = qwen-tts|python|.|false|true
@@ -1169,6 +1172,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SGLANG)))
$(eval $(call generate-docker-build-target,$(BACKEND_DIFFUSERS)))
$(eval $(call generate-docker-build-target,$(BACKEND_CHATTERBOX)))
$(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE)))
+$(eval $(call generate-docker-build-target,$(BACKEND_LIQUID_AUDIO)))
$(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE)))
$(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS)))
$(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS)))
@@ -1197,7 +1201,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))
docker-save-%: backend-images
docker save local-ai-backend:$* -o backend-images/$*.tar
-docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx
+docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-liquid-audio docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx
########################################################
### Mock Backend for E2E Tests
diff --git a/backend/backend.proto b/backend/backend.proto
index 915f5f91e..bf07f3bd4 100644
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -48,6 +48,11 @@ service Backend {
rpc AudioTransform(AudioTransformRequest) returns (AudioTransformResult) {}
rpc AudioTransformStream(stream AudioTransformFrameRequest) returns (stream AudioTransformFrameResponse) {}
+ // AudioToAudioStream is the bidirectional any-to-any S2S RPC. Backends
+ // that load a speech-to-speech model consume input audio frames and emit
+ // interleaved audio + transcript + tool-call deltas as typed events.
+ // Backends without S2S support return UNIMPLEMENTED.
+ rpc AudioToAudioStream(stream AudioToAudioRequest) returns (stream AudioToAudioResponse) {}
rpc ModelMetadata(ModelOptions) returns (ModelMetadataResponse) {}
@@ -768,6 +773,93 @@ message AudioTransformFrameResponse {
int64 frame_index = 2;
}
+// === AudioToAudioStream messages =========================================
+//
+// Bidirectional stream between the LocalAI core and an any-to-any audio
+// model. The client opens the stream with a Config payload, then alternates
+// Frame (input audio) and Control (turn boundaries, function-call results,
+// session updates) payloads. The server streams back typed events: audio
+// frames carry PCM in `pcm`; transcript / tool-call deltas carry JSON in
+// `meta`; the stream ends with a `response.done` (success) or `error` event.
+
+message AudioToAudioRequest {
+ oneof payload {
+ AudioToAudioConfig config = 1;
+ AudioToAudioFrame frame = 2;
+ AudioToAudioControl control = 3;
+ }
+}
+
+message AudioToAudioConfig {
+ // PCM format for client→server audio. 0 => backend default
+ // (16 kHz for the LFM2-Audio Conformer encoder).
+ int32 input_sample_rate = 1;
+ // Preferred server→client audio rate. 0 => backend default
+ // (24 kHz for the LFM2-Audio vocoder).
+ int32 output_sample_rate = 2;
+ // Optional system prompt override. Empty => backend chooses based on
+ // mode (e.g. "Respond with interleaved text and audio.").
+ string system_prompt = 3;
+ // Optional baked-voice id. Models that only ship a fixed set of
+ // voices (e.g. LFM2-Audio: us_male/us_female/uk_male/uk_female) match
+ // this against their voice table; an empty string keeps the default.
+ string voice = 4;
+ // JSON-encoded array of tool definitions in OpenAI Chat Completions
+ // format. Empty => no tools.
+ string tools = 5;
+ // Free-form sampling / decoding parameters (temperature, top_k,
+ // max_new_tokens, audio_top_k, etc).
+ map<string, string> params = 6;
+ // True => reset any session-scoped state before processing further
+ // frames on this stream. The first Config implicitly resets.
+ bool reset = 7;
+}
+
+message AudioToAudioFrame {
+ // Raw PCM s16le mono at config.input_sample_rate. Empty pcm + end_of_input
+ // is a valid "user finished speaking" marker without trailing audio.
+ bytes pcm = 1;
+ // Marks the last frame of a user turn. The backend may begin emitting
+ // a response immediately after seeing this.
+ bool end_of_input = 2;
+}
+
+message AudioToAudioControl {
+ // Free-form control event names. Initial set:
+ // "input_audio_buffer.commit" — user finished speaking
+ // "response.cancel" — abort in-flight generation
+ // "conversation.item.create" — inject a non-audio item (e.g.
+ // function_call_output as JSON in
+ // `payload`)
+ // "session.update" — re-configure mid-stream
+ string event = 1;
+ // Event-specific JSON payload.
+ bytes payload = 2;
+}
+
+message AudioToAudioResponse {
+ // Event identifies what this frame carries. Mirrors the OpenAI Realtime
+ // API server-event names where applicable. Initial set:
+ // "response.audio.delta"
+ // "response.audio_transcript.delta"
+ // "response.function_call_arguments.delta"
+ // "response.function_call_arguments.done"
+ // "response.done"
+ // "error"
+ string event = 1;
+ // Populated when event = response.audio.delta.
+ bytes pcm = 2;
+ // Populated alongside pcm to identify its rate. 0 => same as the
+ // session's negotiated output_sample_rate.
+ int32 sample_rate = 3;
+ // JSON payload for non-PCM events (transcript chunk, tool args, error
+ // body).
+ bytes meta = 4;
+ // Monotonic per-stream counter, useful for client reordering and
+ // debugging.
+ int64 sequence = 5;
+}
+
message ModelMetadataResponse {
bool supports_thinking = 1;
string rendered_template = 2; // The rendered chat template with enable_thinking=true (empty if not applicable)
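
For orientation, here is a minimal client-side sketch of the wire protocol the new messages describe, written against the Python stubs generated from `backend.proto`. It is illustrative only: the address, the PCM source, and the single pre-built request list are assumptions (a real caller would interleave sends and receives), but the message, field, and event names all come from the definitions above.

```python
# Sketch: drive one user turn over AudioToAudioStream and collect the reply.
# Assumes generated backend_pb2 / backend_pb2_grpc stubs and a backend
# listening on the default address used by backend.py.
import json
import grpc
import backend_pb2
import backend_pb2_grpc

def one_turn(pcm_s16le_16khz: bytes, addr: str = "localhost:50051") -> bytes:
    stub = backend_pb2_grpc.BackendStub(grpc.insecure_channel(addr))

    def requests():
        # 1. Open the stream with a Config payload (implicitly resets state).
        yield backend_pb2.AudioToAudioRequest(config=backend_pb2.AudioToAudioConfig(
            input_sample_rate=16000,
            output_sample_rate=24000,
        ))
        # 2. Stream the user's audio as Frame payloads (s16le mono @ 16 kHz).
        yield backend_pb2.AudioToAudioRequest(frame=backend_pb2.AudioToAudioFrame(
            pcm=pcm_s16le_16khz,
        ))
        # 3. An empty frame with end_of_input marks the end of the user turn.
        yield backend_pb2.AudioToAudioRequest(frame=backend_pb2.AudioToAudioFrame(
            pcm=b"", end_of_input=True,
        ))

    audio = bytearray()
    for event in stub.AudioToAudioStream(requests()):
        if event.event == "response.audio.delta":
            audio.extend(event.pcm)  # PCM at event.sample_rate (24 kHz default)
        elif event.event == "response.audio_transcript.delta":
            print(json.loads(event.meta)["delta"], end="", flush=True)
        elif event.event in ("response.done", "error"):
            break
    return bytes(audio)
```

Tool-call results would travel the other way as a `conversation.item.create` Control payload whose JSON `payload` carries the `function_call_output` item, per the `AudioToAudioControl` comments above.
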
diff --git a/backend/index.yaml b/backend/index.yaml
index d04a2d6d5..a63f054da 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -847,6 +847,35 @@
nvidia-l4t-cuda-12: "nvidia-l4t-vibevoice"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vibevoice"
icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4
+- &liquid-audio
+ urls:
+ - https://github.com/Liquid4All/liquid-audio
+ - https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B
+ description: |
+ LiquidAI LFM2 / LFM2.5 Audio Python backend. End-to-end speech-to-speech, ASR,
+ TTS (4 baked voices), and text chat from a single 1.5B model. Wraps the
+ upstream `liquid-audio` package; supports fine-tuning via LocalAI's
+ /v1/fine-tuning/jobs endpoint.
+ tags:
+ - speech-to-speech
+ - any-to-any
+ - text-to-speech
+ - speech-to-text
+ - TTS
+ - ASR
+ - realtime
+ license: LFM-Open-License-v1.0
+ name: "liquid-audio"
+ alias: "liquid-audio"
+ capabilities:
+ nvidia: "cuda12-liquid-audio"
+ intel: "intel-liquid-audio"
+ amd: "rocm-liquid-audio"
+ default: "cpu-liquid-audio"
+ nvidia-cuda-13: "cuda13-liquid-audio"
+ nvidia-cuda-12: "cuda12-liquid-audio"
+ nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio"
+ icon: https://cdn-avatars.huggingface.co/v1/production/uploads/61b8e2ba285851687028d395/7_6D7rWrLxp2hb6OHSV1p.png
- &qwen-tts
urls:
- https://github.com/QwenLM/Qwen3-TTS
@@ -3437,6 +3466,77 @@
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vibevoice"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-vibevoice
+## liquid-audio
+- !!merge <<: *liquid-audio
+ name: "liquid-audio-development"
+ capabilities:
+ nvidia: "cuda12-liquid-audio-development"
+ intel: "intel-liquid-audio-development"
+ amd: "rocm-liquid-audio-development"
+ default: "cpu-liquid-audio-development"
+ nvidia-cuda-13: "cuda13-liquid-audio-development"
+ nvidia-cuda-12: "cuda12-liquid-audio-development"
+ nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-liquid-audio-development"
+- !!merge <<: *liquid-audio
+ name: "cpu-liquid-audio"
+ uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-liquid-audio"
+ mirrors:
+ - localai/localai-backends:latest-cpu-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "cpu-liquid-audio-development"
+ uri: "quay.io/go-skynet/local-ai-backends:master-cpu-liquid-audio"
+ mirrors:
+ - localai/localai-backends:master-cpu-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "cuda12-liquid-audio"
+ uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-liquid-audio"
+ mirrors:
+ - localai/localai-backends:latest-gpu-nvidia-cuda-12-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "cuda12-liquid-audio-development"
+ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-liquid-audio"
+ mirrors:
+ - localai/localai-backends:master-gpu-nvidia-cuda-12-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "cuda13-liquid-audio"
+ uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-liquid-audio"
+ mirrors:
+ - localai/localai-backends:latest-gpu-nvidia-cuda-13-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "cuda13-liquid-audio-development"
+ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-liquid-audio"
+ mirrors:
+ - localai/localai-backends:master-gpu-nvidia-cuda-13-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "intel-liquid-audio"
+ uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-liquid-audio"
+ mirrors:
+ - localai/localai-backends:latest-gpu-intel-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "intel-liquid-audio-development"
+ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-liquid-audio"
+ mirrors:
+ - localai/localai-backends:master-gpu-intel-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "rocm-liquid-audio"
+ uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-liquid-audio"
+ mirrors:
+ - localai/localai-backends:latest-gpu-rocm-hipblas-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "rocm-liquid-audio-development"
+ uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-liquid-audio"
+ mirrors:
+ - localai/localai-backends:master-gpu-rocm-hipblas-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "cuda13-nvidia-l4t-arm64-liquid-audio"
+ uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-liquid-audio"
+ mirrors:
+ - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-liquid-audio
+- !!merge <<: *liquid-audio
+ name: "cuda13-nvidia-l4t-arm64-liquid-audio-development"
+ uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-liquid-audio"
+ mirrors:
+ - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-liquid-audio
## qwen-tts
- !!merge <<: *qwen-tts
name: "qwen-tts-development"
diff --git a/backend/python/liquid-audio/Makefile b/backend/python/liquid-audio/Makefile
new file mode 100644
index 000000000..59b71c8c9
--- /dev/null
+++ b/backend/python/liquid-audio/Makefile
@@ -0,0 +1,23 @@
+.PHONY: liquid-audio
+liquid-audio:
+ bash install.sh
+
+.PHONY: run
+run: liquid-audio
+ @echo "Running liquid-audio..."
+ bash run.sh
+ @echo "liquid-audio run."
+
+.PHONY: test
+test: liquid-audio
+ @echo "Testing liquid-audio..."
+ bash test.sh
+ @echo "liquid-audio tested."
+
+.PHONY: protogen-clean
+protogen-clean:
+ $(RM) backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: clean
+clean: protogen-clean
+ rm -rf venv __pycache__
diff --git a/backend/python/liquid-audio/backend.py b/backend/python/liquid-audio/backend.py
new file mode 100644
index 000000000..0a64ecb3e
--- /dev/null
+++ b/backend/python/liquid-audio/backend.py
@@ -0,0 +1,871 @@
+#!/usr/bin/env python3
+"""
+Liquid Audio backend for LocalAI.
+
+Wraps LiquidAI's `liquid-audio` Python package (https://github.com/Liquid4All/liquid-audio).
+The same model serves four roles, selected by the `mode` option at load time:
+chat, asr, tts, s2s. Fine-tuning is exposed via StartFineTune.
+"""
+from concurrent import futures
+import argparse
+import json
+import os
+import queue
+import signal
+import sys
+import threading
+import time
+import traceback
+import uuid
+
+import grpc
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'common'))
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'common'))
+from grpc_auth import get_auth_interceptors # noqa: E402
+from python_utils import parse_options # noqa: E402
+
+import backend_pb2 # noqa: E402
+import backend_pb2_grpc # noqa: E402
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Voice id → system-prompt suffix. The model only ships these four voices.
+VOICE_PROMPTS = {
+ "us_male": "Perform TTS. Use the US male voice.",
+ "us_female": "Perform TTS. Use the US female voice.",
+ "uk_male": "Perform TTS. Use the UK male voice.",
+ "uk_female": "Perform TTS. Use the UK female voice.",
+}
+DEFAULT_VOICE = "us_female"
+
+# Special-token IDs that LFM2-Audio emits to delimit modality boundaries.
+# Sourced from liquid_audio/model/lfm2_audio.py (see generate_sequential/_sample_*).
+TEXT_END_TOKEN = 130 # <|text_end|>
+AUDIO_START_TOKEN = 128 # <|audio_start|>
+IM_END_TOKEN = 7 # <|im_end|>
+AUDIO_EOS_CODE = 2048 # signals end-of-audio in any codebook position
+
+_PATCHED_LOCAL_PATHS = False
+
+
+def _patch_liquid_audio_local_paths():
+ """Make liquid_audio.utils.get_model_dir() tolerate local directories.
+
+ Upstream always passes its argument to huggingface_hub.snapshot_download,
+ which only accepts `owner/repo` ids. LocalAI's gallery hands us absolute
+ paths under //, so we intercept snapshot_download
+ in the liquid_audio.utils namespace and return the directory as-is when
+ it already exists on disk. Idempotent.
+ """
+ global _PATCHED_LOCAL_PATHS
+ if _PATCHED_LOCAL_PATHS:
+ return
+ import liquid_audio.utils as _la_utils
+ _orig_snapshot_download = _la_utils.snapshot_download
+
+ def _local_first_snapshot_download(repo_id, revision=None, **kwargs):
+ if isinstance(repo_id, (str, os.PathLike)) and os.path.isdir(str(repo_id)):
+ return str(repo_id)
+ return _orig_snapshot_download(repo_id, revision=revision, **kwargs)
+
+ _la_utils.snapshot_download = _local_first_snapshot_download
+ _PATCHED_LOCAL_PATHS = True
+
+
+def _select_device():
+ import torch
+ if torch.cuda.is_available():
+ return "cuda"
+ if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+ return "mps"
+ return "cpu"
+
+
+class ActiveJob:
+ """Tracks an in-flight fine-tune so FineTuneProgress can stream from its queue."""
+
+ def __init__(self, job_id):
+ self.job_id = job_id
+ self.progress_queue = queue.Queue()
+ self.thread = None
+ self.stopped = False
+ self.completed = False
+ self.error = None
+
+
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+ def __init__(self):
+ self.processor = None
+ self.model = None
+ self.device = "cpu"
+ self.dtype = None
+ self.options = {}
+ self.model_id = None
+ self.active_job = None
+
+ @property
+ def mode(self):
+ return str(self.options.get("mode", "chat")).lower()
+
+ @property
+ def voice(self):
+ v = str(self.options.get("voice", DEFAULT_VOICE)).lower()
+ return v if v in VOICE_PROMPTS else DEFAULT_VOICE
+
+
+ def Free(self, request, context):
+ # Called by LocalAI when unloading the model. Drop GPU tensors so the
+ # next load starts from a clean state instead of bumping into OOM.
+ try:
+ for attr in ("model", "processor", "tokenizer"):
+ if hasattr(self, attr):
+ try:
+ delattr(self, attr)
+ except Exception:
+ pass
+ import gc
+ gc.collect()
+ try:
+ import torch
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ except Exception:
+ pass
+ return backend_pb2.Result(success=True, message="OK")
+ except Exception as exc:
+ print(f"Free failed: {exc}", file=sys.stderr)
+ return backend_pb2.Result(success=False, message=str(exc))
+
+
+ def Health(self, request, context):
+ return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+
+ def LoadModel(self, request, context):
+ try:
+ import torch
+
+ self.options = parse_options(request.Options)
+ if self.options.get("voice") and self.options["voice"] not in VOICE_PROMPTS:
+ print(f"Warning: unknown voice '{self.options['voice']}'; defaulting to '{DEFAULT_VOICE}'",
+ file=sys.stderr)
+
+ requested_device = self.options.get("device")
+ self.device = requested_device or _select_device()
+ if self.device == "cuda" and not torch.cuda.is_available():
+ return backend_pb2.Result(success=False, message="CUDA requested but not available")
+ if self.device == "mps" and not (hasattr(torch.backends, "mps") and
+ torch.backends.mps.is_available()):
+ print("MPS not available; falling back to CPU", file=sys.stderr)
+ self.device = "cpu"
+
+ dtype_name = str(self.options.get("dtype", "bfloat16")).lower()
+ self.dtype = {
+ "bfloat16": torch.bfloat16,
+ "bf16": torch.bfloat16,
+ "float16": torch.float16,
+ "fp16": torch.float16,
+ "half": torch.float16,
+ "float32": torch.float32,
+ "fp32": torch.float32,
+ }.get(dtype_name, torch.bfloat16)
+
+ # request.Model holds the raw `parameters.model` value (an HF
+ # repo id like "LiquidAI/LFM2.5-Audio-1.5B"); request.ModelFile
+ # is LocalAI's ModelPath-prefixed local copy that exists only
+ # when the gallery supplied a `files:` list. Mirror the
+ # transformers/vibevoice convention: prefer the repo id and
+ # only switch to the local path if it's been staged on disk.
+ model_id = request.Model
+ if not model_id:
+ model_id = request.ModelFile
+ if not model_id:
+ return backend_pb2.Result(success=False, message="No model identifier provided")
+ if request.ModelFile and os.path.isdir(request.ModelFile):
+ model_id = request.ModelFile
+ self.model_id = model_id
+
+ # Pure fine-tune jobs don't need an in-memory inference model — the
+ # Trainer instantiates its own copy at StartFineTune time.
+ if self.mode == "finetune":
+ print(f"Loaded liquid-audio backend in fine-tune mode (model id: {model_id})",
+ file=sys.stderr)
+ return backend_pb2.Result(success=True, message="OK")
+
+ from liquid_audio import LFM2AudioModel, LFM2AudioProcessor
+
+ # liquid_audio's from_pretrained unconditionally routes through
+ # huggingface_hub.snapshot_download, which rejects local paths
+ # (HFValidationError on `/models/LiquidAI/LFM2.5-Audio-1.5B`).
+ # When LocalAI's gallery has already staged the weights on disk,
+ # short-circuit the download to return the local directory.
+ _patch_liquid_audio_local_paths()
+
+ print(f"Loading liquid-audio model '{model_id}' on {self.device} ({self.dtype})",
+ file=sys.stderr)
+ self.processor = LFM2AudioProcessor.from_pretrained(model_id, device=self.device).eval()
+ self.model = LFM2AudioModel.from_pretrained(
+ model_id, device=self.device, dtype=self.dtype
+ ).eval()
+
+ print(f"Liquid-audio mode={self.mode}, voice={self.voice}", file=sys.stderr)
+ return backend_pb2.Result(success=True, message="OK")
+
+ except Exception as exc:
+ print(f"LoadModel failed: {exc}", file=sys.stderr)
+ print(traceback.format_exc(), file=sys.stderr)
+ return backend_pb2.Result(success=False, message=str(exc))
+
+
+ def Predict(self, request, context):
+ try:
+ text = "".join(self._generate_text_stream(request))
+ return backend_pb2.Reply(message=text.encode("utf-8"))
+ except Exception as exc:
+ print(f"Predict failed: {exc}", file=sys.stderr)
+ print(traceback.format_exc(), file=sys.stderr)
+ context.set_code(grpc.StatusCode.INTERNAL)
+ context.set_details(str(exc))
+ return backend_pb2.Reply()
+
+ def PredictStream(self, request, context):
+ try:
+ for delta in self._generate_text_stream(request):
+ yield backend_pb2.Reply(message=delta.encode("utf-8"))
+ except Exception as exc:
+ print(f"PredictStream failed: {exc}", file=sys.stderr)
+ print(traceback.format_exc(), file=sys.stderr)
+ context.set_code(grpc.StatusCode.INTERNAL)
+ context.set_details(str(exc))
+
+
+ def VAD(self, request, context):
+ # Stub voice-activity detector: RMS-energy threshold over 30ms frames at
+ # 16 kHz. Good enough for the realtime endpoint's handleVAD loop, which
+ # only inspects segment presence + last segment end. The proper signal
+ # would come from the model's audio encoder, but that ride-along is a
+ # PR-D scope item — until then this keeps the legacy pipeline path
+ # working without forcing the operator to install a separate VAD model.
+ import numpy as np
+ try:
+ audio = np.asarray(request.audio, dtype=np.float32)
+ if audio.size == 0:
+ return backend_pb2.VADResponse(segments=[])
+
+ sample_rate = 16000
+ frame_size = sample_rate * 30 // 1000 # 30ms → 480 samples
+ threshold = float(self.options.get("vad_rms_threshold", 0.01))
+ min_speech_frames = int(self.options.get("vad_min_speech_frames", 2)) # ≥60ms
+ # handleVAD ticks every 300 ms and only inspects segment presence
+ # + last segment end relative to silence_threshold (~500 ms). Cap
+ # the analysed window to the tail of the buffer so we don't redo
+ # the entire growing utterance every tick.
+ window_s = float(self.options.get("vad_window_s", 5.0))
+ window_samples = int(window_s * sample_rate)
+ time_offset_s = 0.0
+ if audio.size > window_samples:
+ time_offset_s = (audio.size - window_samples) / sample_rate
+ audio = audio[-window_samples:]
+
+ n_frames = audio.size // frame_size
+ if n_frames == 0:
+ return backend_pb2.VADResponse(segments=[])
+ frames = audio[: n_frames * frame_size].reshape(n_frames, frame_size)
+ rms = np.sqrt(np.mean(frames ** 2, axis=1))
+ speech = rms > threshold
+
+ def _emit(start_idx, end_idx, out):
+ if end_idx - start_idx >= min_speech_frames:
+ out.append(backend_pb2.VADSegment(
+ start=time_offset_s + start_idx * frame_size / sample_rate,
+ end=time_offset_s + end_idx * frame_size / sample_rate,
+ ))
+
+ segments = []
+ start_idx = None
+ for i, is_speech in enumerate(speech):
+ if is_speech and start_idx is None:
+ start_idx = i
+ elif not is_speech and start_idx is not None:
+ _emit(start_idx, i, segments)
+ start_idx = None
+ if start_idx is not None:
+ _emit(start_idx, n_frames, segments)
+ return backend_pb2.VADResponse(segments=segments)
+ except Exception as exc:
+ print(f"VAD failed: {exc}", file=sys.stderr)
+ print(traceback.format_exc(), file=sys.stderr)
+ context.set_code(grpc.StatusCode.INTERNAL)
+ context.set_details(str(exc))
+ return backend_pb2.VADResponse(segments=[])
+
+
+ def TTS(self, request, context):
+ try:
+ if self.model is None or self.processor is None:
+ return backend_pb2.Result(success=False, message="Model not loaded")
+
+ import torch
+ import torchaudio
+ from liquid_audio import ChatState
+
+ voice = request.voice.lower() if request.voice else self.voice
+ voice = voice.removeprefix("lfm2:").removeprefix("lfm:")
+ if voice not in VOICE_PROMPTS:
+ voice = self.voice
+ system_prompt = VOICE_PROMPTS[voice]
+
+ chat = ChatState(self.processor)
+ chat.new_turn("system")
+ chat.add_text(system_prompt)
+ chat.end_turn()
+ chat.new_turn("user")
+ chat.add_text(request.text or "")
+ chat.end_turn()
+ chat.new_turn("assistant")
+
+ audio_top_k = int(self.options.get("audio_top_k", 64))
+ audio_temp = float(self.options.get("audio_temperature", 0.8))
+ max_new = int(self.options.get("max_new_tokens", 2048))
+
+ audio_out = []
+ for tok in self.model.generate_sequential(
+ **chat,
+ max_new_tokens=max_new,
+ audio_temperature=audio_temp,
+ audio_top_k=audio_top_k,
+ ):
+ if tok.numel() > 1:
+ audio_out.append(tok)
+
+ if len(audio_out) <= 1:
+ return backend_pb2.Result(success=False, message="No audio frames generated")
+
+ # Drop the trailing end-of-audio frame, matching the package's examples.
+ audio_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0)
+ waveform = self.processor.decode(audio_codes)
+
+ out_path = request.dst
+ if not out_path:
+ return backend_pb2.Result(success=False, message="dst path is required")
+ os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
+ # soundfile in preference to torchaudio.save — the latter routes
+ # through torchcodec, whose native libs need NVIDIA NPP that we
+ # don't bundle in the cuda13 image.
+ import soundfile as _sf
+ _sf.write(out_path, waveform.cpu().numpy().squeeze(0).T, 24_000)
+
+ return backend_pb2.Result(success=True)
+ except Exception as exc:
+ print(f"TTS failed: {exc}", file=sys.stderr)
+ print(traceback.format_exc(), file=sys.stderr)
+ return backend_pb2.Result(success=False, message=str(exc))
+
+
+ def AudioToAudioStream(self, request_iterator, context):
+ """Bidirectional any-to-any speech-to-speech stream.
+
+ See `backend.proto` AudioToAudioStream for the wire protocol. Audio
+ is decoded once per turn here; chunked detokenization for sub-second
+ TTFB is left to a future iteration once the LFM2AudioDetokenizer
+ gains a streaming entry point.
+ """
+ try:
+ yield from self._audio_to_audio_stream(request_iterator, context)
+ except Exception as exc:
+ print(f"AudioToAudioStream failed: {exc}", file=sys.stderr)
+ print(traceback.format_exc(), file=sys.stderr)
+ yield backend_pb2.AudioToAudioResponse(
+ event="error",
+ meta=json.dumps({"message": str(exc)}).encode("utf-8"),
+ )
+
+ def _audio_to_audio_stream(self, request_iterator, context):
+ if self.model is None or self.processor is None:
+ raise RuntimeError("Model not loaded")
+
+ import torch
+ import torchaudio
+ from liquid_audio import ChatState
+
+ cfg = None
+ chat = None
+ input_sample_rate = 16000
+ output_sample_rate = 24000
+ sequence = 0
+
+ def _new_event(event, **kwargs):
+ nonlocal sequence
+ sequence += 1
+ kwargs.setdefault("sequence", sequence)
+ return backend_pb2.AudioToAudioResponse(event=event, **kwargs)
+
+ def _ensure_chat():
+ """Build a fresh ChatState seeded with the system prompt."""
+ nonlocal chat
+ chat = ChatState(self.processor)
+ system_prompt = (cfg.system_prompt if cfg and cfg.system_prompt
+ else "Respond with interleaved text and audio.")
+ chat.new_turn("system")
+ chat.add_text(system_prompt)
+ chat.end_turn()
+
+ # Buffers for the in-flight user turn
+ pcm_buffer = bytearray()
+
+ def _consume_user_turn():
+ nonlocal pcm_buffer
+ if not pcm_buffer:
+ return
+ # Avoid an intermediate bytes(pcm_buffer) copy: view the buffer as
+ # int16, convert once to float32, then normalise in-place.
+ import numpy as np
+ arr = np.frombuffer(memoryview(pcm_buffer), dtype=np.int16)
+ wav = torch.from_numpy(arr).to(torch.float32).div_(32768.0).unsqueeze(0)
+ chat.new_turn("user")
+ chat.add_audio(wav, input_sample_rate)
+ chat.end_turn()
+ pcm_buffer = bytearray()
+
+ def _run_generation():
+ """Run generate_interleaved; yield response events as we go."""
+ chat.new_turn("assistant")
+ audio_top_k = int(self.options.get("audio_top_k", 4))
+ audio_temp = float(self.options.get("audio_temperature", 1.0))
+ text_top_k = int(self.options.get("text_top_k", 0)) or None
+ text_temp = float(self.options.get("text_temperature", 0)) or None
+ max_new = int(self.options.get("max_new_tokens", 512))
+
+ audio_tokens = []
+ for tok in self.model.generate_interleaved(
+ **chat,
+ max_new_tokens=max_new,
+ text_temperature=text_temp,
+ text_top_k=text_top_k,
+ audio_temperature=audio_temp,
+ audio_top_k=audio_top_k,
+ ):
+ if tok.numel() == 1:
+ if tok.item() == IM_END_TOKEN:
+ break
+ text = self.processor.text.decode(tok)
+ if not text:
+ continue
+ yield _new_event(
+ "response.audio_transcript.delta",
+ meta=json.dumps({"delta": text}).encode("utf-8"),
+ )
+ else:
+ audio_tokens.append(tok)
+
+ # Detokenize the accumulated audio at end-of-turn — the
+ # LFM2AudioDetokenizer is non-streaming today.
+ if len(audio_tokens) > 1:
+ audio_codes = torch.stack(audio_tokens[:-1], 1).unsqueeze(0)
+ waveform = self.processor.decode(audio_codes)
+ # Convert to s16le PCM bytes at output_sample_rate
+ if output_sample_rate != 24000:
+ waveform = torchaudio.functional.resample(
+ waveform.cpu(), 24000, output_sample_rate
+ )
+ pcm = (waveform.cpu().squeeze(0).clamp(-1, 1) * 32767.0).to(
+ torch.int16
+ ).numpy().tobytes()
+ yield _new_event(
+ "response.audio.delta",
+ pcm=pcm,
+ sample_rate=output_sample_rate,
+ )
+
+ yield _new_event("response.done", meta=b"{}")
+
+ for req in request_iterator:
+ if not context.is_active():
+ return
+ payload = req.WhichOneof("payload")
+ if payload == "config":
+ cfg = req.config
+ if cfg.input_sample_rate > 0:
+ input_sample_rate = cfg.input_sample_rate
+ if cfg.output_sample_rate > 0:
+ output_sample_rate = cfg.output_sample_rate
+ # The first config implicitly resets state.
+ _ensure_chat()
+ pcm_buffer = bytearray()
+ elif payload == "frame":
+ if chat is None:
+ _ensure_chat()
+ if req.frame.pcm:
+ pcm_buffer.extend(req.frame.pcm)
+ if req.frame.end_of_input:
+ _consume_user_turn()
+ yield from _run_generation()
+ elif payload == "control":
+ event = req.control.event
+ if event == "input_audio_buffer.commit":
+ _consume_user_turn()
+ yield from _run_generation()
+ elif event == "response.cancel":
+ # Synchronous generation here means cancel can only
+ # take effect between turns; we ack so the client unblocks.
+ yield _new_event("response.done", meta=b'{"cancelled":true}')
+ elif event == "session.update":
+ # Free-form session re-config; treat as a soft reset.
+ _ensure_chat()
+ pcm_buffer = bytearray()
+ # Unknown events are ignored — forward-compatible.
+
+
+ def AudioTranscription(self, request, context):
+ try:
+ if self.model is None or self.processor is None:
+ return backend_pb2.TranscriptResult(segments=[], text="")
+
+ import torchaudio
+ from liquid_audio import ChatState
+
+ audio_path = request.dst
+ if not audio_path:
+ return backend_pb2.TranscriptResult(segments=[], text="")
+
+ chat = ChatState(self.processor)
+ chat.new_turn("system")
+ chat.add_text("Perform ASR.")
+ chat.end_turn()
+ chat.new_turn("user")
+ # soundfile in preference to torchaudio.load — the latter routes
+ # through torchcodec which needs NVIDIA NPP libs we don't bundle.
+ import soundfile as _sf
+ import torch
+ audio_np, sr = _sf.read(audio_path, dtype="float32", always_2d=True)
+ wav = torch.from_numpy(audio_np.T) # (channels, samples)
+ if wav.shape[0] > 1:
+ # Down-mix to mono — the processor expects a single channel
+ wav = wav.mean(dim=0, keepdim=True)
+ chat.add_audio(wav, sr)
+ chat.end_turn()
+ chat.new_turn("assistant")
+
+ max_new = int(self.options.get("max_new_tokens", 1024))
+
+ pieces = []
+ for tok in self.model.generate_sequential(**chat, max_new_tokens=max_new):
+ if tok.numel() == 1:
+ if tok.item() == IM_END_TOKEN:
+ break
+ pieces.append(self.processor.text.decode(tok))
+
+ text = "".join(pieces).strip()
+ duration_ms = int((wav.shape[1] / sr) * 1000)
+ segment = backend_pb2.TranscriptSegment(
+ id=0, start=0, end=duration_ms, text=text, tokens=[],
+ )
+ return backend_pb2.TranscriptResult(segments=[segment], text=text)
+ except Exception as exc:
+ print(f"AudioTranscription failed: {exc}", file=sys.stderr)
+ print(traceback.format_exc(), file=sys.stderr)
+ return backend_pb2.TranscriptResult(segments=[], text="")
+
+
+ def StartFineTune(self, request, context):
+ if self.active_job is not None and not self.active_job.completed:
+ return backend_pb2.FineTuneJobResult(
+ job_id="", success=False,
+ message="A fine-tuning job is already running",
+ )
+
+ job_id = request.job_id or str(uuid.uuid4())
+ job = ActiveJob(job_id)
+ self.active_job = job
+
+ thread = threading.Thread(target=self._run_training, args=(request, job), daemon=True)
+ job.thread = thread
+ thread.start()
+
+ return backend_pb2.FineTuneJobResult(
+ job_id=job_id, success=True, message="Training started",
+ )
+
+ def FineTuneProgress(self, request, context):
+ if self.active_job is None or self.active_job.job_id != request.job_id:
+ context.set_code(grpc.StatusCode.NOT_FOUND)
+ context.set_details(f"Job {request.job_id} not found")
+ return
+
+ job = self.active_job
+ while True:
+ try:
+ update = job.progress_queue.get(timeout=1.0)
+ except queue.Empty:
+ if job.completed or job.stopped:
+ break
+ if not context.is_active():
+ break
+ continue
+ if update is None:
+ break
+ yield update
+ if update.status in ("completed", "failed", "stopped"):
+ break
+
+ def StopFineTune(self, request, context):
+ # We can't kill the Accelerate training loop mid-step cleanly from here;
+ # LocalAI's job manager kills the backend process on stop. The flag below
+ # at least lets the progress stream terminate quickly.
+ if self.active_job is not None and self.active_job.job_id == request.job_id:
+ self.active_job.stopped = True
+ self.active_job.progress_queue.put(None)
+ return backend_pb2.Result(success=True, message="OK")
+
+ def _run_training(self, request, job):
+ try:
+ self._do_train(request, job)
+ job.completed = True
+ job.progress_queue.put(backend_pb2.FineTuneProgressUpdate(
+ job_id=job.job_id, status="completed", message="Training completed",
+ progress_percent=100.0,
+ ))
+ except Exception as exc:
+ job.error = str(exc)
+ job.completed = True
+ print(f"Training failed: {exc}", file=sys.stderr)
+ print(traceback.format_exc(), file=sys.stderr)
+ job.progress_queue.put(backend_pb2.FineTuneProgressUpdate(
+ job_id=job.job_id, status="failed", message=str(exc),
+ ))
+ finally:
+ job.progress_queue.put(None)
+
+ def _do_train(self, request, job):
+ from liquid_audio import LFM2AudioModel # noqa: F401 (sanity import)
+ from liquid_audio.data.dataloader import LFM2DataLoader
+ from liquid_audio.trainer import Trainer
+
+ model_id = request.model or self.model_id or "LiquidAI/LFM2.5-Audio-1.5B"
+
+ dataset_path = request.dataset_source
+ if not dataset_path:
+ raise ValueError("dataset_source is required (path to a preprocessed dataset)")
+
+ extras = dict(request.extra_options) if request.extra_options else {}
+ val_path = extras.get("val_dataset")
+
+ # Map FineTuneRequest hyperparameters to liquid_audio.Trainer constructor args
+ lr = request.learning_rate or 3e-5
+ max_steps = request.max_steps or 1000
+ warmup_steps = request.warmup_steps or min(100, max_steps // 10)
+ batch_size = request.batch_size or 16
+ save_interval = request.save_steps or max(1, max_steps // 4)
+
+ output_dir = request.output_dir or os.path.join(
+ os.environ.get("LIQUID_AUDIO_OUTPUT_DIR", "/tmp"),
+ f"liquid-audio-{job.job_id}",
+ )
+ os.makedirs(output_dir, exist_ok=True)
+
+ job.progress_queue.put(backend_pb2.FineTuneProgressUpdate(
+ job_id=job.job_id, status="loading_dataset",
+ message=f"Loading preprocessed dataset from {dataset_path}",
+ ))
+ train_data = LFM2DataLoader(dataset_path)
+ val_data = LFM2DataLoader(val_path) if val_path else None
+
+ job.progress_queue.put(backend_pb2.FineTuneProgressUpdate(
+ job_id=job.job_id, status="loading_model",
+ message=f"Loading base model {model_id}",
+ ))
+
+ # The Liquid Trainer logs via self.accelerator.print; we subclass it to
+ # also push progress events onto the queue every logging_interval steps.
+ progress_q = job.progress_queue
+
+ class QueuedTrainer(Trainer):
+ def log(self_, model_output):
+ if self_.step > 0 and self_.step % self_.logging_interval == 0:
+ try:
+ loss = self_.accelerator.reduce(
+ model_output.loss.detach(), reduction="mean"
+ ).item()
+ except Exception:
+ loss = float("nan")
+ lr_now = self_.optimizer.param_groups[0]["lr"]
+ pct = (self_.step / self_.max_steps * 100.0) if self_.max_steps else 0.0
+ progress_q.put(backend_pb2.FineTuneProgressUpdate(
+ job_id=job.job_id,
+ current_step=int(self_.step),
+ total_steps=int(self_.max_steps),
+ current_epoch=float(self_.epoch),
+ loss=float(loss),
+ learning_rate=float(lr_now),
+ progress_percent=float(pct),
+ status="training",
+ ))
+ # Honour stop requests: raising here terminates the loop cleanly
+ if job.stopped:
+ raise KeyboardInterrupt("stop requested")
+ return super().log(model_output)
+
+ def validate(self_):
+ progress_q.put(backend_pb2.FineTuneProgressUpdate(
+ job_id=job.job_id, current_step=int(self_.step),
+ total_steps=int(self_.max_steps), status="training",
+ message=f"Running validation at step {self_.step}",
+ ))
+ return super().validate()
+
+ trainer = QueuedTrainer(
+ model_id=model_id,
+ train_data=train_data,
+ val_data=val_data,
+ lr=lr,
+ max_steps=max_steps,
+ warmup_steps=warmup_steps,
+ batch_size=batch_size,
+ save_interval=save_interval,
+ output_dir=output_dir,
+ weight_decay=request.weight_decay or 0.1,
+ )
+
+ job.progress_queue.put(backend_pb2.FineTuneProgressUpdate(
+ job_id=job.job_id, status="training", message="Training started",
+ total_steps=int(max_steps),
+ ))
+ trainer.train()
+
+ job.progress_queue.put(backend_pb2.FineTuneProgressUpdate(
+ job_id=job.job_id, status="saving",
+ message=f"Saved final model to {output_dir}",
+ checkpoint_path=os.path.join(output_dir, "final"),
+ ))
+
+
+ def _build_chat_state(self, messages, user_prompt, tools_prelude=None):
+ """Build a ChatState from a list of (role, content) tuples plus an optional final user turn.
+
+ tools_prelude, when non-empty, is prepended as an extra system turn carrying
+ the LFM2 tool-list block — mirrors gallery/lfm.yaml's `function:` template
+ so the model sees the same prompt shape whether served via llama-cpp or here.
+ """
+ from liquid_audio import ChatState
+ chat = ChatState(self.processor)
+ if tools_prelude:
+ chat.new_turn("system")
+ chat.add_text(tools_prelude)
+ chat.end_turn()
+ for role, content in messages:
+ chat.new_turn(role)
+ chat.add_text(content)
+ chat.end_turn()
+ if user_prompt:
+ chat.new_turn("user")
+ chat.add_text(user_prompt)
+ chat.end_turn()
+ chat.new_turn("assistant")
+ return chat
+
+ def _collect_messages(self, request):
+ """Translate PredictOptions.Messages into (role, content) tuples."""
+ out = []
+ for m in request.Messages:
+ role = (m.role or "user").lower()
+ if role not in ("system", "user", "assistant"):
+ role = "user"
+ out.append((role, m.content or ""))
+ return out
+
+ def _render_tools_prelude(self, request):
+ """Build the LFM2 `<|tool_list_start|>…<|tool_list_end|>` system prelude
+ from request.Tools (OpenAI Chat-Completions tool JSON). Returns "" when
+ no tools are attached. Output mirrors gallery/lfm.yaml's `function:`
+ template so the model sees the same prompt whether routed via llama-cpp
+ or this backend."""
+ tools_raw = getattr(request, "Tools", "") or ""
+ if not tools_raw:
+ return ""
+ try:
+ tools = json.loads(tools_raw)
+ except json.JSONDecodeError:
+ print(f"liquid-audio: ignoring malformed Tools JSON: {tools_raw[:200]!r}",
+ file=sys.stderr)
+ return ""
+ if not isinstance(tools, list) or not tools:
+ return ""
+ # The LFM2 chat template uses single-quoted Python-dict-ish syntax in
+ # examples, but the tokenizer treats this whole block as opaque text;
+ # JSON works fine and is what other backends emit.
+ return (
+ "You are a function calling AI model. You are provided with functions to "
+ "execute. You may call one or more functions to assist with the user query. "
+ "Don't make assumptions about what values to plug into functions.\n"
+ "List of tools: <|tool_list_start|>"
+ + json.dumps(tools, separators=(",", ":"))
+ + "<|tool_list_end|>"
+ )
+
+ def _generate_text_stream(self, request):
+ """Yield text-only deltas from generate_sequential. Caller joins for unary Predict."""
+ if self.model is None or self.processor is None:
+ raise RuntimeError("Model not loaded")
+ messages = self._collect_messages(request)
+ user_prompt = request.Prompt or None
+ tools_prelude = self._render_tools_prelude(request)
+ # If the request already carries Messages, Prompt is the templated form
+ # of the same content — don't append a duplicate user turn.
+ chat = self._build_chat_state(
+ messages,
+ user_prompt if not messages else None,
+ tools_prelude=tools_prelude,
+ )
+
+ max_new = request.Tokens if request.Tokens > 0 else int(self.options.get("max_new_tokens", 512))
+ temperature = request.Temperature if request.Temperature > 0 else None
+ top_k = request.TopK if request.TopK > 0 else None
+
+ for tok in self.model.generate_sequential(
+ **chat,
+ max_new_tokens=max_new,
+ text_temperature=temperature,
+ text_top_k=top_k,
+ ):
+ if tok.numel() == 1:
+ if tok.item() == IM_END_TOKEN:
+ break
+ yield self.processor.text.decode(tok)
+
+
+def serve(address):
+ server = grpc.server(
+ futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+ options=[
+ ('grpc.max_message_length', 50 * 1024 * 1024),
+ ('grpc.max_send_message_length', 50 * 1024 * 1024),
+ ('grpc.max_receive_message_length', 50 * 1024 * 1024),
+ ],
+ interceptors=get_auth_interceptors(),
+ )
+ backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+ server.add_insecure_port(address)
+ server.start()
+ print(f"Liquid-audio backend listening on {address}", file=sys.stderr, flush=True)
+
+ def stop(_signum, _frame):
+ server.stop(0)
+ sys.exit(0)
+
+ signal.signal(signal.SIGTERM, stop)
+ signal.signal(signal.SIGINT, stop)
+
+ try:
+ while True:
+ time.sleep(_ONE_DAY_IN_SECONDS)
+ except KeyboardInterrupt:
+ server.stop(0)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Liquid Audio gRPC backend")
+ parser.add_argument("--addr", default="localhost:50051", help="gRPC server address")
+ args = parser.parse_args()
+ serve(args.addr)
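
To make the fine-tuning path concrete, a hypothetical client call is sketched below. It assumes the request message is the `FineTuneRequest` referenced in `_do_train` and only sets fields the handler actually reads; progress then streams back through `FineTuneProgress` as `FineTuneProgressUpdate` messages.

```python
# Hypothetical sketch: start a fine-tune job on a backend loaded with
# Options=["mode:finetune"]. Field names mirror what StartFineTune and
# _do_train read from the request; everything else is assumed.
import grpc
import backend_pb2
import backend_pb2_grpc

stub = backend_pb2_grpc.BackendStub(grpc.insecure_channel("localhost:50051"))

result = stub.StartFineTune(backend_pb2.FineTuneRequest(
    model="LiquidAI/LFM2.5-Audio-1.5B",
    dataset_source="/data/lfm2-audio-train",   # preprocessed dataset directory
    learning_rate=3e-5,
    max_steps=1000,
    batch_size=16,
    output_dir="/models/finetunes/lfm2-audio-demo",
), timeout=30)
print(result.job_id, result.success, result.message)
# Progress streams back via FineTuneProgress for this job_id (status,
# current_step, loss, ...); StopFineTune flags the job and LocalAI's job
# manager reaps the backend process.
```
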
diff --git a/backend/python/liquid-audio/install.sh b/backend/python/liquid-audio/install.sh
new file mode 100755
index 000000000..c7ed8eaa8
--- /dev/null
+++ b/backend/python/liquid-audio/install.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -e
+
+# liquid-audio requires Python ≥ 3.12 (per its pyproject.toml); the default
+# portable Python in libbackend.sh is 3.10. Override before sourcing.
+export PYTHON_VERSION="${PYTHON_VERSION:-3.12}"
+export PYTHON_PATCH="${PYTHON_PATCH:-11}"
+
+backend_dir=$(dirname $0)
+if [ -d $backend_dir/common ]; then
+ source $backend_dir/common/libbackend.sh
+else
+ source $backend_dir/../common/libbackend.sh
+fi
+
+# liquid-audio's torch wheels are large; allow upgrades to satisfy transitive pins
+EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
+installRequirements
diff --git a/backend/python/liquid-audio/protogen.sh b/backend/python/liquid-audio/protogen.sh
new file mode 100755
index 000000000..df3325c6f
--- /dev/null
+++ b/backend/python/liquid-audio/protogen.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+
+backend_dir=$(dirname $0)
+if [ -d $backend_dir/common ]; then
+ source $backend_dir/common/libbackend.sh
+else
+ source $backend_dir/../common/libbackend.sh
+fi
+
+runProtogen
diff --git a/backend/python/liquid-audio/requirements-cpu.txt b/backend/python/liquid-audio/requirements-cpu.txt
new file mode 100644
index 000000000..c2fee25e3
--- /dev/null
+++ b/backend/python/liquid-audio/requirements-cpu.txt
@@ -0,0 +1,13 @@
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch>=2.8.0
+torchaudio>=2.8.0
+torchcodec>=0.9.1
+transformers>=4.55.4
+accelerate>=1.10.1
+datasets>=4.8.4
+einops>=0.8.1
+librosa>=0.11.0
+soundfile>=0.12.1
+sentencepiece>=0.2.1
+huggingface-hub>=1.3.0
+liquid-audio>=1.2.0
diff --git a/backend/python/liquid-audio/requirements-cublas12.txt b/backend/python/liquid-audio/requirements-cublas12.txt
new file mode 100644
index 000000000..8f1a965bb
--- /dev/null
+++ b/backend/python/liquid-audio/requirements-cublas12.txt
@@ -0,0 +1,13 @@
+--extra-index-url https://download.pytorch.org/whl/cu121
+torch>=2.8.0
+torchaudio>=2.8.0
+torchcodec>=0.9.1
+transformers>=4.55.4
+accelerate>=1.10.1
+datasets>=4.8.4
+einops>=0.8.1
+librosa>=0.11.0
+soundfile>=0.12.1
+sentencepiece>=0.2.1
+huggingface-hub>=1.3.0
+liquid-audio>=1.2.0
diff --git a/backend/python/liquid-audio/requirements-cublas13.txt b/backend/python/liquid-audio/requirements-cublas13.txt
new file mode 100644
index 000000000..4ab768158
--- /dev/null
+++ b/backend/python/liquid-audio/requirements-cublas13.txt
@@ -0,0 +1,13 @@
+--extra-index-url https://download.pytorch.org/whl/cu130
+torch>=2.8.0
+torchaudio>=2.8.0
+torchcodec>=0.9.1
+transformers>=4.55.4
+accelerate>=1.10.1
+datasets>=4.8.4
+einops>=0.8.1
+librosa>=0.11.0
+soundfile>=0.12.1
+sentencepiece>=0.2.1
+huggingface-hub>=1.3.0
+liquid-audio>=1.2.0
diff --git a/backend/python/liquid-audio/requirements-hipblas.txt b/backend/python/liquid-audio/requirements-hipblas.txt
new file mode 100644
index 000000000..304f934b1
--- /dev/null
+++ b/backend/python/liquid-audio/requirements-hipblas.txt
@@ -0,0 +1,13 @@
+--extra-index-url https://download.pytorch.org/whl/rocm7.0
+torch>=2.8.0
+torchaudio>=2.8.0
+torchcodec>=0.9.1
+transformers>=4.55.4
+accelerate>=1.10.1
+datasets>=4.8.4
+einops>=0.8.1
+librosa>=0.11.0
+soundfile>=0.12.1
+sentencepiece>=0.2.1
+huggingface-hub>=1.3.0
+liquid-audio>=1.2.0
diff --git a/backend/python/liquid-audio/requirements-l4t13.txt b/backend/python/liquid-audio/requirements-l4t13.txt
new file mode 100644
index 000000000..7ecc7203d
--- /dev/null
+++ b/backend/python/liquid-audio/requirements-l4t13.txt
@@ -0,0 +1,13 @@
+--extra-index-url https://pypi.jetson-ai-lab.io/jp7/cu130
+torch>=2.8.0
+torchaudio>=2.8.0
+torchcodec>=0.9.1
+transformers>=4.55.4
+accelerate>=1.10.1
+datasets>=4.8.4
+einops>=0.8.1
+librosa>=0.11.0
+soundfile>=0.12.1
+sentencepiece>=0.2.1
+huggingface-hub>=1.3.0
+liquid-audio>=1.2.0
diff --git a/backend/python/liquid-audio/requirements-mps.txt b/backend/python/liquid-audio/requirements-mps.txt
new file mode 100644
index 000000000..f57687f29
--- /dev/null
+++ b/backend/python/liquid-audio/requirements-mps.txt
@@ -0,0 +1,12 @@
+torch>=2.8.0
+torchaudio>=2.8.0
+torchcodec>=0.9.1
+transformers>=4.55.4
+accelerate>=1.10.1
+datasets>=4.8.4
+einops>=0.8.1
+librosa>=0.11.0
+soundfile>=0.12.1
+sentencepiece>=0.2.1
+huggingface-hub>=1.3.0
+liquid-audio>=1.2.0
diff --git a/backend/python/liquid-audio/requirements.txt b/backend/python/liquid-audio/requirements.txt
new file mode 100644
index 000000000..0834a8fcd
--- /dev/null
+++ b/backend/python/liquid-audio/requirements.txt
@@ -0,0 +1,3 @@
+grpcio==1.78.1
+protobuf
+certifi
diff --git a/backend/python/liquid-audio/run.sh b/backend/python/liquid-audio/run.sh
new file mode 100755
index 000000000..bd17c6e1d
--- /dev/null
+++ b/backend/python/liquid-audio/run.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+backend_dir=$(dirname $0)
+if [ -d $backend_dir/common ]; then
+ source $backend_dir/common/libbackend.sh
+else
+ source $backend_dir/../common/libbackend.sh
+fi
+
+startBackend $@
diff --git a/backend/python/liquid-audio/test.py b/backend/python/liquid-audio/test.py
new file mode 100644
index 000000000..450ce3156
--- /dev/null
+++ b/backend/python/liquid-audio/test.py
@@ -0,0 +1,89 @@
+"""Smoke tests for the liquid-audio backend.
+
+These run without contacting HuggingFace or loading model weights:
+they verify that the gRPC service starts, Health() responds, and LoadModel
+accepts fine-tune mode without pulling weights.
+
+To run an end-to-end inference test, set LIQUID_AUDIO_MODEL_ID
+(e.g. "LiquidAI/LFM2.5-Audio-1.5B") in the environment — see test_inference().
+"""
+import os
+import subprocess
+import sys
+import time
+import unittest
+
+import grpc
+
+# Ensure generated protobuf stubs are importable
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import backend_pb2
+import backend_pb2_grpc
+
+
+class TestBackend(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ addr = os.environ.get("LIQUID_AUDIO_TEST_ADDR", "localhost:50053")
+ cls.addr = addr
+ cls.server = subprocess.Popen(
+ [sys.executable, os.path.join(os.path.dirname(__file__), "backend.py"), "--addr", addr],
+ )
+ time.sleep(2) # Give the server a moment to bind
+
+ @classmethod
+ def tearDownClass(cls):
+ cls.server.terminate()
+ try:
+ cls.server.wait(timeout=5)
+ except subprocess.TimeoutExpired:
+ cls.server.kill()
+
+ def _stub(self):
+ channel = grpc.insecure_channel(self.addr)
+ return backend_pb2_grpc.BackendStub(channel)
+
+ def test_health(self):
+ stub = self._stub()
+ reply = stub.Health(backend_pb2.HealthMessage(), timeout=5)
+ self.assertEqual(reply.message, b"OK")
+
+ def test_load_finetune_mode_without_weights(self):
+ """Loading in fine-tune mode should succeed without pulling model weights."""
+ stub = self._stub()
+ result = stub.LoadModel(
+ backend_pb2.ModelOptions(
+ Model="LiquidAI/LFM2.5-Audio-1.5B",
+ Options=["mode:finetune"],
+ ),
+ timeout=10,
+ )
+ self.assertTrue(result.success, msg=result.message)
+
+ @unittest.skipUnless(os.environ.get("LIQUID_AUDIO_MODEL_ID"),
+ "Set LIQUID_AUDIO_MODEL_ID to run an end-to-end inference smoke test")
+ def test_inference(self):
+ """End-to-end: load a real LFM2-Audio model and run one short prediction."""
+ stub = self._stub()
+ model_id = os.environ["LIQUID_AUDIO_MODEL_ID"]
+ result = stub.LoadModel(
+ backend_pb2.ModelOptions(
+ Model=model_id,
+ Options=["mode:chat"],
+ ),
+ timeout=600,
+ )
+ self.assertTrue(result.success, msg=result.message)
+ reply = stub.Predict(
+ backend_pb2.PredictOptions(
+ Prompt="Hello!",
+ Tokens=8,
+ Temperature=0.0,
+ ),
+ timeout=120,
+ )
+ self.assertGreater(len(reply.message), 0)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/backend/python/liquid-audio/test.sh b/backend/python/liquid-audio/test.sh
new file mode 100755
index 000000000..eb59f2aaf
--- /dev/null
+++ b/backend/python/liquid-audio/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+
+backend_dir=$(dirname $0)
+if [ -d $backend_dir/common ]; then
+ source $backend_dir/common/libbackend.sh
+else
+ source $backend_dir/../common/libbackend.sh
+fi
+
+runUnittests
diff --git a/core/config/backend_capabilities.go b/core/config/backend_capabilities.go
index e9c1920f0..4d66dc107 100644
--- a/core/config/backend_capabilities.go
+++ b/core/config/backend_capabilities.go
@@ -24,6 +24,7 @@ const (
UsecaseVAD = "vad"
UsecaseAudioTransform = "audio_transform"
UsecaseDiarization = "diarization"
+ UsecaseRealtimeAudio = "realtime_audio"
)
// GRPCMethod identifies a Backend service RPC from backend.proto.
@@ -45,6 +46,7 @@ const (
MethodVAD GRPCMethod = "VAD"
MethodAudioTransform GRPCMethod = "AudioTransform"
MethodDiarize GRPCMethod = "Diarize"
+ MethodAudioToAudioStream GRPCMethod = "AudioToAudioStream"
)
// UsecaseInfo describes a single known_usecase value and how it maps
@@ -147,6 +149,11 @@ var UsecaseInfoMap = map[string]UsecaseInfo{
GRPCMethod: MethodDiarize,
Description: "Speaker diarization (who-spoke-when, per-speaker segments) via the Diarize RPC.",
},
+ UsecaseRealtimeAudio: {
+ Flag: FLAG_REALTIME_AUDIO,
+ GRPCMethod: MethodAudioToAudioStream,
+ Description: "Self-contained any-to-any audio model for the Realtime API — accepts microphone audio and emits speech + transcript (+ optional function calls) from a single backend via the AudioToAudioStream RPC.",
+ },
}
// BackendCapability describes which gRPC methods and usecases a backend supports.
@@ -397,6 +404,15 @@ var BackendCapabilities = map[string]BackendCapability{
Description: "Meta MusicGen via transformers — music generation from text",
},
+ // --- Any-to-any audio backends ---
+ "liquid-audio": {
+ GRPCMethods: []GRPCMethod{MethodPredict, MethodPredictStream, MethodAudioTranscription, MethodTTS, MethodAudioToAudioStream, MethodVAD},
+ PossibleUsecases: []string{UsecaseChat, UsecaseCompletion, UsecaseTranscript, UsecaseTTS, UsecaseRealtimeAudio, UsecaseVAD},
+ DefaultUsecases: []string{UsecaseRealtimeAudio, UsecaseChat, UsecaseTranscript, UsecaseTTS, UsecaseVAD},
+ AcceptsAudios: true,
+ Description: "LFM2 / LFM2.5-Audio — self-contained any-to-any audio model for the Realtime API; also exposes chat, transcription, TTS and a stub energy-based VAD endpoint",
+ },
+
// --- Audio transform backends ---
"localvqe": {
GRPCMethods: []GRPCMethod{MethodAudioTransform},
diff --git a/core/config/model_config.go b/core/config/model_config.go
index 5a7c74c41..f14bc4a4e 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -636,6 +636,7 @@ const (
FLAG_SPEAKER_RECOGNITION ModelConfigUsecase = 0b1000000000000000
FLAG_AUDIO_TRANSFORM ModelConfigUsecase = 0b10000000000000000
FLAG_DIARIZATION ModelConfigUsecase = 0b100000000000000000
+ FLAG_REALTIME_AUDIO ModelConfigUsecase = 0b1000000000000000000
// Common Subsets
FLAG_LLM ModelConfigUsecase = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
@@ -645,12 +646,12 @@ const (
// Flags within the same group are NOT orthogonal (e.g., chat and completion are
// both text/language). A model is multimodal when its usecases span 2+ groups.
var ModalityGroups = []ModelConfigUsecase{
- FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT, // text/language
- FLAG_VISION | FLAG_DETECTION, // visual understanding
- FLAG_TRANSCRIPT, // speech input
- FLAG_TTS | FLAG_SOUND_GENERATION, // audio output
- FLAG_AUDIO_TRANSFORM, // audio in/out transforms
- FLAG_IMAGE | FLAG_VIDEO, // visual generation
+ FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT, // text/language
+ FLAG_VISION | FLAG_DETECTION, // visual understanding
+ FLAG_TRANSCRIPT | FLAG_REALTIME_AUDIO, // speech input — realtime_audio is any-to-any, so it counts here too
+ FLAG_TTS | FLAG_SOUND_GENERATION | FLAG_REALTIME_AUDIO, // audio output — and here, so a lone realtime_audio flag still reads as multimodal
+ FLAG_AUDIO_TRANSFORM, // audio in/out transforms
+ FLAG_IMAGE | FLAG_VIDEO, // visual generation
}
// IsMultimodal returns true if the given usecases span two or more orthogonal
@@ -692,6 +693,7 @@ func GetAllModelConfigUsecases() map[string]ModelConfigUsecase {
"FLAG_SPEAKER_RECOGNITION": FLAG_SPEAKER_RECOGNITION,
"FLAG_AUDIO_TRANSFORM": FLAG_AUDIO_TRANSFORM,
"FLAG_DIARIZATION": FLAG_DIARIZATION,
+ "FLAG_REALTIME_AUDIO": FLAG_REALTIME_AUDIO,
}
}
@@ -866,6 +868,16 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool {
}
}
+ if (u & FLAG_REALTIME_AUDIO) == FLAG_REALTIME_AUDIO {
+ // Backends that own a single any-to-any loop and implement
+ // AudioToAudioStream — listed here so models without an explicit
+ // known_usecases still surface on the Talk page.
+ realtimeAudioBackends := []string{"liquid-audio"}
+ if !slices.Contains(realtimeAudioBackends, c.Backend) {
+ return false
+ }
+ }
+
return true
}
diff --git a/core/gallery/importers/importers.go b/core/gallery/importers/importers.go
index 02606f099..88a4314a7 100644
--- a/core/gallery/importers/importers.go
+++ b/core/gallery/importers/importers.go
@@ -130,6 +130,8 @@ var defaultImporters = []Importer{
// and would otherwise swallow the C++ port's GGUF bundles.
&VibeVoiceCppImporter{},
&VibeVoiceImporter{},
+ // LiquidAudio (Python) — keep before LlamaCPP so non-GGUF LFM2-Audio repos route here.
+ &LiquidAudioImporter{},
&CoquiImporter{},
// Image/Video (Batch 3)
&StableDiffusionGGMLImporter{},
diff --git a/core/gallery/importers/liquid-audio.go b/core/gallery/importers/liquid-audio.go
new file mode 100644
index 000000000..dbea16020
--- /dev/null
+++ b/core/gallery/importers/liquid-audio.go
@@ -0,0 +1,145 @@
+package importers
+
+import (
+ "encoding/json"
+ "path/filepath"
+ "strings"
+
+ "github.com/mudler/LocalAI/core/config"
+ "github.com/mudler/LocalAI/core/gallery"
+ "github.com/mudler/LocalAI/core/schema"
+ "go.yaml.in/yaml/v2"
+)
+
+var _ Importer = &LiquidAudioImporter{}
+
+// LiquidAudioImporter recognises LiquidAI's LFM2-Audio family (LFM2-Audio-1.5B,
+// LFM2.5-Audio-1.5B, community finetunes) and routes them to the Python
+// `liquid-audio` backend. Detection is by repo-name substring so third-party
+// mirrors still match. preferences.backend="liquid-audio" overrides detection.
+//
+// Once upstream llama.cpp PR #18641 lands and the GGUF gallery entries are
+// added, GGUF mirrors of these models should route to llama-cpp; that's
+// handled by ordering LlamaCPPImporter after this one and by the explicit
+// "-gguf" exclusion below.
+type LiquidAudioImporter struct{}
+
+func (i *LiquidAudioImporter) Name() string { return "liquid-audio" }
+func (i *LiquidAudioImporter) Modality() string { return "tts" }
+func (i *LiquidAudioImporter) AutoDetects() bool { return true }
+
+func (i *LiquidAudioImporter) Match(details Details) bool {
+ preferences, err := details.Preferences.MarshalJSON()
+ if err != nil {
+ return false
+ }
+ preferencesMap := make(map[string]any)
+ if len(preferences) > 0 {
+ if err := json.Unmarshal(preferences, &preferencesMap); err != nil {
+ return false
+ }
+ }
+
+ if b, ok := preferencesMap["backend"].(string); ok && b == "liquid-audio" {
+ return true
+ }
+
+ matchRepo := func(repo string) bool {
+ r := strings.ToLower(repo)
+ // Cede GGUF mirrors to the (later-ordered) llama-cpp importer.
+ if strings.HasSuffix(r, "-gguf") {
+ return false
+ }
+ return strings.Contains(r, "lfm2-audio") || strings.Contains(r, "lfm2.5-audio")
+ }
+
+ if details.HuggingFace != nil {
+ repoName := details.HuggingFace.ModelID
+ if idx := strings.Index(repoName, "/"); idx >= 0 {
+ repoName = repoName[idx+1:]
+ }
+ if matchRepo(repoName) {
+ return true
+ }
+ }
+
+ if _, repo, ok := HFOwnerRepoFromURI(details.URI); ok {
+ return matchRepo(repo)
+ }
+ return false
+}
+
+func (i *LiquidAudioImporter) Import(details Details) (gallery.ModelConfig, error) {
+ preferences, err := details.Preferences.MarshalJSON()
+ if err != nil {
+ return gallery.ModelConfig{}, err
+ }
+ preferencesMap := make(map[string]any)
+ if len(preferences) > 0 {
+ if err := json.Unmarshal(preferences, &preferencesMap); err != nil {
+ return gallery.ModelConfig{}, err
+ }
+ }
+
+ name, ok := preferencesMap["name"].(string)
+ if !ok {
+ name = filepath.Base(details.URI)
+ }
+
+ description, ok := preferencesMap["description"].(string)
+ if !ok {
+ description = "Imported from " + details.URI
+ }
+
+ model := details.URI
+ if details.HuggingFace != nil && details.HuggingFace.ModelID != "" {
+ model = details.HuggingFace.ModelID
+ }
+
+ // Preferences may pin the mode (chat / asr / tts / s2s / finetune).
+ // Default to s2s — the headline any-to-any use case.
+ mode, _ := preferencesMap["mode"].(string)
+ if mode == "" {
+ mode = "s2s"
+ }
+
+ options := []string{"mode:" + mode}
+ if voice, ok := preferencesMap["voice"].(string); ok && voice != "" {
+ options = append(options, "voice:"+voice)
+ }
+
+ usecases := []string{"chat"}
+ switch mode {
+ case "asr":
+ usecases = []string{"transcript"}
+ case "tts":
+ usecases = []string{"tts"}
+ case "s2s":
+ // realtime_audio surfaces the model on the Talk page; chat/tts/
+ // transcript/vad keep the standalone OpenAI-compatible endpoints
+ // working since liquid-audio implements all of them.
+ usecases = []string{"realtime_audio", "chat", "tts", "transcript", "vad"}
+ }
+
+ modelConfig := config.ModelConfig{
+ Name: name,
+ Description: description,
+ Backend: "liquid-audio",
+ KnownUsecaseStrings: usecases,
+ Options: options,
+ PredictionOptions: schema.PredictionOptions{
+ BasicModelRequest: schema.BasicModelRequest{Model: model},
+ },
+ }
+
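+ // With https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B and no preferences
+ // the marshalled config looks roughly like this (key names follow the YAML
+ // tags on config.ModelConfig):
+ //
+ //   name: LFM2.5-Audio-1.5B
+ //   backend: liquid-audio
+ //   known_usecases: [realtime_audio, chat, tts, transcript, vad]
+ //   options: ["mode:s2s"]
+ //   model: LiquidAI/LFM2.5-Audio-1.5B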
+ data, err := yaml.Marshal(modelConfig)
+ if err != nil {
+ return gallery.ModelConfig{}, err
+ }
+
+ return gallery.ModelConfig{
+ Name: name,
+ Description: description,
+ ConfigFile: string(data),
+ }, nil
+}
diff --git a/core/gallery/importers/liquid-audio_test.go b/core/gallery/importers/liquid-audio_test.go
new file mode 100644
index 000000000..f7dc5e046
--- /dev/null
+++ b/core/gallery/importers/liquid-audio_test.go
@@ -0,0 +1,91 @@
+package importers_test
+
+import (
+ "encoding/json"
+ "fmt"
+
+ "github.com/mudler/LocalAI/core/gallery/importers"
+ . "github.com/onsi/ginkgo/v2"
+ . "github.com/onsi/gomega"
+)
+
+var _ = Describe("LiquidAudioImporter", func() {
+ Context("detection from HuggingFace", func() {
+ It("matches LiquidAI/LFM2.5-Audio-1.5B", func() {
+ uri := "https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B"
+ preferences := json.RawMessage(`{}`)
+
+ modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
+
+ Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+ Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: liquid-audio"))
+ Expect(modelConfig.ConfigFile).To(ContainSubstring("LiquidAI/LFM2.5-Audio-1.5B"))
+ })
+
+ It("matches LiquidAI/LFM2-Audio-1.5B (older variant)", func() {
+ uri := "https://huggingface.co/LiquidAI/LFM2-Audio-1.5B"
+ preferences := json.RawMessage(`{}`)
+
+ modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
+
+ Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+ Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: liquid-audio"))
+ })
+
+ It("cedes -GGUF mirrors to the llama-cpp importer", func() {
+ // LiquidAI/LFM2.5-Audio-1.5B-GGUF should NOT route to liquid-audio.
+ // Once upstream PR #18641 lands and the GGUF gallery entry exists,
+ // this is the path that lets users opt into the C++ runtime.
+ uri := "https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B-GGUF"
+ preferences := json.RawMessage(`{}`)
+
+ modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
+
+ Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+ Expect(modelConfig.ConfigFile).ToNot(ContainSubstring("backend: liquid-audio"),
+ fmt.Sprintf("GGUF repo should not match Python importer; got: %s", modelConfig.ConfigFile))
+ })
+ })
+
+ Context("preference override", func() {
+ It("honours preferences.backend=liquid-audio for arbitrary URIs", func() {
+ uri := "https://example.com/some-unrelated-model"
+ preferences := json.RawMessage(`{"backend": "liquid-audio"}`)
+
+ modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
+
+ Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+ Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: liquid-audio"))
+ })
+
+ It("picks up the mode preference", func() {
+ uri := "https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B"
+ preferences := json.RawMessage(`{"mode": "asr"}`)
+
+ modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
+
+ Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+ Expect(modelConfig.ConfigFile).To(ContainSubstring("mode:asr"))
+ Expect(modelConfig.ConfigFile).To(ContainSubstring("transcript"))
+ })
+
+ It("picks up the voice preference", func() {
+ uri := "https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B"
+ preferences := json.RawMessage(`{"mode": "tts", "voice": "uk_male"}`)
+
+ modelConfig, err := importers.DiscoverModelConfig(uri, preferences)
+
+ Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Error: %v", err))
+ Expect(modelConfig.ConfigFile).To(ContainSubstring("voice:uk_male"))
+ })
+ })
+
+ Context("Importer interface metadata", func() {
+ It("exposes name/modality/autodetect", func() {
+ imp := &importers.LiquidAudioImporter{}
+ Expect(imp.Name()).To(Equal("liquid-audio"))
+ Expect(imp.Modality()).To(Equal("tts"))
+ Expect(imp.AutoDetects()).To(BeTrue())
+ })
+ })
+})
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index f02aa7fe0..b7a0be6ac 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -8,6 +8,7 @@ import (
"fmt"
"math"
"os"
+ "strconv"
"sync"
"time"
@@ -20,6 +21,8 @@ import (
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
+ "github.com/mudler/LocalAI/core/http/auth"
+ mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/core/templates"
@@ -79,6 +82,26 @@ type Session struct {
InputSampleRate int
OutputSampleRate int
MaxOutputTokens types.IntOrInf
+ // MaxHistoryItems caps the number of MessageItems passed to the LLM each
+ // turn (0 = unlimited). Small models — especially the LFM2.5-Audio 1.5B
+ // served via the liquid-audio backend — degrade quickly past a handful
+ // of turns. Counted from the tail; FunctionCall + FunctionCallOutput
+ // pairs are kept together so we never feed an orphaned tool result.
+ MaxHistoryItems int
+
+ // AssistantExecutor is non-nil when the session opted into the in-process
+ // LocalAI Assistant tool surface. Tool calls whose name matches this
+ // executor's catalog are run inproc and their output is fed back to the
+ // model server-side; the client never sees a function_call_arguments
+ // event for those. Mirrors the chat handler's metadata.localai_assistant
+ // path.
+ AssistantExecutor mcpTools.ToolExecutor
+
+ // AssistantTools is the cached ToolUnion slice we injected at session
+ // creation. Re-applied after every client session.update so a
+ // client-driven tool refresh (e.g. toggling a client MCP server) doesn't
+ // silently strip Manage Mode's tools.
+ AssistantTools []types.ToolUnion
// Response cancellation: protects activeResponseCancel/activeResponseDone
responseMu sync.Mutex
@@ -205,6 +228,19 @@ func RealtimeTranscriptionSession(application *application.Application) echo.Han
}
}
+// RealtimeSessionOptions bundles per-session knobs decoded from the WS query
+// string (or the WebRTC handshake body). Mirrors what chat.go pulls off
+// `metadata.localai_assistant` — admin-only opt-in to the in-process
+// management tool surface.
+type RealtimeSessionOptions struct {
+ LocalAIAssistant bool
+ // AuthEnabled mirrors chat.go's requireAssistantAccess gate. We resolve
+ // admin role at handshake time (where the echo.Context has the auth
+ // cookie/Bearer) and drop the result here so runRealtimeSession can
+ // decide without holding onto the request.
+ IsAdmin bool
+}
+
func Realtime(application *application.Application) echo.HandlerFunc {
return func(c echo.Context) error {
ws, err := upgrader.Upgrade(c.Response(), c.Request(), nil)
@@ -218,25 +254,105 @@ func Realtime(application *application.Application) echo.HandlerFunc {
// Extract query parameters from Echo context before passing to websocket handler
model := c.QueryParam("model")
+ assistantFlag, _ := strconv.ParseBool(c.QueryParam("localai_assistant"))
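+ // e.g. a client connects with ?model=my-realtime-model&localai_assistant=true
+ // (model name illustrative); non-admin callers setting the flag get a
+ // forbidden error from runRealtimeSession.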
+ opts := RealtimeSessionOptions{
+ LocalAIAssistant: assistantFlag,
+ IsAdmin: isCurrentUserAdmin(c, application),
+ }
- registerRealtime(application, model)(ws)
+ registerRealtime(application, model, opts)(ws)
return nil
}
}
-func registerRealtime(application *application.Application, model string) func(c *websocket.Conn) {
+// isCurrentUserAdmin replicates the chat-side admin check at the realtime
+// handshake. When auth is disabled, every caller is treated as admin (same
+// as chat's requireAssistantAccess).
+func isCurrentUserAdmin(c echo.Context, application *application.Application) bool {
+ if application == nil || application.ApplicationConfig() == nil || !application.ApplicationConfig().Auth.Enabled {
+ return true
+ }
+ user := auth.GetUser(c)
+ return user != nil && user.Role == auth.RoleAdmin
+}
+
+func registerRealtime(application *application.Application, model string, opts RealtimeSessionOptions) func(c *websocket.Conn) {
return func(conn *websocket.Conn) {
t := NewWebSocketTransport(conn)
evaluator := application.TemplatesEvaluator()
xlog.Debug("Realtime WebSocket connection established", "address", conn.RemoteAddr().String(), "model", model)
- runRealtimeSession(application, t, model, evaluator)
+ runRealtimeSession(application, t, model, evaluator, opts)
}
}
+// defaultMaxHistoryItems picks a sensible default cap for the session.
+// Small any-to-any audio models degrade quickly past a handful of turns;
+// legacy pipelines composing larger LLMs keep the historical "unlimited"
+// default and rely on the LLM's own context window.
+func defaultMaxHistoryItems(cfg *config.ModelConfig) int {
+ if cfg != nil && cfg.HasUsecases(config.FLAG_REALTIME_AUDIO) {
+ return 6
+ }
+ return 0
+}
+
+// trimRealtimeItems returns the tail of items capped at maxItems (0 = no cap).
+// Walks backwards keeping function_call + function_call_output pairs together
+// so we never feed the LLM an orphaned tool result that references a call it
+// can't see.
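+//
+// Example: [user, function_call, function_call_output, assistant] with
+// maxItems=2 would naively cut at the orphaned output, so the cut rolls back
+// and [function_call, function_call_output, assistant] is returned.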
+func trimRealtimeItems(items []*types.MessageItemUnion, maxItems int) []*types.MessageItemUnion {
+ if maxItems <= 0 || len(items) <= maxItems {
+ return items
+ }
+ // Find the cut point starting from len-maxItems and pull it left until
+ // we're not in the middle of a tool-call pair.
+ cut := len(items) - maxItems
+ for cut > 0 && items[cut] != nil && items[cut].FunctionCallOutput != nil {
+ cut--
+ }
+ return items[cut:]
+}
+
+// prepareRealtimeConfig validates a model config for use in a realtime session
+// and fills in pipeline slots for self-contained any-to-any models. It returns
+// an error code + message pair suitable for sendError; the bool indicates
+// whether the caller should proceed. Extracted from runRealtimeSession so the
+// gate logic can be exercised in unit tests without a full Application.
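+//
+// A self-contained model only needs the realtime_audio usecase in its config,
+// roughly (name illustrative, keys follow config.ModelConfig's YAML tags):
+//
+//   name: lfm-audio
+//   backend: liquid-audio
+//   known_usecases: [realtime_audio]
+//
+// and every empty pipeline slot is then filled with "lfm-audio" below.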
+func prepareRealtimeConfig(cfg *config.ModelConfig) (errCode, errMsg string, ok bool) {
+ if cfg == nil {
+ return "invalid_model", "Model is not a pipeline model", false
+ }
+
+ // Self-contained any-to-any models (e.g. liquid-audio) own the whole
+ // loop in one engine — surface them by populating empty pipeline slots
+ // with the model's own name so newModel can resolve a config for each
+ // role. The user can still pin individual slots (e.g. Pipeline.VAD =
+ // silero-vad) and those win.
+ if cfg.HasUsecases(config.FLAG_REALTIME_AUDIO) {
+ if cfg.Pipeline.VAD == "" {
+ cfg.Pipeline.VAD = cfg.Name
+ }
+ if cfg.Pipeline.Transcription == "" {
+ cfg.Pipeline.Transcription = cfg.Name
+ }
+ if cfg.Pipeline.LLM == "" {
+ cfg.Pipeline.LLM = cfg.Name
+ }
+ if cfg.Pipeline.TTS == "" {
+ cfg.Pipeline.TTS = cfg.Name
+ }
+ return "", "", true
+ }
+
+ if cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "" {
+ return "invalid_model", "Model is not a pipeline model", false
+ }
+ return "", "", true
+}
+
// runRealtimeSession runs the main event loop for a realtime session.
// It is transport-agnostic and works with both WebSocket and WebRTC.
-func runRealtimeSession(application *application.Application, t Transport, model string, evaluator *templates.Evaluator) {
- // TODO: Allow any-to-any model to be specified
+func runRealtimeSession(application *application.Application, t Transport, model string, evaluator *templates.Evaluator, opts RealtimeSessionOptions) {
cl := application.ModelConfigLoader()
cfg, err := cl.LoadModelConfigFileByNameDefaultOptions(model, application.ApplicationConfig())
if err != nil {
@@ -245,22 +361,79 @@ func runRealtimeSession(application *application.Application, t Transport, model
return
}
- if cfg == nil || (cfg.Pipeline.VAD == "" && cfg.Pipeline.Transcription == "" && cfg.Pipeline.TTS == "" && cfg.Pipeline.LLM == "") {
+ if code, msg, ok := prepareRealtimeConfig(cfg); !ok {
xlog.Error("model is not a pipeline", "model", model)
- sendError(t, "invalid_model", "Model is not a pipeline model", "", "")
+ sendError(t, code, msg, "", "")
return
}
+ // LocalAI Assistant opt-in: gate on admin (same rule as chat.go's
+ // requireAssistantAccess) and grab the process-wide holder's executor.
+ // We collect tools + system prompt here and merge them into the session
+ // below so they're live from the first response.create.
+ var assistantTools []types.ToolUnion
+ var assistantSystemPrompt string
+ var assistantExecutor mcpTools.ToolExecutor
+ if opts.LocalAIAssistant {
+ if !opts.IsAdmin {
+ sendError(t, "forbidden", "localai_assistant requires admin", "", "")
+ return
+ }
+ appCfg := application.ApplicationConfig()
+ if appCfg != nil && appCfg.DisableLocalAIAssistant {
+ sendError(t, "unavailable", "LocalAI Assistant is disabled on this server", "", "")
+ return
+ }
+ holder := application.LocalAIAssistant()
+ if holder == nil || !holder.HasTools() {
+ sendError(t, "unavailable", "LocalAI Assistant is not available on this server", "", "")
+ return
+ }
+ exec := holder.Executor()
+ fns, discErr := exec.DiscoverTools(context.Background())
+ if discErr != nil {
+ xlog.Error("realtime: failed to discover LocalAI Assistant tools", "error", discErr)
+ sendError(t, "tool_discovery_failed", "failed to discover assistant tools: "+discErr.Error(), "", "")
+ return
+ }
+ assistantExecutor = exec
+ assistantSystemPrompt = holder.SystemPrompt()
+ assistantTools = make([]types.ToolUnion, 0, len(fns))
+ for _, fn := range fns {
+ fnCopy := fn
+ assistantTools = append(assistantTools, types.ToolUnion{
+ Function: &types.ToolFunction{
+ Name: fnCopy.Name,
+ Description: fnCopy.Description,
+ Parameters: fnCopy.Parameters,
+ },
+ })
+ }
+ xlog.Debug("realtime: LocalAI Assistant tools injected", "count", len(fns))
+ }
+
sttModel := cfg.Pipeline.Transcription
+ // Compose the system prompt: prepend the assistant prompt when we have
+ // one (it teaches the model the safety rules and tool recipes), then the
+ // session's default voice instructions. Order matches chat.go's
+ // hasSystemMessage check — assistant prompt comes first.
+ instructions := defaultInstructions
+ if assistantSystemPrompt != "" {
+ instructions = assistantSystemPrompt + "\n\n" + defaultInstructions
+ }
+
sessionID := generateSessionID()
session := &Session{
ID: sessionID,
TranscriptionOnly: false,
Model: model,
Voice: cfg.TTSConfig.Voice,
- Instructions: defaultInstructions,
+ Instructions: instructions,
ModelConfig: cfg,
+ Tools: assistantTools,
+ AssistantTools: assistantTools,
+ AssistantExecutor: assistantExecutor,
TurnDetection: &types.TurnDetectionUnion{
ServerVad: &types.ServerVad{
Threshold: 0.5,
@@ -275,6 +448,7 @@ func runRealtimeSession(application *application.Application, t Transport, model
Conversations: make(map[string]*Conversation),
InputSampleRate: defaultRemoteSampleRate,
OutputSampleRate: defaultRemoteSampleRate,
+ MaxHistoryItems: defaultMaxHistoryItems(cfg),
}
// Create a default conversation
@@ -810,7 +984,28 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
}
if rt.Tools != nil {
- session.Tools = rt.Tools
+ // Manage Mode tools survive a client-driven session.update — the
+ // alternative is silently dropping them whenever the user toggles
+ // a client MCP server, which would break the modality mid-session.
+ // Names from rt.Tools win on collision (the client is explicit;
+ // we keep the client's definition rather than overriding it).
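+ // e.g. client tools [search_docs] merged with assistant tools
+ // [list_models, search_docs] yield [search_docs (client's copy),
+ // list_models]; tool names here are purely illustrative.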
+ merged := append([]types.ToolUnion(nil), rt.Tools...)
+ seen := make(map[string]struct{}, len(merged))
+ for _, t := range merged {
+ if t.Function != nil {
+ seen[t.Function.Name] = struct{}{}
+ }
+ }
+ for _, t := range session.AssistantTools {
+ if t.Function == nil {
+ continue
+ }
+ if _, ok := seen[t.Function.Name]; ok {
+ continue
+ }
+ merged = append(merged, t)
+ }
+ session.Tools = merged
}
if rt.ToolChoice != nil {
session.ToolChoice = rt.ToolChoice
@@ -1104,7 +1299,17 @@ func generateResponse(ctx context.Context, session *Session, utt []byte, transcr
triggerResponse(ctx, session, conv, t, nil)
}
+// maxAssistantToolTurns caps the server-side agentic loop. Mirrors the
+// chat-page maxToolTurns:10 from useChat.js — the model gets up to this
+// many consecutive tool round-trips before we return control to the user
+// without another response cycle.
+const maxAssistantToolTurns = 10
+
func triggerResponse(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams) {
+ triggerResponseAtTurn(ctx, session, conv, t, overrides, 0)
+}
+
+func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversation, t Transport, overrides *types.ResponseCreateParams, toolTurn int) {
config := session.ModelInterface.PredictConfig()
// Default values
@@ -1155,7 +1360,8 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation,
imgIndex := 0
conv.Lock.Lock()
- for _, item := range conv.Items {
+ items := trimRealtimeItems(conv.Items, session.MaxHistoryItems)
+ for _, item := range items {
if item.User != nil {
msg := schema.Message{
Role: string(types.MessageRoleUser),
@@ -1575,8 +1781,16 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation,
})
}
- // Handle Tool Calls
+ // Handle Tool Calls. Two paths:
+ // - LocalAI Assistant tools (session.AssistantExecutor.IsTool) run
+ // server-side; we append both the call and its output to conv.Items
+ // and re-trigger a follow-up response so the model can speak the
+ // result. The client only sees observability events.
+ // - All other tools follow the standard OpenAI flow: emit
+ // function_call_arguments.done and wait for the client to send
+ // conversation.item.create back.
xlog.Debug("About to handle tool calls", "finalToolCallsCount", len(finalToolCalls))
+ executedAssistantTool := false
for i, tc := range finalToolCalls {
toolCallID := generateItemID()
callID := "call_" + generateUniqueID() // OpenAI uses call_xyz
@@ -1608,6 +1822,51 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation,
Item: fcItem,
})
+ serverSide := session.AssistantExecutor != nil && session.AssistantExecutor.IsTool(tc.Name)
+ if serverSide {
+ output, execErr := session.AssistantExecutor.ExecuteTool(ctx, tc.Name, tc.Arguments)
+ if execErr != nil {
+ output = "Error: " + execErr.Error()
+ xlog.Error("realtime: assistant tool execution failed", "tool", tc.Name, "error", execErr)
+ }
+ foItem := types.MessageItemUnion{
+ FunctionCallOutput: &types.MessageItemFunctionCallOutput{
+ ID: generateItemID(),
+ CallID: callID,
+ Output: output,
+ Status: types.ItemStatusCompleted,
+ },
+ }
+ conv.Lock.Lock()
+ conv.Items = append(conv.Items, &foItem)
+ conv.Lock.Unlock()
+ // Close the call out and emit the output as its own paired
+ // added/done — the OpenAI spec pairs every item-done with a
+ // preceding item-added, so we re-pair here for the output.
+ // The UI renders the transcript entry on item.done for both
+ // shapes (FunctionCall + FunctionCallOutput).
+ sendEvent(t, types.ResponseOutputItemDoneEvent{
+ ServerEventBase: types.ServerEventBase{},
+ ResponseID: responseID,
+ OutputIndex: outputIndex,
+ Item: fcItem,
+ })
+ sendEvent(t, types.ResponseOutputItemAddedEvent{
+ ServerEventBase: types.ServerEventBase{},
+ ResponseID: responseID,
+ OutputIndex: outputIndex,
+ Item: foItem,
+ })
+ sendEvent(t, types.ResponseOutputItemDoneEvent{
+ ServerEventBase: types.ServerEventBase{},
+ ResponseID: responseID,
+ OutputIndex: outputIndex,
+ Item: foItem,
+ })
+ executedAssistantTool = true
+ continue
+ }
+
sendEvent(t, types.ResponseFunctionCallArgumentsDeltaEvent{
ServerEventBase: types.ServerEventBase{},
ResponseID: responseID,
@@ -1643,6 +1902,19 @@ func triggerResponse(ctx context.Context, session *Session, conv *Conversation,
Status: types.ResponseStatusCompleted,
},
})
+
+ // If we executed any assistant tools inproc, run another response cycle
+ // so the model can speak the result. Mirrors the chat-side agentic loop
+ // but driven server-side rather than by client round-trip. Bounded so a
+ // degenerate "model keeps calling tools" doesn't blow the stack.
+ if executedAssistantTool {
+ if toolTurn+1 >= maxAssistantToolTurns {
+ xlog.Warn("realtime: assistant tool-turn limit reached, stopping the agentic loop",
+ "limit", maxAssistantToolTurns, "model", session.Model)
+ return
+ }
+ triggerResponseAtTurn(ctx, session, conv, t, nil, toolTurn+1)
+ }
}
// Helper functions to generate unique IDs
diff --git a/core/http/endpoints/openai/realtime_gate_test.go b/core/http/endpoints/openai/realtime_gate_test.go
new file mode 100644
index 000000000..e49eb71eb
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_gate_test.go
@@ -0,0 +1,153 @@
+package openai
+
+import (
+ "github.com/mudler/LocalAI/core/config"
+ "github.com/mudler/LocalAI/core/http/endpoints/openai/types"
+ . "github.com/onsi/ginkgo/v2"
+ . "github.com/onsi/gomega"
+)
+
+// withUsecases returns a *ModelConfigUsecase pointing at the OR of the given flags.
+// Helper so each spec keeps its intent obvious.
+func withUsecases(flags ...config.ModelConfigUsecase) *config.ModelConfigUsecase {
+ var u config.ModelConfigUsecase
+ for _, f := range flags {
+ u |= f
+ }
+ return &u
+}
+
+var _ = Describe("prepareRealtimeConfig", func() {
+ It("rejects a nil config", func() {
+ code, msg, ok := prepareRealtimeConfig(nil)
+ Expect(ok).To(BeFalse())
+ Expect(code).To(Equal("invalid_model"))
+ Expect(msg).To(ContainSubstring("not a pipeline model"))
+ })
+
+ It("rejects a model with no pipeline slots and no realtime_audio usecase", func() {
+ cfg := &config.ModelConfig{Name: "plain-chat"}
+ code, msg, ok := prepareRealtimeConfig(cfg)
+ Expect(ok).To(BeFalse())
+ Expect(code).To(Equal("invalid_model"))
+ Expect(msg).To(ContainSubstring("not a pipeline model"))
+ })
+
+ It("accepts a model with a fully populated legacy pipeline", func() {
+ cfg := &config.ModelConfig{
+ Name: "legacy",
+ Pipeline: config.Pipeline{
+ VAD: "silero",
+ Transcription: "whisper",
+ LLM: "llama",
+ TTS: "piper",
+ },
+ }
+ _, _, ok := prepareRealtimeConfig(cfg)
+ Expect(ok).To(BeTrue())
+ Expect(cfg.Pipeline.LLM).To(Equal("llama"), "user-supplied pipeline slot must not be overwritten")
+ })
+
+ It("accepts a self-contained realtime_audio model and self-pipelines empty slots", func() {
+ cfg := &config.ModelConfig{
+ Name: "lfm2.5-audio-realtime",
+ KnownUsecases: withUsecases(config.FLAG_REALTIME_AUDIO),
+ }
+ _, _, ok := prepareRealtimeConfig(cfg)
+ Expect(ok).To(BeTrue())
+ Expect(cfg.Pipeline.VAD).To(Equal("lfm2.5-audio-realtime"))
+ Expect(cfg.Pipeline.Transcription).To(Equal("lfm2.5-audio-realtime"))
+ Expect(cfg.Pipeline.LLM).To(Equal("lfm2.5-audio-realtime"))
+ Expect(cfg.Pipeline.TTS).To(Equal("lfm2.5-audio-realtime"))
+ })
+
+ It("preserves user-pinned pipeline slots on a realtime_audio model", func() {
+ // A user might want a dedicated silero-vad and let the realtime_audio
+ // model own only STT/LLM/TTS.
+ cfg := &config.ModelConfig{
+ Name: "lfm-with-external-vad",
+ KnownUsecases: withUsecases(config.FLAG_REALTIME_AUDIO),
+ Pipeline: config.Pipeline{
+ VAD: "silero-vad",
+ },
+ }
+ _, _, ok := prepareRealtimeConfig(cfg)
+ Expect(ok).To(BeTrue())
+ Expect(cfg.Pipeline.VAD).To(Equal("silero-vad"))
+ Expect(cfg.Pipeline.Transcription).To(Equal("lfm-with-external-vad"))
+ Expect(cfg.Pipeline.LLM).To(Equal("lfm-with-external-vad"))
+ Expect(cfg.Pipeline.TTS).To(Equal("lfm-with-external-vad"))
+ })
+
+ It("accepts a model with at least one legacy pipeline slot set", func() {
+ // Pre-existing behaviour: the gate only rejected when ALL four slots
+ // were empty. Lock that in so the change doesn't tighten the gate.
+ cfg := &config.ModelConfig{
+ Name: "partial",
+ Pipeline: config.Pipeline{
+ LLM: "llama",
+ },
+ }
+ _, _, ok := prepareRealtimeConfig(cfg)
+ Expect(ok).To(BeTrue())
+ })
+})
+
+var _ = Describe("defaultMaxHistoryItems", func() {
+ It("caps realtime_audio sessions at 6", func() {
+ cfg := &config.ModelConfig{KnownUsecases: withUsecases(config.FLAG_REALTIME_AUDIO)}
+ Expect(defaultMaxHistoryItems(cfg)).To(Equal(6))
+ })
+ It("leaves legacy pipelines unlimited", func() {
+ cfg := &config.ModelConfig{Pipeline: config.Pipeline{LLM: "llama"}}
+ Expect(defaultMaxHistoryItems(cfg)).To(Equal(0))
+ })
+ It("tolerates nil", func() {
+ Expect(defaultMaxHistoryItems(nil)).To(Equal(0))
+ })
+})
+
+var _ = Describe("trimRealtimeItems", func() {
+ user := func(id string) *types.MessageItemUnion {
+ return &types.MessageItemUnion{User: &types.MessageItemUser{ID: id}}
+ }
+ assistant := func(id string) *types.MessageItemUnion {
+ return &types.MessageItemUnion{Assistant: &types.MessageItemAssistant{ID: id}}
+ }
+ fnCall := func(id, callID string) *types.MessageItemUnion {
+ return &types.MessageItemUnion{FunctionCall: &types.MessageItemFunctionCall{ID: id, CallID: callID}}
+ }
+ fnOut := func(id, callID string) *types.MessageItemUnion {
+ return &types.MessageItemUnion{FunctionCallOutput: &types.MessageItemFunctionCallOutput{ID: id, CallID: callID}}
+ }
+
+ It("returns the input unchanged when cap is zero", func() {
+ in := []*types.MessageItemUnion{user("u1"), assistant("a1")}
+ Expect(trimRealtimeItems(in, 0)).To(Equal(in))
+ })
+
+ It("returns the input unchanged when under the cap", func() {
+ in := []*types.MessageItemUnion{user("u1"), assistant("a1")}
+ Expect(trimRealtimeItems(in, 4)).To(Equal(in))
+ })
+
+ It("keeps the tail when over the cap", func() {
+ in := []*types.MessageItemUnion{user("u1"), assistant("a1"), user("u2"), assistant("a2"), user("u3")}
+ out := trimRealtimeItems(in, 3)
+ Expect(out).To(HaveLen(3))
+ Expect(out[0].User.ID).To(Equal("u2"))
+ Expect(out[2].User.ID).To(Equal("u3"))
+ })
+
+ It("pulls the cut left to keep a function_call paired with its output", func() {
+ // 0:user 1:fc 2:fc_out 3:assistant — cap=2 would otherwise start at
+ // index 2 (orphan fc_out). Helper must roll back to include 1.
+ in := []*types.MessageItemUnion{user("u1"), fnCall("fc1", "c1"), fnOut("fo1", "c1"), assistant("a1")}
+ out := trimRealtimeItems(in, 2)
+ // Expect at least the fc + fc_out + assistant (3 items, cap was 2)
+ // — the rollback prefers correctness over the cap.
+ Expect(len(out)).To(BeNumerically(">=", 3))
+ Expect(out[0].FunctionCall).NotTo(BeNil())
+ Expect(out[1].FunctionCallOutput).NotTo(BeNil())
+ })
+})
diff --git a/core/http/endpoints/openai/realtime_webrtc.go b/core/http/endpoints/openai/realtime_webrtc.go
index 864c67b19..fd305799c 100644
--- a/core/http/endpoints/openai/realtime_webrtc.go
+++ b/core/http/endpoints/openai/realtime_webrtc.go
@@ -15,6 +15,10 @@ import (
type RealtimeCallRequest struct {
SDP string `json:"sdp"`
Model string `json:"model"`
+ // LocalAIAssistant opts the session into the in-process admin tool
+ // surface (same modality as the chat page's "Manage Mode"). Admin-only;
+ // the realtime entry point gates it the same way the chat handler does.
+ LocalAIAssistant bool `json:"localai_assistant,omitempty"`
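+ //
+ // Example body for POST /v1/realtime/calls (model name illustrative, SDP
+ // truncated):
+ //   {"model": "my-realtime-model", "sdp": "v=0...", "localai_assistant": true}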
}
// RealtimeCallResponse is the JSON response for POST /v1/realtime/calls.
@@ -165,9 +169,13 @@ func RealtimeCalls(application *application.Application) echo.HandlerFunc {
// Start the realtime session in a goroutine
evaluator := application.TemplatesEvaluator()
+ opts := RealtimeSessionOptions{
+ LocalAIAssistant: req.LocalAIAssistant,
+ IsAdmin: isCurrentUserAdmin(c, application),
+ }
go func() {
defer transport.Close()
- runRealtimeSession(application, transport, req.Model, evaluator)
+ runRealtimeSession(application, transport, req.Model, evaluator, opts)
}()
return c.JSON(http.StatusCreated, RealtimeCallResponse{
diff --git a/core/http/react-ui/public/locales/en/models.json b/core/http/react-ui/public/locales/en/models.json
index 441b2fc09..b503dd187 100644
--- a/core/http/react-ui/public/locales/en/models.json
+++ b/core/http/react-ui/public/locales/en/models.json
@@ -24,6 +24,7 @@
"diarization": "Diarization",
"soundGen": "Sound",
"audioTransform": "Audio FX",
+ "realtimeAudio": "Realtime Audio",
"embedding": "Embeddings",
"rerank": "Rerank",
"detection": "Detection",
diff --git a/core/http/react-ui/src/pages/FineTune.jsx b/core/http/react-ui/src/pages/FineTune.jsx
index 1b593f8cd..61a9be8fe 100644
--- a/core/http/react-ui/src/pages/FineTune.jsx
+++ b/core/http/react-ui/src/pages/FineTune.jsx
@@ -732,6 +732,9 @@ export default function FineTune() {
const [seed, setSeed] = useState(0)
const [mixedPrecision, setMixedPrecision] = useState('')
const [extraOptions, setExtraOptions] = useState([])
+ // liquid-audio specific knobs (folded into extra_options on submit)
+ const [liquidAudioVoice, setLiquidAudioVoice] = useState('')
+ const [liquidAudioValDataset, setLiquidAudioValDataset] = useState('')
const [hfToken, setHfToken] = useState('')
const [showAdvanced, setShowAdvanced] = useState(false)
const [resumeFromCheckpoint, setResumeFromCheckpoint] = useState('')
@@ -801,6 +804,12 @@ export default function FineTune() {
for (const { key, value } of extraOptions) {
if (key.trim()) extra[key.trim()] = value
}
+ // Fold liquid-audio specific fields into extra_options. The Python
+ // backend reads `voice` and `val_dataset` directly from there.
+ if (backend === 'liquid-audio') {
+ if (liquidAudioVoice) extra.voice = liquidAudioVoice
+ if (liquidAudioValDataset.trim()) extra.val_dataset = liquidAudioValDataset.trim()
+ }
const isAdapter = ['lora', 'loha', 'lokr'].includes(trainingType)
@@ -872,6 +881,10 @@ export default function FineTune() {
for (const { key, value } of extraOptions) {
if (key.trim()) extra[key.trim()] = value
}
+ if (backend === 'liquid-audio') {
+ if (liquidAudioVoice) extra.voice = liquidAudioVoice
+ if (liquidAudioValDataset.trim()) extra.val_dataset = liquidAudioValDataset.trim()
+ }
return {
model,
backend,
@@ -965,10 +978,15 @@ export default function FineTune() {
setSaveTotalLimit(Number(config.extra_options.save_total_limit))
}
+ // Restore liquid-audio specific extras (also filtered out of the
+ // freeform list below).
+ if (config.extra_options?.voice != null) setLiquidAudioVoice(String(config.extra_options.voice))
+ if (config.extra_options?.val_dataset != null) setLiquidAudioValDataset(String(config.extra_options.val_dataset))
+
// Convert extra_options object to [{key, value}] entries, filtering out handled keys
if (config.extra_options && typeof config.extra_options === 'object') {
const entries = Object.entries(config.extra_options)
- .filter(([k]) => !['max_seq_length', 'save_total_limit', 'hf_token', 'eval_strategy', 'eval_steps', 'eval_split', 'eval_dataset_source', 'eval_split_ratio'].includes(k))
+ .filter(([k]) => !['max_seq_length', 'save_total_limit', 'hf_token', 'eval_strategy', 'eval_steps', 'eval_split', 'eval_dataset_source', 'eval_split_ratio', 'voice', 'val_dataset'].includes(k))
.map(([key, value]) => ({ key, value: String(value) }))
setExtraOptions(entries)
}
@@ -1458,6 +1476,31 @@ export default function FineTune() {
)}
+ {backend === 'liquid-audio' && (
+
+
+
+ Dataset must be preprocessed by LFM2AudioChatMapper (a directory of LFM2DataLoader-ready arrow files). See liquid_audio/examples/preprocess_jenny_tts.py for the conversion recipe.
+