feat: Add Kokoros backend (#9212)

Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
Richard Palethorpe
2026-04-08 18:23:16 +01:00
committed by GitHub
parent b7247fc148
commit ea6e850809
19 changed files with 4180 additions and 2 deletions

View File

@@ -31,6 +31,7 @@ jobs:
llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }}
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
voxtral: ${{ steps.detect.outputs.voxtral }}
kokoros: ${{ steps.detect.outputs.kokoros }}
steps:
- name: Checkout repository
uses: actions/checkout@v6
@@ -528,3 +529,25 @@ jobs:
- name: Test voxtral
run: |
make --jobs=5 --output-sync=target -C backend/go/voxtral test
tests-kokoros:
needs: detect-changes
if: needs.detect-changes.outputs.kokoros == 'true' || needs.detect-changes.outputs.run-all == 'true'
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v6
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y build-essential cmake pkg-config protobuf-compiler clang libclang-dev
sudo apt-get install -y espeak-ng libespeak-ng-dev libsonic-dev libpcaudio-dev libopus-dev libssl-dev
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
- name: Build kokoros
run: |
make -C backend/rust/kokoros kokoros-grpc
- name: Test kokoros
run: |
make -C backend/rust/kokoros test

3
.gitmodules vendored
View File

@@ -1,3 +1,6 @@
[submodule "docs/themes/hugo-theme-relearn"]
path = docs/themes/hugo-theme-relearn
url = https://github.com/McShelby/hugo-theme-relearn.git
[submodule "backend/rust/kokoros/sources/Kokoros"]
path = backend/rust/kokoros/sources/Kokoros
url = https://github.com/lucasjinreal/Kokoros

View File

@@ -1,5 +1,5 @@
# Disable parallel execution for backend builds
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros
GOCMD=go
GOTEST=$(GOCMD) test
@@ -431,6 +431,7 @@ prepare-test-extra: protogen-python
$(MAKE) -C backend/python/whisperx
$(MAKE) -C backend/python/ace-step
$(MAKE) -C backend/python/trl
$(MAKE) -C backend/rust/kokoros kokoros-grpc
test-extra: prepare-test-extra
$(MAKE) -C backend/python/transformers test
@@ -451,6 +452,7 @@ test-extra: prepare-test-extra
$(MAKE) -C backend/python/whisperx test
$(MAKE) -C backend/python/ace-step test
$(MAKE) -C backend/python/trl test
$(MAKE) -C backend/rust/kokoros test
DOCKER_IMAGE?=local-ai
IMAGE_TYPE?=core
@@ -586,6 +588,9 @@ BACKEND_MLX_DISTRIBUTED = mlx-distributed|python|./|false|true
BACKEND_TRL = trl|python|.|false|true
BACKEND_LLAMA_CPP_QUANTIZATION = llama-cpp-quantization|python|.|false|true
# Rust backends
BACKEND_KOKOROS = kokoros|rust|.|false|true
# Helper function to build docker image for a backend
# Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG)
define docker-build-backend
@@ -644,12 +649,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_ACESTEP_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_MLX_DISTRIBUTED)))
$(eval $(call generate-docker-build-target,$(BACKEND_TRL)))
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_QUANTIZATION)))
$(eval $(call generate-docker-build-target,$(BACKEND_KOKOROS)))
# Pattern rule for docker-save targets
docker-save-%: backend-images
docker save local-ai-backend:$* -o backend-images/$*.tar
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros
########################################################
### Mock Backend for E2E Tests

39
backend/Dockerfile.rust Normal file
View File

@@ -0,0 +1,39 @@
# Two-stage build for Rust gRPC backends (kokoros by default):
# stage 1 compiles and packages, stage 2 ships only the self-contained bundle.
ARG BASE_IMAGE=ubuntu:24.04
FROM ${BASE_IMAGE} AS builder

# Which backend/rust/<BACKEND> directory to build.
ARG BACKEND=kokoros
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETARCH
ARG TARGETVARIANT

# Toolchain plus native deps: espeak-ng (phonemization), sonic/pcaudio
# (audio helpers), opus, OpenSSL (model auto-download), protobuf (gRPC codegen).
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    git ccache \
    ca-certificates \
    make cmake wget \
    curl unzip \
    clang \
    pkg-config \
    libssl-dev \
    espeak-ng libespeak-ng-dev \
    libsonic-dev libpcaudio-dev \
    libopus-dev \
    protobuf-compiler && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install Rust
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

COPY . /LocalAI
# The source is copied in with foreign ownership; mark it safe for git.
RUN git config --global --add safe.directory /LocalAI
# `build` compiles the release binary and runs package.sh to bundle libs,
# espeak-ng data, certs and the dynamic linker under package/.
RUN make -C /LocalAI/backend/rust/${BACKEND} build

# Final image: only the self-contained package (run.sh bootstraps everything).
FROM scratch
ARG BACKEND=kokoros
COPY --from=builder /LocalAI/backend/rust/${BACKEND}/package/. ./

View File

@@ -469,6 +469,26 @@
nvidia-cuda-13: "cuda13-kokoro"
nvidia-cuda-12: "cuda12-kokoro"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-kokoro"
- &kokoros
icon: https://avatars.githubusercontent.com/u/166769057?v=4
description: |
Kokoros is a pure Rust TTS backend using the Kokoro ONNX model (82M parameters).
It provides fast, high-quality text-to-speech with streaming support, built on
ONNX Runtime for efficient CPU inference. Supports English, Japanese, Mandarin
Chinese, and German.
urls:
- https://huggingface.co/hexgrad/Kokoro-82M
- https://github.com/lucasjinreal/Kokoros
tags:
- text-to-speech
- TTS
- Rust
- ONNX
license: apache-2.0
alias: "kokoros"
name: "kokoros"
capabilities:
default: "cpu-kokoros"
- &coqui
urls:
- https://github.com/idiap/coqui-ai-TTS
@@ -2043,6 +2063,21 @@
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-kokoro"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-kokoro
## kokoros (Rust)
- !!merge <<: *kokoros
name: "kokoros-development"
capabilities:
default: "cpu-kokoros-development"
- !!merge <<: *kokoros
name: "cpu-kokoros"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-kokoros"
mirrors:
- localai/localai-backends:latest-cpu-kokoros
- !!merge <<: *kokoros
name: "cpu-kokoros-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-kokoros"
mirrors:
- localai/localai-backends:master-cpu-kokoros
## faster-whisper
- !!merge <<: *faster-whisper
name: "faster-whisper-development"

3
backend/rust/kokoros/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
/target/
/proto/
/package/

3074
backend/rust/kokoros/Cargo.lock generated Normal file
View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,26 @@
[package]
name = "kokoros-grpc"
version = "0.1.0"
edition = "2021"
[[bin]]
name = "kokoros-grpc"
path = "src/main.rs"
[dependencies]
kokoros = { path = "sources/Kokoros/kokoros" }
tonic = "0.13"
prost = "0.13"
tokio = { version = "1", features = ["full"] }
tokio-stream = "0.1"
clap = { version = "4", features = ["derive"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
[build-dependencies]
tonic-build = "0.13"
[features]
default = ["cpu"]
cpu = ["kokoros/cpu"]

View File

@@ -0,0 +1,25 @@
CURRENT_DIR=$(abspath ./)

# Build the kokoros-grpc release binary. backend.proto is copied locally so
# build.rs can hand its location to tonic-build via BACKEND_PROTO_PATH.
.PHONY: kokoros-grpc
kokoros-grpc:
	mkdir -p $(CURRENT_DIR)/proto
	cp $(CURRENT_DIR)/../../backend.proto $(CURRENT_DIR)/proto/backend.proto
	cd $(CURRENT_DIR) && \
	BACKEND_PROTO_PATH=$(CURRENT_DIR)/proto/backend.proto \
	cargo build --release

# Bundle the binary, run script, espeak-ng data and shared libraries.
.PHONY: package
package:
	bash package.sh

.PHONY: test
test: kokoros-grpc
	cd $(CURRENT_DIR) && cargo test

# Full build used by the Docker image: compile, then package.
.PHONY: build
build: kokoros-grpc package

.PHONY: clean
clean:
	cargo clean
	rm -rf package proto

View File

@@ -0,0 +1,15 @@
/// Build script: generate the tonic gRPC server stubs from backend.proto.
///
/// The proto location may be overridden through the BACKEND_PROTO_PATH env
/// var (set by the Makefile); otherwise it falls back to proto/backend.proto.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    use std::path::Path;

    let proto_path = match std::env::var("BACKEND_PROTO_PATH") {
        Ok(p) => p,
        Err(_) => String::from("proto/backend.proto"),
    };
    // Import search dir for protoc: the directory containing the file.
    let include_dir = Path::new(&proto_path).parent().unwrap_or(Path::new("."));

    let codegen = tonic_build::configure()
        .build_server(true)
        .build_client(false);
    codegen.compile_protos(&[&proto_path], &[include_dir])?;
    Ok(())
}

View File

@@ -0,0 +1,42 @@
#!/bin/bash
# Package the kokoros-grpc binary with every runtime dependency so the result
# runs in a `FROM scratch` image: shared libraries, espeak-ng data, CA
# certificates and the dynamic linker itself.
set -e

# BUGFIX: quote $0 and all $CURDIR expansions so the script survives build
# paths containing spaces; use `read -r` so library paths with backslashes
# are not mangled.
CURDIR=$(dirname "$(realpath "$0")")

mkdir -p "$CURDIR/package/lib"

# Copy the binary and run script
cp -avf "$CURDIR/target/release/kokoros-grpc" "$CURDIR/package/"
cp -rfv "$CURDIR/run.sh" "$CURDIR/package/"
chmod +x "$CURDIR/package/run.sh"

# Copy espeak-ng data (install location differs across distros/arches)
if [ -d "/usr/share/espeak-ng-data" ]; then
    cp -rf /usr/share/espeak-ng-data "$CURDIR/package/"
elif [ -d "/usr/lib/x86_64-linux-gnu/espeak-ng-data" ]; then
    cp -rf /usr/lib/x86_64-linux-gnu/espeak-ng-data "$CURDIR/package/"
fi

# Bundle all dynamic library dependencies reported by ldd.
# -L dereferences symlinks so the real library file is copied.
echo "Bundling dynamic library dependencies..."
ldd "$CURDIR/target/release/kokoros-grpc" | grep "=>" | awk '{print $3}' | while read -r lib; do
    if [ -n "$lib" ] && [ -f "$lib" ]; then
        cp -avfL "$lib" "$CURDIR/package/lib/"
    fi
done

# Copy CA certificates for HTTPS (needed for model auto-download)
if [ -d "/etc/ssl/certs" ]; then
    mkdir -p "$CURDIR/package/etc/ssl"
    cp -rf /etc/ssl/certs "$CURDIR/package/etc/ssl/"
fi

# Copy the dynamic linker (path is architecture-dependent)
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/"
ls -liah "$CURDIR/package/lib/"

23
backend/rust/kokoros/run.sh Executable file
View File

@@ -0,0 +1,23 @@
#!/bin/bash
# Launcher for the packaged kokoros-grpc backend: point the process at the
# bundled shared libraries, SSL certs and espeak-ng data, then exec the
# binary (through the bundled ld.so when present, for glibc portability).
set -ex

# BUGFIX: quote $0/$CURDIR expansions in tests and exec so the launcher
# works from an install path containing spaces.
CURDIR=$(dirname "$(realpath "$0")")

export LD_LIBRARY_PATH=$CURDIR/lib:${LD_LIBRARY_PATH:-}

# SSL certificates for model auto-download
if [ -d "$CURDIR/etc/ssl/certs" ]; then
    export SSL_CERT_DIR=$CURDIR/etc/ssl/certs
fi

# espeak-ng data directory
if [ -d "$CURDIR/espeak-ng-data" ]; then
    export ESPEAK_NG_DATA=$CURDIR/espeak-ng-data
fi

# Use bundled ld.so if present (portability)
if [ -f "$CURDIR/lib/ld.so" ]; then
    exec "$CURDIR/lib/ld.so" "$CURDIR/kokoros-grpc" "$@"
fi
exec "$CURDIR/kokoros-grpc" "$@"

View File

@@ -0,0 +1,26 @@
use tonic::{Request, Status};

/// Returns an interceptor function if LOCALAI_GRPC_AUTH_TOKEN is set.
///
/// When the env var is present and non-empty, the returned closure requires
/// every request to carry `authorization: Bearer <token>` metadata and
/// rejects it with UNAUTHENTICATED otherwise. Returns `None` when no token
/// is configured, so the server runs without authentication.
pub fn make_auth_interceptor(
) -> Option<impl Fn(Request<()>) -> Result<Request<()>, Status> + Clone> {
    // Missing env var -> no auth (the `?` propagates the None).
    let token = std::env::var("LOCALAI_GRPC_AUTH_TOKEN").ok()?;
    if token.is_empty() {
        return None;
    }
    let expected = format!("Bearer {}", token);
    Some(
        move |req: Request<()>| -> Result<Request<()>, Status> {
            let meta = req.metadata();
            match meta.get("authorization") {
                Some(val) => {
                    // NOTE(review): plain byte comparison is not constant-time;
                    // acceptable for a loopback backend, but confirm if this
                    // port is ever exposed to untrusted networks.
                    if val.as_bytes() == expected.as_bytes() {
                        Ok(req)
                    } else {
                        Err(Status::unauthenticated("invalid token"))
                    }
                }
                None => Err(Status::unauthenticated("missing authorization")),
            }
        },
    )
}

View File

@@ -0,0 +1,53 @@
use clap::Parser;
use tonic::transport::Server;
mod auth;
mod service;
pub mod backend {
tonic::include_proto!("backend");
}
/// Command-line options for the kokoros gRPC backend.
#[derive(Parser, Debug)]
#[command(name = "kokoros-grpc")]
struct Cli {
    /// gRPC listen address (host:port)
    // BUGFIX: the address is parsed into a `SocketAddr` in main(), which only
    // accepts numeric IPs — the previous default "localhost:50051" failed to
    // parse, so the server could not start without an explicit --addr.
    #[arg(long, default_value = "127.0.0.1:50051")]
    addr: String,
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
tracing_subscriber::fmt()
.with_writer(std::io::stderr)
.with_ansi(false)
.without_time()
.with_env_filter(
tracing_subscriber::EnvFilter::try_from_default_env()
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
)
.init();
let cli = Cli::parse();
let addr = cli.addr.parse()?;
tracing::info!("Starting kokoros gRPC server on {}", addr);
let mut builder = Server::builder();
if let Some(interceptor) = auth::make_auth_interceptor() {
tracing::info!("Bearer token authentication enabled");
let svc = backend::backend_server::BackendServer::with_interceptor(
service::KokorosService::default(),
interceptor,
);
builder.add_service(svc).serve(addr).await?;
} else {
let svc = backend::backend_server::BackendServer::new(service::KokorosService::default())
.max_decoding_message_size(50 * 1024 * 1024)
.max_encoding_message_size(50 * 1024 * 1024);
builder.add_service(svc).serve(addr).await?;
}
Ok(())
}

View File

@@ -0,0 +1,652 @@
use std::sync::{Arc, Mutex};
use tokio::sync::Mutex as TokioMutex;
use tokio_stream::wrappers::ReceiverStream;
use tonic::{Request, Response, Status};
use kokoros::tts::koko::TTSKoko;
use crate::backend;
use crate::backend::backend_server::Backend;
/// Write f32 samples as a standard 44-byte PCM 16-bit WAV file.
/// LocalAI's audio pipeline assumes this exact header layout: a RIFF
/// preamble, a 16-byte "fmt " chunk (PCM, mono), then one "data" chunk.
fn write_pcm16_wav(
    path: &str,
    samples: &[f32],
    sample_rate: u32,
) -> Result<(), Box<dyn std::error::Error>> {
    use std::fs::File;
    use std::io::Write;

    let sample_count = samples.len() as u32;
    let data_bytes = sample_count * 2; // 16-bit mono: 2 bytes per sample
    let riff_size = 36 + data_bytes; // file length minus the 8-byte RIFF preamble

    // Assemble header + samples in memory, then write the file in one shot.
    let mut out: Vec<u8> = Vec::with_capacity(44 + data_bytes as usize);
    out.extend_from_slice(b"RIFF");
    out.extend_from_slice(&riff_size.to_le_bytes());
    out.extend_from_slice(b"WAVE");
    // fmt chunk: fixed 16-byte PCM descriptor.
    out.extend_from_slice(b"fmt ");
    out.extend_from_slice(&16u32.to_le_bytes()); // chunk size
    out.extend_from_slice(&1u16.to_le_bytes()); // format tag: PCM
    out.extend_from_slice(&1u16.to_le_bytes()); // channel count: mono
    out.extend_from_slice(&sample_rate.to_le_bytes());
    out.extend_from_slice(&(sample_rate * 2).to_le_bytes()); // byte rate
    out.extend_from_slice(&2u16.to_le_bytes()); // block align
    out.extend_from_slice(&16u16.to_le_bytes()); // bits per sample
    // data chunk: clamp to [-1, 1] then scale to i16 range.
    out.extend_from_slice(b"data");
    out.extend_from_slice(&data_bytes.to_le_bytes());
    for &sample in samples {
        let pcm = (sample.clamp(-1.0, 1.0) * 32767.0) as i16;
        out.extend_from_slice(&pcm.to_le_bytes());
    }

    File::create(path)?.write_all(&out)?;
    Ok(())
}
/// gRPC Backend implementation wrapping the Kokoros TTS engine.
pub struct KokorosService {
    // Loaded TTS engine; None until LoadModel succeeds (async mutex because
    // it is held across awaits in the service methods).
    tts: Arc<TokioMutex<Option<TTSKoko>>>,
    // Default language code (e.g. "en-us"); per-request language overrides it.
    language: Arc<Mutex<String>>,
    // Speed multiplier applied to every synthesis call.
    speed: Arc<Mutex<f32>>,
}

impl Default for KokorosService {
    /// Starts unloaded, with English ("en-us") and normal (1.0) speed.
    fn default() -> Self {
        Self {
            tts: Arc::new(TokioMutex::new(None)),
            language: Arc::new(Mutex::new("en-us".to_string())),
            speed: Arc::new(Mutex::new(1.0)),
        }
    }
}
#[tonic::async_trait]
impl Backend for KokorosService {
/// Health probe: always reports "OK" once the process is serving.
async fn health(
    &self,
    _req: Request<backend::HealthMessage>,
) -> Result<Response<backend::Reply>, Status> {
    Ok(Response::new(backend::Reply {
        message: b"OK".to_vec(),
        ..Default::default()
    }))
}
/// LoadModel: resolve model and voices paths, apply backend options, and
/// initialise the Kokoros TTS engine.
///
/// Path resolution:
///   * model  = `ModelPath/Model` when both are set, else `Model`, else a
///     default checkpoints path.
///   * voices = `AudioPath` when set, else `<model dir>/voices-v1.0.bin`.
/// Options are "key:value" strings: `lang_code` sets the default language,
/// `speed` the playback multiplier (silently ignored if not a valid f32).
async fn load_model(
    &self,
    req: Request<backend::ModelOptions>,
) -> Result<Response<backend::Result>, Status> {
    let opts = req.into_inner();
    // Model path: join ModelPath + Model, or just Model
    let model_path = if !opts.model_path.is_empty() && !opts.model.is_empty() {
        format!("{}/{}", opts.model_path, opts.model)
    } else if !opts.model.is_empty() {
        opts.model.clone()
    } else {
        "checkpoints/kokoro-v1.0.onnx".to_string()
    };
    // Voices data path from AudioPath, or derive from model dir
    let voices_path = if !opts.audio_path.is_empty() {
        opts.audio_path.clone()
    } else {
        let model_dir = std::path::Path::new(&model_path)
            .parent()
            .map(|p| p.to_string_lossy().to_string())
            .unwrap_or_else(|| ".".to_string());
        format!("{}/voices-v1.0.bin", model_dir)
    };
    // Parse options (key:value pairs)
    for opt in &opts.options {
        if let Some((key, value)) = opt.split_once(':') {
            match key {
                "lang_code" => *self.language.lock().unwrap() = value.to_string(),
                "speed" => {
                    if let Ok(s) = value.parse::<f32>() {
                        *self.speed.lock().unwrap() = s;
                    }
                }
                _ => {}
            }
        }
    }
    tracing::info!("Loading Kokoros model from: {}", model_path);
    tracing::info!("Loading voices from: {}", voices_path);
    tracing::info!("Language: {}", self.language.lock().unwrap());
    // NOTE(review): TTSKoko::new returns the engine directly (no Result), so
    // load failures presumably panic rather than surface as a gRPC error —
    // confirm against the kokoros crate.
    let tts = TTSKoko::new(&model_path, &voices_path).await;
    *self.tts.lock().await = Some(tts);
    tracing::info!("Kokoros TTS model loaded successfully");
    Ok(Response::new(backend::Result {
        success: true,
        message: "Kokoros TTS model loaded".into(),
    }))
}
/// TTS: synthesize `req.text` and write a 16-bit PCM WAV to `req.dst`.
///
/// Voice falls back to "af_heart" and language to the configured default.
/// Synthesis/IO failures are reported through `Result{success:false}` rather
/// than a gRPC error status; only a missing model is a FAILED_PRECONDITION.
async fn tts(
    &self,
    req: Request<backend::TtsRequest>,
) -> Result<Response<backend::Result>, Status> {
    let req = req.into_inner();
    let tts_guard = self.tts.lock().await;
    let tts = tts_guard
        .as_ref()
        .ok_or_else(|| Status::failed_precondition("Model not loaded"))?;
    let voice = if req.voice.is_empty() {
        "af_heart"
    } else {
        &req.voice
    };
    let lang = req
        .language
        .filter(|l| !l.is_empty())
        .unwrap_or_else(|| self.language.lock().unwrap().clone());
    let speed = *self.speed.lock().unwrap();
    tracing::info!(
        text = req.text,
        voice = voice,
        lang = lang.as_str(),
        dst = req.dst,
        "TTS request received"
    );
    let start = std::time::Instant::now();
    // NOTE(review): 24000 Hz is assumed to be Kokoro's fixed output rate —
    // confirm against the kokoros crate if the model version ever changes.
    match tts.tts_raw_audio(&req.text, &lang, voice, speed, None, None, None, None) {
        Ok(samples) => {
            let duration_secs = samples.len() as f64 / 24000.0;
            tracing::info!(
                num_samples = samples.len(),
                audio_duration = format!("{:.2}s", duration_secs),
                inference_time = format!("{:.2}s", start.elapsed().as_secs_f64()),
                dst = req.dst,
                "TTS inference complete"
            );
            if let Err(e) = write_pcm16_wav(&req.dst, &samples, 24000) {
                tracing::error!("Failed to write WAV to {}: {}", req.dst, e);
                return Ok(Response::new(backend::Result {
                    success: false,
                    message: format!("Failed to write WAV: {}", e),
                }));
            }
            Ok(Response::new(backend::Result {
                success: true,
                message: String::new(),
            }))
        }
        Err(e) => {
            tracing::error!("TTS error: {}", e);
            Ok(Response::new(backend::Result {
                success: false,
                message: format!("TTS error: {}", e),
            }))
        }
    }
}
type TTSStreamStream = ReceiverStream<Result<backend::Reply, Status>>;

/// TTSStream: synthesize `req.text` and stream raw 16-bit PCM chunks.
///
/// The first message carries a JSON info blob ({"sample_rate":24000}) in
/// `message`; subsequent messages carry PCM bytes in `audio`. Synthesis runs
/// on a blocking thread so the async runtime is not stalled.
async fn tts_stream(
    &self,
    req: Request<backend::TtsRequest>,
) -> Result<Response<Self::TTSStreamStream>, Status> {
    let req = req.into_inner();
    let tts_guard = self.tts.lock().await;
    // Clone the engine so the lock is released before synthesis starts.
    let tts = tts_guard
        .as_ref()
        .ok_or_else(|| Status::failed_precondition("Model not loaded"))?
        .clone();
    let voice = if req.voice.is_empty() {
        "af_heart".to_string()
    } else {
        req.voice
    };
    let lang = req
        .language
        .filter(|l| !l.is_empty())
        .unwrap_or_else(|| self.language.lock().unwrap().clone());
    let speed = *self.speed.lock().unwrap();
    let text = req.text;
    let (tx, rx) = tokio::sync::mpsc::channel(32);
    // Send sample rate info as first message
    let tx_clone = tx.clone();
    let _ = tx_clone
        .send(Ok(backend::Reply {
            message: br#"{"sample_rate":24000}"#.to_vec(),
            ..Default::default()
        }))
        .await;
    tokio::task::spawn_blocking(move || {
        let result = tts.tts_raw_audio_streaming(
            &text,
            &lang,
            &voice,
            speed,
            None,
            None,
            None,
            None,
            |audio_chunk: Vec<f32>| -> Result<(), Box<dyn std::error::Error>> {
                // Convert f32 PCM to 16-bit PCM bytes (what LocalAI expects for streaming)
                let bytes: Vec<u8> = audio_chunk
                    .iter()
                    .flat_map(|&s| {
                        let clamped = s.clamp(-1.0, 1.0);
                        let i16_val = (clamped * 32767.0) as i16;
                        i16_val.to_le_bytes()
                    })
                    .collect();
                tx.blocking_send(Ok(backend::Reply {
                    audio: bytes,
                    ..Default::default()
                }))
                .map_err(|e| Box::new(e) as Box<dyn std::error::Error>)
            },
        );
        // NOTE(review): synthesis errors are only logged; the client just
        // sees the stream close early. Confirm whether an Err status should
        // be forwarded through the channel instead.
        if let Err(e) = result {
            tracing::error!("TTSStream error: {}", e);
        }
    });
    Ok(Response::new(ReceiverStream::new(rx)))
}
/// Status: READY when a model is loaded, UNINITIALIZED otherwise.
async fn status(
    &self,
    _req: Request<backend::HealthMessage>,
) -> Result<Response<backend::StatusResponse>, Status> {
    let tts = self.tts.lock().await;
    let state = if tts.is_some() {
        backend::status_response::State::Ready as i32
    } else {
        backend::status_response::State::Uninitialized as i32
    };
    Ok(Response::new(backend::StatusResponse {
        state,
        memory: None,
    }))
}

/// Free: drop the loaded model, returning the service to UNINITIALIZED.
async fn free(
    &self,
    _req: Request<backend::HealthMessage>,
) -> Result<Response<backend::Result>, Status> {
    *self.tts.lock().await = None;
    Ok(Response::new(backend::Result {
        success: true,
        message: "Model freed".into(),
    }))
}
// --- Unimplemented RPCs ---
// Kokoros is a TTS-only backend: every remaining Backend RPC returns
// UNIMPLEMENTED so callers fail fast instead of hanging.
async fn predict(
    &self,
    _: Request<backend::PredictOptions>,
) -> Result<Response<backend::Reply>, Status> {
    Err(Status::unimplemented("Not supported"))
}
type PredictStreamStream = ReceiverStream<Result<backend::Reply, Status>>;
async fn predict_stream(
    &self,
    _: Request<backend::PredictOptions>,
) -> Result<Response<Self::PredictStreamStream>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn embedding(
    &self,
    _: Request<backend::PredictOptions>,
) -> Result<Response<backend::EmbeddingResult>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn generate_image(
    &self,
    _: Request<backend::GenerateImageRequest>,
) -> Result<Response<backend::Result>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn generate_video(
    &self,
    _: Request<backend::GenerateVideoRequest>,
) -> Result<Response<backend::Result>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn audio_transcription(
    &self,
    _: Request<backend::TranscriptRequest>,
) -> Result<Response<backend::TranscriptResult>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn sound_generation(
    &self,
    _: Request<backend::SoundGenerationRequest>,
) -> Result<Response<backend::Result>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn tokenize_string(
    &self,
    _: Request<backend::PredictOptions>,
) -> Result<Response<backend::TokenizationResponse>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn detect(
    &self,
    _: Request<backend::DetectOptions>,
) -> Result<Response<backend::DetectResponse>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn stores_set(
    &self,
    _: Request<backend::StoresSetOptions>,
) -> Result<Response<backend::Result>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn stores_delete(
    &self,
    _: Request<backend::StoresDeleteOptions>,
) -> Result<Response<backend::Result>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn stores_get(
    &self,
    _: Request<backend::StoresGetOptions>,
) -> Result<Response<backend::StoresGetResult>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn stores_find(
    &self,
    _: Request<backend::StoresFindOptions>,
) -> Result<Response<backend::StoresFindResult>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn rerank(
    &self,
    _: Request<backend::RerankRequest>,
) -> Result<Response<backend::RerankResult>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn get_metrics(
    &self,
    _: Request<backend::MetricsRequest>,
) -> Result<Response<backend::MetricsResponse>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn vad(
    &self,
    _: Request<backend::VadRequest>,
) -> Result<Response<backend::VadResponse>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn audio_encode(
    &self,
    _: Request<backend::AudioEncodeRequest>,
) -> Result<Response<backend::AudioEncodeResult>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn audio_decode(
    &self,
    _: Request<backend::AudioDecodeRequest>,
) -> Result<Response<backend::AudioDecodeResult>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn model_metadata(
    &self,
    _: Request<backend::ModelOptions>,
) -> Result<Response<backend::ModelMetadataResponse>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn start_fine_tune(
    &self,
    _: Request<backend::FineTuneRequest>,
) -> Result<Response<backend::FineTuneJobResult>, Status> {
    Err(Status::unimplemented("Not supported"))
}
type FineTuneProgressStream = ReceiverStream<Result<backend::FineTuneProgressUpdate, Status>>;
async fn fine_tune_progress(
    &self,
    _: Request<backend::FineTuneProgressRequest>,
) -> Result<Response<Self::FineTuneProgressStream>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn stop_fine_tune(
    &self,
    _: Request<backend::FineTuneStopRequest>,
) -> Result<Response<backend::Result>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn list_checkpoints(
    &self,
    _: Request<backend::ListCheckpointsRequest>,
) -> Result<Response<backend::ListCheckpointsResponse>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn export_model(
    &self,
    _: Request<backend::ExportModelRequest>,
) -> Result<Response<backend::Result>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn start_quantization(
    &self,
    _: Request<backend::QuantizationRequest>,
) -> Result<Response<backend::QuantizationJobResult>, Status> {
    Err(Status::unimplemented("Not supported"))
}
type QuantizationProgressStream =
    ReceiverStream<Result<backend::QuantizationProgressUpdate, Status>>;
async fn quantization_progress(
    &self,
    _: Request<backend::QuantizationProgressRequest>,
) -> Result<Response<Self::QuantizationProgressStream>, Status> {
    Err(Status::unimplemented("Not supported"))
}
async fn stop_quantization(
    &self,
    _: Request<backend::QuantizationStopRequest>,
) -> Result<Response<backend::Result>, Status> {
    Err(Status::unimplemented("Not supported"))
}
}
// Unit test for the WAV writer plus an opt-in integration test that runs
// real inference when model files are available locally.
#[cfg(test)]
mod tests {
    use super::*;
    // Validates that write_pcm16_wav emits the exact 44-byte header layout
    // LocalAI's audio pipeline depends on, including clamping/scaling.
    #[test]
    fn wav_header_is_standard_pcm16() {
        let samples = vec![0.0f32, 0.5, -0.5, 1.0, -1.0];
        let path = std::env::temp_dir().join("kokoros_test.wav");
        let path_str = path.to_str().unwrap();
        write_pcm16_wav(path_str, &samples, 24000).unwrap();
        let data = std::fs::read(&path).unwrap();
        std::fs::remove_file(&path).unwrap();
        // Must be exactly 44-byte header + data
        assert_eq!(data.len(), 44 + samples.len() * 2);
        // RIFF header
        assert_eq!(&data[0..4], b"RIFF");
        assert_eq!(&data[8..12], b"WAVE");
        // fmt chunk: 16 bytes, format=1 (PCM), channels=1, 16-bit
        assert_eq!(&data[12..16], b"fmt ");
        assert_eq!(u32::from_le_bytes(data[16..20].try_into().unwrap()), 16); // chunk size
        assert_eq!(u16::from_le_bytes(data[20..22].try_into().unwrap()), 1); // PCM format
        assert_eq!(u16::from_le_bytes(data[22..24].try_into().unwrap()), 1); // mono
        assert_eq!(u32::from_le_bytes(data[24..28].try_into().unwrap()), 24000); // sample rate
        assert_eq!(u16::from_le_bytes(data[34..36].try_into().unwrap()), 16); // bits per sample
        // data chunk
        assert_eq!(&data[36..40], b"data");
        assert_eq!(
            u32::from_le_bytes(data[40..44].try_into().unwrap()),
            (samples.len() * 2) as u32
        );
        // Verify sample values: 0.5 -> 16383, -0.5 -> -16383, 1.0 -> 32767, -1.0 -> -32767
        let s1 = i16::from_le_bytes(data[46..48].try_into().unwrap());
        assert_eq!(s1, 16383); // 0.5 * 32767
        let s3 = i16::from_le_bytes(data[50..52].try_into().unwrap());
        assert_eq!(s3, 32767); // 1.0 clamped
        let s4 = i16::from_le_bytes(data[52..54].try_into().unwrap());
        assert_eq!(s4, -32767); // -1.0 clamped
    }
    /// Integration test: runs actual TTS inference and validates the output audio.
    /// Skipped unless KOKOROS_MODEL_PATH is set to a directory containing
    /// kokoro-v1.0.onnx and voices-v1.0.bin.
    #[tokio::test]
    async fn tts_produces_valid_speech() {
        let model_dir = match std::env::var("KOKOROS_MODEL_PATH") {
            Ok(p) => p,
            Err(_) => {
                eprintln!("KOKOROS_MODEL_PATH not set, skipping integration test");
                return;
            }
        };
        let model_path = format!("{}/kokoro-v1.0.onnx", model_dir);
        let voices_path = format!("{}/voices-v1.0.bin", model_dir);
        if !std::path::Path::new(&model_path).exists() {
            eprintln!("Model file not found at {}, skipping", model_path);
            return;
        }
        let tts = TTSKoko::new(&model_path, &voices_path).await;
        let input_text = "Hello world, this is a test of speech synthesis.";
        let out_path = std::env::temp_dir().join("kokoros_integration_test.wav");
        let out_str = out_path.to_str().unwrap();
        let samples = tts
            .tts_raw_audio(input_text, "en-us", "af_heart", 1.0, None, None, None, None)
            .expect("tts_raw_audio failed");
        write_pcm16_wav(out_str, &samples, 24000).unwrap();
        let data = std::fs::read(&out_path).unwrap();
        std::fs::remove_file(&out_path).unwrap();
        // --- WAV header sanity ---
        assert_eq!(&data[0..4], b"RIFF");
        assert_eq!(&data[8..12], b"WAVE");
        assert_eq!(u16::from_le_bytes(data[20..22].try_into().unwrap()), 1); // PCM
        assert_eq!(u32::from_le_bytes(data[24..28].try_into().unwrap()), 24000); // sample rate
        assert_eq!(u16::from_le_bytes(data[34..36].try_into().unwrap()), 16); // 16-bit
        let num_samples = samples.len();
        let duration_secs = num_samples as f64 / 24000.0;
        // --- Duration check ---
        // ~10 words should produce roughly 2-8 seconds of speech
        assert!(
            duration_secs > 1.0,
            "Audio too short: {:.2}s for {} words",
            duration_secs,
            input_text.split_whitespace().count()
        );
        assert!(
            duration_secs < 15.0,
            "Audio too long: {:.2}s for {} words",
            duration_secs,
            input_text.split_whitespace().count()
        );
        // --- Energy check: not silence ---
        let rms = (samples.iter().map(|s| s * s).sum::<f32>() / num_samples as f32).sqrt();
        assert!(
            rms > 0.01,
            "Audio is near-silence: RMS = {:.6}",
            rms
        );
        // --- Not clipped/saturated: should have dynamic range ---
        let max_abs = samples.iter().map(|s| s.abs()).fold(0.0f32, f32::max);
        assert!(
            max_abs < 1.0,
            "Audio is fully saturated (max |sample| = {:.4})",
            max_abs
        );
        assert!(
            max_abs > 0.05,
            "Audio has very low amplitude (max |sample| = {:.4})",
            max_abs
        );
        // --- Speech-like spectral check ---
        // Speech should have significant energy variation (not white noise or DC).
        // Check that the signal has zero-crossings in a speech-like range (roughly
        // 50-400 crossings per 24000 samples = 100-8000 Hz fundamental range).
        let zero_crossings: usize = samples
            .windows(2)
            .filter(|w| (w[0] >= 0.0) != (w[1] >= 0.0))
            .count();
        let crossings_per_sec = zero_crossings as f64 / duration_secs;
        // White noise at 24kHz would have ~12000 crossings/sec.
        // Speech is typically 100-4000 crossings/sec.
        assert!(
            crossings_per_sec < 10000.0,
            "Too many zero crossings ({:.0}/s) — likely noise, not speech",
            crossings_per_sec
        );
        assert!(
            crossings_per_sec > 50.0,
            "Too few zero crossings ({:.0}/s) — likely DC or silence, not speech",
            crossings_per_sec
        );
        eprintln!(
            "Integration test passed: duration={:.2}s, rms={:.4}, max={:.4}, zero_crossings={:.0}/s",
            duration_secs, rms, max_abs, crossings_per_sec
        );
    }
}

View File

@@ -2514,6 +2514,132 @@
- lang_code:a
known_usecases:
- tts
- name: "kokoros"
url: "github:mudler/LocalAI/gallery/kokoros.yaml@master"
size: "327MB"
urls:
- https://github.com/lucasjinreal/Kokoros
license: apache-2.0
tags:
- tts
- kokoros
- cpu
- text-to-speech
- rust
description: |
Kokoros is a pure Rust TTS backend using the Kokoro v1.0 ONNX model (82M parameters).
Fast, streaming TTS with high quality. American English with af_heart voice.
overrides:
backend: "kokoros"
name: "kokoros"
description: "Kokoros Rust TTS - American English"
parameters:
model: "kokoro-v1.0.onnx"
voice: "af_heart"
options:
- lang_code:en-us
known_usecases:
- tts
files:
- filename: "kokoro-v1.0.onnx"
sha256: 7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"
- filename: "voices-v1.0.bin"
sha256: bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"
- name: "kokoros-ja"
url: "github:mudler/LocalAI/gallery/kokoros.yaml@master"
size: "327MB"
urls:
- https://github.com/lucasjinreal/Kokoros
license: apache-2.0
tags:
- tts
- kokoros
- japanese
- text-to-speech
description: |
Kokoros Rust TTS - Japanese. Uses the Kokoro v1.0 ONNX model with Japanese phonemization.
overrides:
backend: "kokoros"
name: "kokoros-ja"
description: "Kokoros Rust TTS - Japanese"
parameters:
model: "kokoro-v1.0.onnx"
voice: "jf_alpha"
options:
- lang_code:ja
known_usecases:
- tts
files:
- filename: "kokoro-v1.0.onnx"
sha256: 7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"
- filename: "voices-v1.0.bin"
sha256: bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"
- name: "kokoros-cmn"
url: "github:mudler/LocalAI/gallery/kokoros.yaml@master"
size: "327MB"
urls:
- https://github.com/lucasjinreal/Kokoros
license: apache-2.0
tags:
- tts
- kokoros
- chinese
- text-to-speech
description: |
Kokoros Rust TTS - Mandarin Chinese.
overrides:
backend: "kokoros"
name: "kokoros-cmn"
description: "Kokoros Rust TTS - Mandarin Chinese"
parameters:
model: "kokoro-v1.0.onnx"
voice: "zf_xiaobei"
options:
- lang_code:cmn
known_usecases:
- tts
files:
- filename: "kokoro-v1.0.onnx"
sha256: 7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"
- filename: "voices-v1.0.bin"
sha256: bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"
- name: "kokoros-de"
url: "github:mudler/LocalAI/gallery/kokoros.yaml@master"
size: "327MB"
urls:
- https://github.com/lucasjinreal/Kokoros
license: apache-2.0
tags:
- tts
- kokoros
- german
- text-to-speech
description: |
Kokoros Rust TTS - German.
overrides:
backend: "kokoros"
name: "kokoros-de"
description: "Kokoros Rust TTS - German"
parameters:
model: "kokoro-v1.0.onnx"
voice: "df_greta"
options:
- lang_code:de
known_usecases:
- tts
files:
- filename: "kokoro-v1.0.onnx"
sha256: 7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"
- filename: "voices-v1.0.bin"
sha256: bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"
- name: "kitten-tts"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:

3
gallery/kokoros.yaml Normal file
View File

@@ -0,0 +1,3 @@
---
config_file: |
backend: kokoros

View File

@@ -21,6 +21,9 @@ function inferBackendPath(item) {
if (item.dockerfile.endsWith("golang")) {
return `backend/go/${item.backend}/`;
}
if (item.dockerfile.endsWith("rust")) {
return `backend/rust/${item.backend}/`;
}
if (item.dockerfile.endsWith("llama-cpp")) {
return `backend/cpp/llama-cpp/`;
}