mirror of
https://github.com/mudler/LocalAI.git
synced 2026-04-16 12:59:33 -04:00
feat: Add Kokoros backend (#9212)
Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
committed by
GitHub
parent
b7247fc148
commit
ea6e850809
23
.github/workflows/test-extra.yml
vendored
23
.github/workflows/test-extra.yml
vendored
@@ -31,6 +31,7 @@ jobs:
|
||||
llama-cpp-quantization: ${{ steps.detect.outputs.llama-cpp-quantization }}
|
||||
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
|
||||
voxtral: ${{ steps.detect.outputs.voxtral }}
|
||||
kokoros: ${{ steps.detect.outputs.kokoros }}
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v6
|
||||
@@ -528,3 +529,25 @@ jobs:
|
||||
- name: Test voxtral
|
||||
run: |
|
||||
make --jobs=5 --output-sync=target -C backend/go/voxtral test
|
||||
tests-kokoros:
|
||||
needs: detect-changes
|
||||
if: needs.detect-changes.outputs.kokoros == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
submodules: true
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y build-essential cmake pkg-config protobuf-compiler clang libclang-dev
|
||||
sudo apt-get install -y espeak-ng libespeak-ng-dev libsonic-dev libpcaudio-dev libopus-dev libssl-dev
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
||||
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
|
||||
- name: Build kokoros
|
||||
run: |
|
||||
make -C backend/rust/kokoros kokoros-grpc
|
||||
- name: Test kokoros
|
||||
run: |
|
||||
make -C backend/rust/kokoros test
|
||||
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@@ -1,3 +1,6 @@
|
||||
[submodule "docs/themes/hugo-theme-relearn"]
|
||||
path = docs/themes/hugo-theme-relearn
|
||||
url = https://github.com/McShelby/hugo-theme-relearn.git
|
||||
[submodule "backend/rust/kokoros/sources/Kokoros"]
|
||||
path = backend/rust/kokoros/sources/Kokoros
|
||||
url = https://github.com/lucasjinreal/Kokoros
|
||||
|
||||
10
Makefile
10
Makefile
@@ -1,5 +1,5 @@
|
||||
# Disable parallel execution for backend builds
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros
|
||||
|
||||
GOCMD=go
|
||||
GOTEST=$(GOCMD) test
|
||||
@@ -431,6 +431,7 @@ prepare-test-extra: protogen-python
|
||||
$(MAKE) -C backend/python/whisperx
|
||||
$(MAKE) -C backend/python/ace-step
|
||||
$(MAKE) -C backend/python/trl
|
||||
$(MAKE) -C backend/rust/kokoros kokoros-grpc
|
||||
|
||||
test-extra: prepare-test-extra
|
||||
$(MAKE) -C backend/python/transformers test
|
||||
@@ -451,6 +452,7 @@ test-extra: prepare-test-extra
|
||||
$(MAKE) -C backend/python/whisperx test
|
||||
$(MAKE) -C backend/python/ace-step test
|
||||
$(MAKE) -C backend/python/trl test
|
||||
$(MAKE) -C backend/rust/kokoros test
|
||||
|
||||
DOCKER_IMAGE?=local-ai
|
||||
IMAGE_TYPE?=core
|
||||
@@ -586,6 +588,9 @@ BACKEND_MLX_DISTRIBUTED = mlx-distributed|python|./|false|true
|
||||
BACKEND_TRL = trl|python|.|false|true
|
||||
BACKEND_LLAMA_CPP_QUANTIZATION = llama-cpp-quantization|python|.|false|true
|
||||
|
||||
# Rust backends
|
||||
BACKEND_KOKOROS = kokoros|rust|.|false|true
|
||||
|
||||
# Helper function to build docker image for a backend
|
||||
# Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG)
|
||||
define docker-build-backend
|
||||
@@ -644,12 +649,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_ACESTEP_CPP)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_MLX_DISTRIBUTED)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_TRL)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_QUANTIZATION)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_KOKOROS)))
|
||||
|
||||
# Pattern rule for docker-save targets
|
||||
docker-save-%: backend-images
|
||||
docker save local-ai-backend:$* -o backend-images/$*.tar
|
||||
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros
|
||||
|
||||
########################################################
|
||||
### Mock Backend for E2E Tests
|
||||
|
||||
39
backend/Dockerfile.rust
Normal file
39
backend/Dockerfile.rust
Normal file
@@ -0,0 +1,39 @@
|
||||
ARG BASE_IMAGE=ubuntu:24.04
|
||||
|
||||
FROM ${BASE_IMAGE} AS builder
|
||||
ARG BACKEND=kokoros
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
git ccache \
|
||||
ca-certificates \
|
||||
make cmake wget \
|
||||
curl unzip \
|
||||
clang \
|
||||
pkg-config \
|
||||
libssl-dev \
|
||||
espeak-ng libespeak-ng-dev \
|
||||
libsonic-dev libpcaudio-dev \
|
||||
libopus-dev \
|
||||
protobuf-compiler && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Rust
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
COPY . /LocalAI
|
||||
|
||||
RUN git config --global --add safe.directory /LocalAI
|
||||
|
||||
RUN make -C /LocalAI/backend/rust/${BACKEND} build
|
||||
|
||||
FROM scratch
|
||||
ARG BACKEND=kokoros
|
||||
|
||||
COPY --from=builder /LocalAI/backend/rust/${BACKEND}/package/. ./
|
||||
@@ -469,6 +469,26 @@
|
||||
nvidia-cuda-13: "cuda13-kokoro"
|
||||
nvidia-cuda-12: "cuda12-kokoro"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-kokoro"
|
||||
- &kokoros
|
||||
icon: https://avatars.githubusercontent.com/u/166769057?v=4
|
||||
description: |
|
||||
Kokoros is a pure Rust TTS backend using the Kokoro ONNX model (82M parameters).
|
||||
It provides fast, high-quality text-to-speech with streaming support, built on
|
||||
ONNX Runtime for efficient CPU inference. Supports English, Japanese, Mandarin
|
||||
Chinese, and German.
|
||||
urls:
|
||||
- https://huggingface.co/hexgrad/Kokoro-82M
|
||||
- https://github.com/lucasjinreal/Kokoros
|
||||
tags:
|
||||
- text-to-speech
|
||||
- TTS
|
||||
- Rust
|
||||
- ONNX
|
||||
license: apache-2.0
|
||||
alias: "kokoros"
|
||||
name: "kokoros"
|
||||
capabilities:
|
||||
default: "cpu-kokoros"
|
||||
- &coqui
|
||||
urls:
|
||||
- https://github.com/idiap/coqui-ai-TTS
|
||||
@@ -2043,6 +2063,21 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-kokoro"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-kokoro
|
||||
## kokoros (Rust)
|
||||
- !!merge <<: *kokoros
|
||||
name: "kokoros-development"
|
||||
capabilities:
|
||||
default: "cpu-kokoros-development"
|
||||
- !!merge <<: *kokoros
|
||||
name: "cpu-kokoros"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-kokoros"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-kokoros
|
||||
- !!merge <<: *kokoros
|
||||
name: "cpu-kokoros-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-kokoros"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-kokoros
|
||||
## faster-whisper
|
||||
- !!merge <<: *faster-whisper
|
||||
name: "faster-whisper-development"
|
||||
|
||||
3
backend/rust/kokoros/.gitignore
vendored
Normal file
3
backend/rust/kokoros/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
/target/
|
||||
/proto/
|
||||
/package/
|
||||
3074
backend/rust/kokoros/Cargo.lock
generated
Normal file
3074
backend/rust/kokoros/Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
26
backend/rust/kokoros/Cargo.toml
Normal file
26
backend/rust/kokoros/Cargo.toml
Normal file
@@ -0,0 +1,26 @@
|
||||
[package]
|
||||
name = "kokoros-grpc"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[[bin]]
|
||||
name = "kokoros-grpc"
|
||||
path = "src/main.rs"
|
||||
|
||||
[dependencies]
|
||||
kokoros = { path = "sources/Kokoros/kokoros" }
|
||||
|
||||
tonic = "0.13"
|
||||
prost = "0.13"
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
tokio-stream = "0.1"
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build = "0.13"
|
||||
|
||||
[features]
|
||||
default = ["cpu"]
|
||||
cpu = ["kokoros/cpu"]
|
||||
25
backend/rust/kokoros/Makefile
Normal file
25
backend/rust/kokoros/Makefile
Normal file
@@ -0,0 +1,25 @@
|
||||
# Build, test and package the kokoros gRPC backend.
#
# All paths are anchored on CURRENT_DIR so every recipe behaves the same
# regardless of how make was invoked. `:=` evaluates abspath once at parse time.
CURRENT_DIR := $(abspath ./)

# Copy the shared backend.proto next to the crate and compile the release
# binary; build.rs reads the proto location from BACKEND_PROTO_PATH.
.PHONY: kokoros-grpc
kokoros-grpc:
	mkdir -p $(CURRENT_DIR)/proto
	cp $(CURRENT_DIR)/../../backend.proto $(CURRENT_DIR)/proto/backend.proto
	cd $(CURRENT_DIR) && \
	BACKEND_PROTO_PATH=$(CURRENT_DIR)/proto/backend.proto \
	cargo build --release

# package.sh copies target/release/kokoros-grpc, so the binary must exist
# first. Declaring the prerequisite keeps `make -j build` from running the
# packaging step concurrently with (or before) the cargo build.
.PHONY: package
package: kokoros-grpc
	cd $(CURRENT_DIR) && bash package.sh

.PHONY: test
test: kokoros-grpc
	cd $(CURRENT_DIR) && cargo test

.PHONY: build
build: kokoros-grpc package

# Remove cargo artefacts plus the generated package/ and proto/ directories.
.PHONY: clean
clean:
	cd $(CURRENT_DIR) && cargo clean
	rm -rf $(CURRENT_DIR)/package $(CURRENT_DIR)/proto
|
||||
15
backend/rust/kokoros/build.rs
Normal file
15
backend/rust/kokoros/build.rs
Normal file
@@ -0,0 +1,15 @@
|
||||
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let proto_path = std::env::var("BACKEND_PROTO_PATH")
|
||||
.unwrap_or_else(|_| "proto/backend.proto".to_string());
|
||||
|
||||
let proto_dir = std::path::Path::new(&proto_path)
|
||||
.parent()
|
||||
.unwrap_or(std::path::Path::new("."));
|
||||
|
||||
tonic_build::configure()
|
||||
.build_server(true)
|
||||
.build_client(false)
|
||||
.compile_protos(&[&proto_path], &[proto_dir])?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
42
backend/rust/kokoros/package.sh
Normal file
42
backend/rust/kokoros/package.sh
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/bin/bash
# Assemble a self-contained package/ directory for the kokoros-grpc backend:
# the release binary, run.sh, espeak-ng data, every shared library reported by
# ldd, the host CA certificates and the dynamic loader.
set -e

CURDIR=$(dirname "$(realpath "$0")")
BINARY="$CURDIR/target/release/kokoros-grpc"
PKGDIR="$CURDIR/package"

mkdir -p "$PKGDIR/lib"

# Copy the binary and run script
cp -avf "$BINARY" "$PKGDIR/"
cp -rfv "$CURDIR/run.sh" "$PKGDIR/"
chmod +x "$PKGDIR/run.sh"

# Copy espeak-ng data. The multiarch directory differs per architecture, so
# probe both the x86_64 and the aarch64 locations (the loader section below
# already supports both architectures).
espeak_data_found=0
for data_dir in \
    /usr/share/espeak-ng-data \
    /usr/lib/x86_64-linux-gnu/espeak-ng-data \
    /usr/lib/aarch64-linux-gnu/espeak-ng-data; do
    if [ -d "$data_dir" ]; then
        cp -rf "$data_dir" "$PKGDIR/"
        espeak_data_found=1
        break
    fi
done
if [ "$espeak_data_found" -eq 0 ]; then
    echo "WARNING: espeak-ng-data not found; the package will not bundle it" >&2
fi

# Bundle all dynamic library dependencies
echo "Bundling dynamic library dependencies..."
# Run ldd on its own so that `set -e` aborts when it fails; inside a pipeline
# its exit status would be discarded.
deps=$(ldd "$BINARY")
printf '%s\n' "$deps" | awk '/=>/ {print $3}' | while read -r lib; do
    if [ -n "$lib" ] && [ -f "$lib" ]; then
        cp -avfL "$lib" "$PKGDIR/lib/"
    fi
done

# Copy CA certificates for HTTPS (needed for model auto-download)
if [ -d "/etc/ssl/certs" ]; then
    mkdir -p "$PKGDIR/etc/ssl"
    cp -rf /etc/ssl/certs "$PKGDIR/etc/ssl/"
fi

# Copy the dynamic linker
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$PKGDIR/lib/ld.so"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    cp -arfLv /lib/ld-linux-aarch64.so.1 "$PKGDIR/lib/ld.so"
else
    echo "WARNING: no known dynamic linker found; run.sh will use the host loader" >&2
fi

echo "Packaging completed successfully"
ls -liah "$PKGDIR/"
ls -liah "$PKGDIR/lib/"
|
||||
23
backend/rust/kokoros/run.sh
Executable file
23
backend/rust/kokoros/run.sh
Executable file
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
# Launcher for the packaged kokoros-grpc backend. Points the process at the
# bundled libraries, CA certificates and espeak-ng data (when present) and
# then replaces itself with the server binary, forwarding all arguments.
set -ex

CURDIR=$(dirname "$(realpath "$0")")

# Prepend the bundled libraries. Only append the previous value when it is
# non-empty: a trailing ':' would add the current working directory to the
# library search path.
export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# SSL certificates for model auto-download
if [ -d "$CURDIR/etc/ssl/certs" ]; then
    export SSL_CERT_DIR="$CURDIR/etc/ssl/certs"
fi

# espeak-ng data directory
if [ -d "$CURDIR/espeak-ng-data" ]; then
    export ESPEAK_NG_DATA="$CURDIR/espeak-ng-data"
fi

# Use bundled ld.so if present (portability)
if [ -f "$CURDIR/lib/ld.so" ]; then
    exec "$CURDIR/lib/ld.so" "$CURDIR/kokoros-grpc" "$@"
fi

exec "$CURDIR/kokoros-grpc" "$@"
|
||||
1
backend/rust/kokoros/sources/Kokoros
Submodule
1
backend/rust/kokoros/sources/Kokoros
Submodule
Submodule backend/rust/kokoros/sources/Kokoros added at 7089168f0c
26
backend/rust/kokoros/src/auth.rs
Normal file
26
backend/rust/kokoros/src/auth.rs
Normal file
@@ -0,0 +1,26 @@
|
||||
use tonic::{Request, Status};
|
||||
|
||||
/// Returns an interceptor function if LOCALAI_GRPC_AUTH_TOKEN is set.
|
||||
pub fn make_auth_interceptor(
|
||||
) -> Option<impl Fn(Request<()>) -> Result<Request<()>, Status> + Clone> {
|
||||
let token = std::env::var("LOCALAI_GRPC_AUTH_TOKEN").ok()?;
|
||||
if token.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let expected = format!("Bearer {}", token);
|
||||
Some(
|
||||
move |req: Request<()>| -> Result<Request<()>, Status> {
|
||||
let meta = req.metadata();
|
||||
match meta.get("authorization") {
|
||||
Some(val) => {
|
||||
if val.as_bytes() == expected.as_bytes() {
|
||||
Ok(req)
|
||||
} else {
|
||||
Err(Status::unauthenticated("invalid token"))
|
||||
}
|
||||
}
|
||||
None => Err(Status::unauthenticated("missing authorization")),
|
||||
}
|
||||
},
|
||||
)
|
||||
}
|
||||
53
backend/rust/kokoros/src/main.rs
Normal file
53
backend/rust/kokoros/src/main.rs
Normal file
@@ -0,0 +1,53 @@
|
||||
use clap::Parser;
|
||||
use tonic::transport::Server;
|
||||
|
||||
mod auth;
|
||||
mod service;
|
||||
|
||||
pub mod backend {
|
||||
tonic::include_proto!("backend");
|
||||
}
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(name = "kokoros-grpc")]
|
||||
struct Cli {
|
||||
/// gRPC listen address (host:port)
|
||||
#[arg(long, default_value = "localhost:50051")]
|
||||
addr: String,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
tracing_subscriber::fmt()
|
||||
.with_writer(std::io::stderr)
|
||||
.with_ansi(false)
|
||||
.without_time()
|
||||
.with_env_filter(
|
||||
tracing_subscriber::EnvFilter::try_from_default_env()
|
||||
.unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")),
|
||||
)
|
||||
.init();
|
||||
|
||||
let cli = Cli::parse();
|
||||
let addr = cli.addr.parse()?;
|
||||
|
||||
tracing::info!("Starting kokoros gRPC server on {}", addr);
|
||||
|
||||
let mut builder = Server::builder();
|
||||
|
||||
if let Some(interceptor) = auth::make_auth_interceptor() {
|
||||
tracing::info!("Bearer token authentication enabled");
|
||||
let svc = backend::backend_server::BackendServer::with_interceptor(
|
||||
service::KokorosService::default(),
|
||||
interceptor,
|
||||
);
|
||||
builder.add_service(svc).serve(addr).await?;
|
||||
} else {
|
||||
let svc = backend::backend_server::BackendServer::new(service::KokorosService::default())
|
||||
.max_decoding_message_size(50 * 1024 * 1024)
|
||||
.max_encoding_message_size(50 * 1024 * 1024);
|
||||
builder.add_service(svc).serve(addr).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
652
backend/rust/kokoros/src/service.rs
Normal file
652
backend/rust/kokoros/src/service.rs
Normal file
@@ -0,0 +1,652 @@
|
||||
use std::sync::{Arc, Mutex};
|
||||
use tokio::sync::Mutex as TokioMutex;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
use tonic::{Request, Response, Status};
|
||||
|
||||
use kokoros::tts::koko::TTSKoko;
|
||||
|
||||
use crate::backend;
|
||||
use crate::backend::backend_server::Backend;
|
||||
|
||||
/// Write f32 samples as a standard 44-byte PCM 16-bit WAV file.
/// LocalAI's audio pipeline assumes this exact header layout.
///
/// * `path` - destination file; created or truncated.
/// * `samples` - mono audio; each value is clamped to [-1.0, 1.0] and scaled
///   to a signed 16-bit integer.
/// * `sample_rate` - sample rate in Hz written into the header.
///
/// Returns an error when the file cannot be created or written, or when the
/// audio is too long for the 32-bit size fields of the WAV header. The size
/// check happens before the file is created.
fn write_pcm16_wav(
    path: &str,
    samples: &[f32],
    sample_rate: u32,
) -> Result<(), Box<dyn std::error::Error>> {
    use std::fs::File;
    use std::io::{BufWriter, Write};

    // 16-bit = 2 bytes per sample. Checked arithmetic: a silently wrapped
    // size would produce a header that disagrees with the payload.
    let data_size = u32::try_from(samples.len())
        .ok()
        .and_then(|count| count.checked_mul(2))
        .filter(|&bytes| bytes <= u32::MAX - 36)
        .ok_or("audio is too long to be stored in a WAV file")?;
    let file_size = 36 + data_size;
    let byte_rate = sample_rate
        .checked_mul(2)
        .ok_or("sample rate is too large for a WAV header")?;

    // Buffer the output: the sample loop issues one tiny write per sample,
    // which would otherwise be one syscall each.
    let mut out = BufWriter::new(File::create(path)?);

    // RIFF header
    out.write_all(b"RIFF")?;
    out.write_all(&file_size.to_le_bytes())?;
    out.write_all(b"WAVE")?;

    // fmt chunk — standard 16-byte PCM format
    out.write_all(b"fmt ")?;
    out.write_all(&16u32.to_le_bytes())?; // chunk size
    out.write_all(&1u16.to_le_bytes())?; // audio format = PCM
    out.write_all(&1u16.to_le_bytes())?; // channels = mono
    out.write_all(&sample_rate.to_le_bytes())?;
    out.write_all(&byte_rate.to_le_bytes())?; // byte rate
    out.write_all(&2u16.to_le_bytes())?; // block align
    out.write_all(&16u16.to_le_bytes())?; // bits per sample

    // data chunk
    out.write_all(b"data")?;
    out.write_all(&data_size.to_le_bytes())?;

    for &sample in samples {
        let pcm = (sample.clamp(-1.0, 1.0) * 32767.0) as i16;
        out.write_all(&pcm.to_le_bytes())?;
    }

    // Flush explicitly so that a failing final write is reported instead of
    // being swallowed when the BufWriter is dropped.
    out.flush()?;

    Ok(())
}
|
||||
|
||||
pub struct KokorosService {
|
||||
tts: Arc<TokioMutex<Option<TTSKoko>>>,
|
||||
language: Arc<Mutex<String>>,
|
||||
speed: Arc<Mutex<f32>>,
|
||||
}
|
||||
|
||||
impl Default for KokorosService {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
tts: Arc::new(TokioMutex::new(None)),
|
||||
language: Arc::new(Mutex::new("en-us".to_string())),
|
||||
speed: Arc::new(Mutex::new(1.0)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tonic::async_trait]
|
||||
impl Backend for KokorosService {
|
||||
async fn health(
|
||||
&self,
|
||||
_req: Request<backend::HealthMessage>,
|
||||
) -> Result<Response<backend::Reply>, Status> {
|
||||
Ok(Response::new(backend::Reply {
|
||||
message: b"OK".to_vec(),
|
||||
..Default::default()
|
||||
}))
|
||||
}
|
||||
|
||||
async fn load_model(
|
||||
&self,
|
||||
req: Request<backend::ModelOptions>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
let opts = req.into_inner();
|
||||
|
||||
// Model path: join ModelPath + Model, or just Model
|
||||
let model_path = if !opts.model_path.is_empty() && !opts.model.is_empty() {
|
||||
format!("{}/{}", opts.model_path, opts.model)
|
||||
} else if !opts.model.is_empty() {
|
||||
opts.model.clone()
|
||||
} else {
|
||||
"checkpoints/kokoro-v1.0.onnx".to_string()
|
||||
};
|
||||
|
||||
// Voices data path from AudioPath, or derive from model dir
|
||||
let voices_path = if !opts.audio_path.is_empty() {
|
||||
opts.audio_path.clone()
|
||||
} else {
|
||||
let model_dir = std::path::Path::new(&model_path)
|
||||
.parent()
|
||||
.map(|p| p.to_string_lossy().to_string())
|
||||
.unwrap_or_else(|| ".".to_string());
|
||||
format!("{}/voices-v1.0.bin", model_dir)
|
||||
};
|
||||
|
||||
// Parse options (key:value pairs)
|
||||
for opt in &opts.options {
|
||||
if let Some((key, value)) = opt.split_once(':') {
|
||||
match key {
|
||||
"lang_code" => *self.language.lock().unwrap() = value.to_string(),
|
||||
"speed" => {
|
||||
if let Ok(s) = value.parse::<f32>() {
|
||||
*self.speed.lock().unwrap() = s;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tracing::info!("Loading Kokoros model from: {}", model_path);
|
||||
tracing::info!("Loading voices from: {}", voices_path);
|
||||
tracing::info!("Language: {}", self.language.lock().unwrap());
|
||||
|
||||
let tts = TTSKoko::new(&model_path, &voices_path).await;
|
||||
*self.tts.lock().await = Some(tts);
|
||||
|
||||
tracing::info!("Kokoros TTS model loaded successfully");
|
||||
Ok(Response::new(backend::Result {
|
||||
success: true,
|
||||
message: "Kokoros TTS model loaded".into(),
|
||||
}))
|
||||
}
|
||||
|
||||
async fn tts(
|
||||
&self,
|
||||
req: Request<backend::TtsRequest>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
let req = req.into_inner();
|
||||
let tts_guard = self.tts.lock().await;
|
||||
let tts = tts_guard
|
||||
.as_ref()
|
||||
.ok_or_else(|| Status::failed_precondition("Model not loaded"))?;
|
||||
|
||||
let voice = if req.voice.is_empty() {
|
||||
"af_heart"
|
||||
} else {
|
||||
&req.voice
|
||||
};
|
||||
let lang = req
|
||||
.language
|
||||
.filter(|l| !l.is_empty())
|
||||
.unwrap_or_else(|| self.language.lock().unwrap().clone());
|
||||
let speed = *self.speed.lock().unwrap();
|
||||
|
||||
tracing::info!(
|
||||
text = req.text,
|
||||
voice = voice,
|
||||
lang = lang.as_str(),
|
||||
dst = req.dst,
|
||||
"TTS request received"
|
||||
);
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
match tts.tts_raw_audio(&req.text, &lang, voice, speed, None, None, None, None) {
|
||||
Ok(samples) => {
|
||||
let duration_secs = samples.len() as f64 / 24000.0;
|
||||
tracing::info!(
|
||||
num_samples = samples.len(),
|
||||
audio_duration = format!("{:.2}s", duration_secs),
|
||||
inference_time = format!("{:.2}s", start.elapsed().as_secs_f64()),
|
||||
dst = req.dst,
|
||||
"TTS inference complete"
|
||||
);
|
||||
if let Err(e) = write_pcm16_wav(&req.dst, &samples, 24000) {
|
||||
tracing::error!("Failed to write WAV to {}: {}", req.dst, e);
|
||||
return Ok(Response::new(backend::Result {
|
||||
success: false,
|
||||
message: format!("Failed to write WAV: {}", e),
|
||||
}));
|
||||
}
|
||||
Ok(Response::new(backend::Result {
|
||||
success: true,
|
||||
message: String::new(),
|
||||
}))
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("TTS error: {}", e);
|
||||
Ok(Response::new(backend::Result {
|
||||
success: false,
|
||||
message: format!("TTS error: {}", e),
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type TTSStreamStream = ReceiverStream<Result<backend::Reply, Status>>;
|
||||
|
||||
async fn tts_stream(
|
||||
&self,
|
||||
req: Request<backend::TtsRequest>,
|
||||
) -> Result<Response<Self::TTSStreamStream>, Status> {
|
||||
let req = req.into_inner();
|
||||
let tts_guard = self.tts.lock().await;
|
||||
let tts = tts_guard
|
||||
.as_ref()
|
||||
.ok_or_else(|| Status::failed_precondition("Model not loaded"))?
|
||||
.clone();
|
||||
|
||||
let voice = if req.voice.is_empty() {
|
||||
"af_heart".to_string()
|
||||
} else {
|
||||
req.voice
|
||||
};
|
||||
let lang = req
|
||||
.language
|
||||
.filter(|l| !l.is_empty())
|
||||
.unwrap_or_else(|| self.language.lock().unwrap().clone());
|
||||
let speed = *self.speed.lock().unwrap();
|
||||
let text = req.text;
|
||||
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(32);
|
||||
|
||||
// Send sample rate info as first message
|
||||
let tx_clone = tx.clone();
|
||||
let _ = tx_clone
|
||||
.send(Ok(backend::Reply {
|
||||
message: br#"{"sample_rate":24000}"#.to_vec(),
|
||||
..Default::default()
|
||||
}))
|
||||
.await;
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let result = tts.tts_raw_audio_streaming(
|
||||
&text,
|
||||
&lang,
|
||||
&voice,
|
||||
speed,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
|audio_chunk: Vec<f32>| -> Result<(), Box<dyn std::error::Error>> {
|
||||
// Convert f32 PCM to 16-bit PCM bytes (what LocalAI expects for streaming)
|
||||
let bytes: Vec<u8> = audio_chunk
|
||||
.iter()
|
||||
.flat_map(|&s| {
|
||||
let clamped = s.clamp(-1.0, 1.0);
|
||||
let i16_val = (clamped * 32767.0) as i16;
|
||||
i16_val.to_le_bytes()
|
||||
})
|
||||
.collect();
|
||||
tx.blocking_send(Ok(backend::Reply {
|
||||
audio: bytes,
|
||||
..Default::default()
|
||||
}))
|
||||
.map_err(|e| Box::new(e) as Box<dyn std::error::Error>)
|
||||
},
|
||||
);
|
||||
if let Err(e) = result {
|
||||
tracing::error!("TTSStream error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
Ok(Response::new(ReceiverStream::new(rx)))
|
||||
}
|
||||
|
||||
async fn status(
|
||||
&self,
|
||||
_req: Request<backend::HealthMessage>,
|
||||
) -> Result<Response<backend::StatusResponse>, Status> {
|
||||
let tts = self.tts.lock().await;
|
||||
let state = if tts.is_some() {
|
||||
backend::status_response::State::Ready as i32
|
||||
} else {
|
||||
backend::status_response::State::Uninitialized as i32
|
||||
};
|
||||
Ok(Response::new(backend::StatusResponse {
|
||||
state,
|
||||
memory: None,
|
||||
}))
|
||||
}
|
||||
|
||||
async fn free(
|
||||
&self,
|
||||
_req: Request<backend::HealthMessage>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
*self.tts.lock().await = None;
|
||||
Ok(Response::new(backend::Result {
|
||||
success: true,
|
||||
message: "Model freed".into(),
|
||||
}))
|
||||
}
|
||||
|
||||
// --- Unimplemented RPCs ---
|
||||
|
||||
async fn predict(
|
||||
&self,
|
||||
_: Request<backend::PredictOptions>,
|
||||
) -> Result<Response<backend::Reply>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
type PredictStreamStream = ReceiverStream<Result<backend::Reply, Status>>;
|
||||
|
||||
async fn predict_stream(
|
||||
&self,
|
||||
_: Request<backend::PredictOptions>,
|
||||
) -> Result<Response<Self::PredictStreamStream>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn embedding(
|
||||
&self,
|
||||
_: Request<backend::PredictOptions>,
|
||||
) -> Result<Response<backend::EmbeddingResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn generate_image(
|
||||
&self,
|
||||
_: Request<backend::GenerateImageRequest>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn generate_video(
|
||||
&self,
|
||||
_: Request<backend::GenerateVideoRequest>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn audio_transcription(
|
||||
&self,
|
||||
_: Request<backend::TranscriptRequest>,
|
||||
) -> Result<Response<backend::TranscriptResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn sound_generation(
|
||||
&self,
|
||||
_: Request<backend::SoundGenerationRequest>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn tokenize_string(
|
||||
&self,
|
||||
_: Request<backend::PredictOptions>,
|
||||
) -> Result<Response<backend::TokenizationResponse>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn detect(
|
||||
&self,
|
||||
_: Request<backend::DetectOptions>,
|
||||
) -> Result<Response<backend::DetectResponse>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn stores_set(
|
||||
&self,
|
||||
_: Request<backend::StoresSetOptions>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn stores_delete(
|
||||
&self,
|
||||
_: Request<backend::StoresDeleteOptions>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn stores_get(
|
||||
&self,
|
||||
_: Request<backend::StoresGetOptions>,
|
||||
) -> Result<Response<backend::StoresGetResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn stores_find(
|
||||
&self,
|
||||
_: Request<backend::StoresFindOptions>,
|
||||
) -> Result<Response<backend::StoresFindResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn rerank(
|
||||
&self,
|
||||
_: Request<backend::RerankRequest>,
|
||||
) -> Result<Response<backend::RerankResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn get_metrics(
|
||||
&self,
|
||||
_: Request<backend::MetricsRequest>,
|
||||
) -> Result<Response<backend::MetricsResponse>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn vad(
|
||||
&self,
|
||||
_: Request<backend::VadRequest>,
|
||||
) -> Result<Response<backend::VadResponse>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn audio_encode(
|
||||
&self,
|
||||
_: Request<backend::AudioEncodeRequest>,
|
||||
) -> Result<Response<backend::AudioEncodeResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn audio_decode(
|
||||
&self,
|
||||
_: Request<backend::AudioDecodeRequest>,
|
||||
) -> Result<Response<backend::AudioDecodeResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn model_metadata(
|
||||
&self,
|
||||
_: Request<backend::ModelOptions>,
|
||||
) -> Result<Response<backend::ModelMetadataResponse>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn start_fine_tune(
|
||||
&self,
|
||||
_: Request<backend::FineTuneRequest>,
|
||||
) -> Result<Response<backend::FineTuneJobResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
type FineTuneProgressStream = ReceiverStream<Result<backend::FineTuneProgressUpdate, Status>>;
|
||||
|
||||
async fn fine_tune_progress(
|
||||
&self,
|
||||
_: Request<backend::FineTuneProgressRequest>,
|
||||
) -> Result<Response<Self::FineTuneProgressStream>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn stop_fine_tune(
|
||||
&self,
|
||||
_: Request<backend::FineTuneStopRequest>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn list_checkpoints(
|
||||
&self,
|
||||
_: Request<backend::ListCheckpointsRequest>,
|
||||
) -> Result<Response<backend::ListCheckpointsResponse>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn export_model(
|
||||
&self,
|
||||
_: Request<backend::ExportModelRequest>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn start_quantization(
|
||||
&self,
|
||||
_: Request<backend::QuantizationRequest>,
|
||||
) -> Result<Response<backend::QuantizationJobResult>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
type QuantizationProgressStream =
|
||||
ReceiverStream<Result<backend::QuantizationProgressUpdate, Status>>;
|
||||
|
||||
async fn quantization_progress(
|
||||
&self,
|
||||
_: Request<backend::QuantizationProgressRequest>,
|
||||
) -> Result<Response<Self::QuantizationProgressStream>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
|
||||
async fn stop_quantization(
|
||||
&self,
|
||||
_: Request<backend::QuantizationStopRequest>,
|
||||
) -> Result<Response<backend::Result>, Status> {
|
||||
Err(Status::unimplemented("Not supported"))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a scratch WAV path in the system temp directory.
    ///
    /// The process id is part of the file name so that two test processes
    /// running at the same time on one machine cannot overwrite or delete
    /// each other's file. Callers pass distinct `stem`s, which keeps tests
    /// inside the same process apart as well.
    fn scratch_wav(stem: &str) -> std::path::PathBuf {
        std::env::temp_dir().join(format!("{}_{}.wav", stem, std::process::id()))
    }

    /// Writes five known samples with `write_pcm16_wav` and checks that the
    /// resulting file is a canonical 44-byte-header, mono, 16-bit PCM WAV with
    /// the expected sample values.
    #[test]
    fn wav_header_is_standard_pcm16() {
        let samples = vec![0.0f32, 0.5, -0.5, 1.0, -1.0];
        let path = scratch_wav("kokoros_test");
        let path_str = path.to_str().unwrap();

        write_pcm16_wav(path_str, &samples, 24000).unwrap();

        // Read everything back and delete the file before asserting, so a
        // failing assertion does not leave the scratch file behind.
        let data = std::fs::read(&path).unwrap();
        std::fs::remove_file(&path).unwrap();

        // Two bytes per 16-bit sample.
        let data_bytes = samples.len() * 2;

        // Must be exactly 44-byte header + data
        assert_eq!(data.len(), 44 + data_bytes);

        // RIFF header; the RIFF size field counts everything after itself.
        assert_eq!(&data[0..4], b"RIFF");
        assert_eq!(
            u32::from_le_bytes(data[4..8].try_into().unwrap()),
            (36 + data_bytes) as u32
        );
        assert_eq!(&data[8..12], b"WAVE");

        // fmt chunk: 16 bytes, format=1 (PCM), channels=1, 16-bit
        assert_eq!(&data[12..16], b"fmt ");
        assert_eq!(u32::from_le_bytes(data[16..20].try_into().unwrap()), 16); // chunk size
        assert_eq!(u16::from_le_bytes(data[20..22].try_into().unwrap()), 1); // PCM format
        assert_eq!(u16::from_le_bytes(data[22..24].try_into().unwrap()), 1); // mono
        assert_eq!(u32::from_le_bytes(data[24..28].try_into().unwrap()), 24000); // sample rate
        assert_eq!(u32::from_le_bytes(data[28..32].try_into().unwrap()), 24000 * 2); // byte rate
        assert_eq!(u16::from_le_bytes(data[32..34].try_into().unwrap()), 2); // block align
        assert_eq!(u16::from_le_bytes(data[34..36].try_into().unwrap()), 16); // bits per sample

        // data chunk
        assert_eq!(&data[36..40], b"data");
        assert_eq!(
            u32::from_le_bytes(data[40..44].try_into().unwrap()),
            data_bytes as u32
        );

        // Verify sample values:
        // 0.0 -> 0, 0.5 -> 16383, -0.5 -> -16383, 1.0 -> 32767, -1.0 -> -32767
        let sample_at = |index: usize| -> i16 {
            let start = 44 + index * 2;
            i16::from_le_bytes(data[start..start + 2].try_into().unwrap())
        };
        assert_eq!(sample_at(0), 0);
        assert_eq!(sample_at(1), 16383); // 0.5 * 32767
        assert_eq!(sample_at(2), -16383); // -0.5 * 32767
        assert_eq!(sample_at(3), 32767); // 1.0 clamped
        assert_eq!(sample_at(4), -32767); // -1.0 clamped
    }

    /// Integration test: runs actual TTS inference and validates the output audio.
    /// Skipped unless KOKOROS_MODEL_PATH is set to a directory containing
    /// kokoro-v1.0.onnx and voices-v1.0.bin.
    #[tokio::test]
    async fn tts_produces_valid_speech() {
        let model_dir = match std::env::var("KOKOROS_MODEL_PATH") {
            Ok(p) => p,
            Err(_) => {
                eprintln!("KOKOROS_MODEL_PATH not set, skipping integration test");
                return;
            }
        };

        let model_path = format!("{}/kokoro-v1.0.onnx", model_dir);
        let voices_path = format!("{}/voices-v1.0.bin", model_dir);

        // Both files are required; skip (rather than fail) when either is absent.
        if !std::path::Path::new(&model_path).exists() {
            eprintln!("Model file not found at {}, skipping", model_path);
            return;
        }
        if !std::path::Path::new(&voices_path).exists() {
            eprintln!("Voices file not found at {}, skipping", voices_path);
            return;
        }

        let tts = TTSKoko::new(&model_path, &voices_path).await;

        let input_text = "Hello world, this is a test of speech synthesis.";
        let word_count = input_text.split_whitespace().count();
        let out_path = scratch_wav("kokoros_integration_test");
        let out_str = out_path.to_str().unwrap();

        let samples = tts
            .tts_raw_audio(input_text, "en-us", "af_heart", 1.0, None, None, None, None)
            .expect("tts_raw_audio failed");

        write_pcm16_wav(out_str, &samples, 24000).unwrap();

        // Read the file back and remove it before any assertion can fail.
        let data = std::fs::read(&out_path).unwrap();
        std::fs::remove_file(&out_path).unwrap();

        // --- WAV header sanity ---
        assert_eq!(&data[0..4], b"RIFF");
        assert_eq!(&data[8..12], b"WAVE");
        assert_eq!(u16::from_le_bytes(data[20..22].try_into().unwrap()), 1); // PCM
        assert_eq!(u32::from_le_bytes(data[24..28].try_into().unwrap()), 24000); // sample rate
        assert_eq!(u16::from_le_bytes(data[34..36].try_into().unwrap()), 16); // 16-bit

        let num_samples = samples.len();
        let duration_secs = num_samples as f64 / 24000.0;

        // --- Duration check ---
        // The sentence is expected to take a few seconds to speak; the accepted
        // window (1-15 s) is deliberately loose so that voice/speed differences
        // do not cause spurious failures.
        assert!(
            duration_secs > 1.0,
            "Audio too short: {:.2}s for {} words",
            duration_secs,
            word_count
        );
        assert!(
            duration_secs < 15.0,
            "Audio too long: {:.2}s for {} words",
            duration_secs,
            word_count
        );

        // --- Energy check: not silence ---
        let rms = (samples.iter().map(|s| s * s).sum::<f32>() / num_samples as f32).sqrt();
        assert!(
            rms > 0.01,
            "Audio is near-silence: RMS = {:.6}",
            rms
        );

        // --- Not clipped/saturated: should have dynamic range ---
        let max_abs = samples.iter().map(|s| s.abs()).fold(0.0f32, f32::max);
        assert!(
            max_abs < 1.0,
            "Audio is fully saturated (max |sample| = {:.4})",
            max_abs
        );
        assert!(
            max_abs > 0.05,
            "Audio has very low amplitude (max |sample| = {:.4})",
            max_abs
        );

        // --- Speech-like zero-crossing check ---
        // A tone at f Hz changes sign about 2*f times per second. White noise
        // sampled at 24 kHz changes sign on roughly every other sample
        // (~12000 crossings/sec), while speech typically sits around
        // 100-4000 crossings/sec. DC or silence produces almost none.
        // Accept anything strictly between 50 and 10000 crossings/sec.
        let zero_crossings: usize = samples
            .windows(2)
            .filter(|w| (w[0] >= 0.0) != (w[1] >= 0.0))
            .count();
        let crossings_per_sec = zero_crossings as f64 / duration_secs;
        assert!(
            crossings_per_sec < 10000.0,
            "Too many zero crossings ({:.0}/s) — likely noise, not speech",
            crossings_per_sec
        );
        assert!(
            crossings_per_sec > 50.0,
            "Too few zero crossings ({:.0}/s) — likely DC or silence, not speech",
            crossings_per_sec
        );

        eprintln!(
            "Integration test passed: duration={:.2}s, rms={:.4}, max={:.4}, zero_crossings={:.0}/s",
            duration_secs, rms, max_abs, crossings_per_sec
        );
    }
}
|
||||
@@ -2514,6 +2514,132 @@
|
||||
- lang_code:a
|
||||
known_usecases:
|
||||
- tts
|
||||
- name: "kokoros"
|
||||
url: "github:mudler/LocalAI/gallery/kokoros.yaml@master"
|
||||
size: "327MB"
|
||||
urls:
|
||||
- https://github.com/lucasjinreal/Kokoros
|
||||
license: apache-2.0
|
||||
tags:
|
||||
- tts
|
||||
- kokoros
|
||||
- cpu
|
||||
- text-to-speech
|
||||
- rust
|
||||
description: |
|
||||
Kokoros is a pure Rust TTS backend using the Kokoro v1.0 ONNX model (82M parameters).
|
||||
Fast, streaming TTS with high quality. American English with af_heart voice.
|
||||
overrides:
|
||||
backend: "kokoros"
|
||||
name: "kokoros"
|
||||
description: "Kokoros Rust TTS - American English"
|
||||
parameters:
|
||||
model: "kokoro-v1.0.onnx"
|
||||
voice: "af_heart"
|
||||
options:
|
||||
- lang_code:en-us
|
||||
known_usecases:
|
||||
- tts
|
||||
files:
|
||||
- filename: "kokoro-v1.0.onnx"
|
||||
sha256: 7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5
|
||||
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"
|
||||
- filename: "voices-v1.0.bin"
|
||||
sha256: bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
|
||||
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"
|
||||
- name: "kokoros-ja"
|
||||
url: "github:mudler/LocalAI/gallery/kokoros.yaml@master"
|
||||
size: "327MB"
|
||||
urls:
|
||||
- https://github.com/lucasjinreal/Kokoros
|
||||
license: apache-2.0
|
||||
tags:
|
||||
- tts
|
||||
- kokoros
|
||||
- japanese
|
||||
- text-to-speech
|
||||
description: |
|
||||
Kokoros Rust TTS - Japanese. Uses the Kokoro v1.0 ONNX model with Japanese phonemization.
|
||||
overrides:
|
||||
backend: "kokoros"
|
||||
name: "kokoros-ja"
|
||||
description: "Kokoros Rust TTS - Japanese"
|
||||
parameters:
|
||||
model: "kokoro-v1.0.onnx"
|
||||
voice: "jf_alpha"
|
||||
options:
|
||||
- lang_code:ja
|
||||
known_usecases:
|
||||
- tts
|
||||
files:
|
||||
- filename: "kokoro-v1.0.onnx"
|
||||
sha256: 7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5
|
||||
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"
|
||||
- filename: "voices-v1.0.bin"
|
||||
sha256: bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
|
||||
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"
|
||||
- name: "kokoros-cmn"
|
||||
url: "github:mudler/LocalAI/gallery/kokoros.yaml@master"
|
||||
size: "327MB"
|
||||
urls:
|
||||
- https://github.com/lucasjinreal/Kokoros
|
||||
license: apache-2.0
|
||||
tags:
|
||||
- tts
|
||||
- kokoros
|
||||
- chinese
|
||||
- text-to-speech
|
||||
description: |
|
||||
Kokoros Rust TTS - Mandarin Chinese.
|
||||
overrides:
|
||||
backend: "kokoros"
|
||||
name: "kokoros-cmn"
|
||||
description: "Kokoros Rust TTS - Mandarin Chinese"
|
||||
parameters:
|
||||
model: "kokoro-v1.0.onnx"
|
||||
voice: "zf_xiaobei"
|
||||
options:
|
||||
- lang_code:cmn
|
||||
known_usecases:
|
||||
- tts
|
||||
files:
|
||||
- filename: "kokoro-v1.0.onnx"
|
||||
sha256: 7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5
|
||||
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"
|
||||
- filename: "voices-v1.0.bin"
|
||||
sha256: bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
|
||||
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"
|
||||
- name: "kokoros-de"
|
||||
url: "github:mudler/LocalAI/gallery/kokoros.yaml@master"
|
||||
size: "327MB"
|
||||
urls:
|
||||
- https://github.com/lucasjinreal/Kokoros
|
||||
license: apache-2.0
|
||||
tags:
|
||||
- tts
|
||||
- kokoros
|
||||
- german
|
||||
- text-to-speech
|
||||
description: |
|
||||
Kokoros Rust TTS - German.
|
||||
overrides:
|
||||
backend: "kokoros"
|
||||
name: "kokoros-de"
|
||||
description: "Kokoros Rust TTS - German"
|
||||
parameters:
|
||||
model: "kokoro-v1.0.onnx"
|
||||
voice: "df_greta"
|
||||
options:
|
||||
- lang_code:de
|
||||
known_usecases:
|
||||
- tts
|
||||
files:
|
||||
- filename: "kokoro-v1.0.onnx"
|
||||
sha256: 7d5df8ecf7d4b1878015a32686053fd0eebe2bc377234608764cc0ef3636a6c5
|
||||
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/kokoro-v1.0.onnx"
|
||||
- filename: "voices-v1.0.bin"
|
||||
sha256: bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
|
||||
uri: "https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files-v1.0/voices-v1.0.bin"
|
||||
- name: "kitten-tts"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
|
||||
3
gallery/kokoros.yaml
Normal file
3
gallery/kokoros.yaml
Normal file
@@ -0,0 +1,3 @@
|
||||
---
|
||||
config_file: |
|
||||
backend: kokoros
|
||||
@@ -21,6 +21,9 @@ function inferBackendPath(item) {
|
||||
if (item.dockerfile.endsWith("golang")) {
|
||||
return `backend/go/${item.backend}/`;
|
||||
}
|
||||
if (item.dockerfile.endsWith("rust")) {
|
||||
return `backend/rust/${item.backend}/`;
|
||||
}
|
||||
if (item.dockerfile.endsWith("llama-cpp")) {
|
||||
return `backend/cpp/llama-cpp/`;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user