diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index fb32a52b9..2478f1ffc 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -1674,6 +1674,20 @@ jobs: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + # voxtral + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-voxtral' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "voxtral" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' #silero-vad - build-type: '' cuda-major-version: "" @@ -1945,6 +1959,10 @@ jobs: tag-suffix: "-metal-darwin-arm64-whisper" build-type: "metal" lang: "go" + - backend: "voxtral" + tag-suffix: "-metal-darwin-arm64-voxtral" + build-type: "metal" + lang: "go" - backend: "vibevoice" tag-suffix: "-metal-darwin-arm64-vibevoice" build-type: "mps" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index c1786e16d..79e43c499 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -30,6 +30,10 @@ jobs: variable: "PIPER_VERSION" branch: "master" file: "backend/go/piper/Makefile" + - repository: "antirez/voxtral.c" + variable: "VOXTRAL_VERSION" + branch: "main" + file: "backend/go/voxtral/Makefile" runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index a8c45f107..fee41fe7f 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -361,3 +361,34 @@ jobs: run: | make --jobs=5 --output-sync=target -C backend/python/voxcpm make --jobs=5 --output-sync=target -C backend/python/voxcpm test + tests-voxtral: + runs-on: ubuntu-latest + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Dependencies + run: | + sudo apt-get update + sudo 
apt-get install -y build-essential cmake curl libopenblas-dev ffmpeg + - name: Setup Go + uses: actions/setup-go@v5 + # You can test your matrix by printing the current Go version + - name: Display Go version + run: go version + - name: Proto Dependencies + run: | + # Install protoc + curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \ + unzip -j -d /usr/local/bin protoc.zip bin/protoc && \ + rm protoc.zip + go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 + go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af + PATH="$PATH:$HOME/go/bin" make protogen-go + - name: Build voxtral + run: | + make --jobs=5 --output-sync=target -C backend/go/voxtral + - name: Test voxtral + run: | + make --jobs=5 --output-sync=target -C backend/go/voxtral test diff --git a/Makefile b/Makefile index 4f56ef202..daefa00c7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro 
backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/voxtral GOCMD=go GOTEST=$(GOCMD) test @@ -453,6 +453,7 @@ BACKEND_HUGGINGFACE = huggingface|golang|.|false|true BACKEND_SILERO_VAD = silero-vad|golang|.|false|true BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|true BACKEND_WHISPER = whisper|golang|.|false|true +BACKEND_VOXTRAL = voxtral|golang|.|false|true # Python backends with root context BACKEND_RERANKERS = rerankers|python|.|false|true @@ -506,6 +507,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE))) $(eval $(call generate-docker-build-target,$(BACKEND_SILERO_VAD))) $(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML))) $(eval $(call generate-docker-build-target,$(BACKEND_WHISPER))) +$(eval $(call generate-docker-build-target,$(BACKEND_VOXTRAL))) $(eval $(call generate-docker-build-target,$(BACKEND_RERANKERS))) $(eval $(call generate-docker-build-target,$(BACKEND_TRANSFORMERS))) $(eval $(call generate-docker-build-target,$(BACKEND_OUTETTS))) @@ -533,7 +535,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_ACE_STEP))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm 
docker-build-whisperx docker-build-ace-step +docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/Dockerfile.golang b/backend/Dockerfile.golang index 5c7f33caf..fce4c7724 100644 --- a/backend/Dockerfile.golang +++ b/backend/Dockerfile.golang @@ -20,7 +20,7 @@ RUN apt-get update && \ build-essential \ git ccache \ ca-certificates \ - make cmake wget \ + make cmake wget libopenblas-dev \ curl unzip \ libssl-dev && \ apt-get clean && \ diff --git a/backend/go/voxtral/.gitignore b/backend/go/voxtral/.gitignore new file mode 100644 index 000000000..dc5a592fd --- /dev/null +++ b/backend/go/voxtral/.gitignore @@ -0,0 +1,9 @@ +.cache/ +sources/ +build/ +build-*/ +package/ +voxtral +*.so +*.dylib +compile_commands.json diff --git a/backend/go/voxtral/CMakeLists.txt b/backend/go/voxtral/CMakeLists.txt new file mode 100644 index 000000000..0ff7c37b0 --- /dev/null +++ b/backend/go/voxtral/CMakeLists.txt @@ -0,0 +1,84 @@ +cmake_minimum_required(VERSION 3.12) + +if(USE_METAL) + project(govoxtral LANGUAGES C OBJC) +else() + project(govoxtral LANGUAGES C) +endif() + +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Workaround: CMake + GCC linker depfile generation fails for MODULE libraries +set(CMAKE_C_LINKER_DEPFILE_SUPPORTED FALSE) + +# Build voxtral.c as a library +set(VOXTRAL_SOURCES + sources/voxtral.c/voxtral.c + sources/voxtral.c/voxtral_kernels.c + sources/voxtral.c/voxtral_audio.c + 
sources/voxtral.c/voxtral_encoder.c + sources/voxtral.c/voxtral_decoder.c + sources/voxtral.c/voxtral_tokenizer.c + sources/voxtral.c/voxtral_safetensors.c +) + +# Metal GPU acceleration (macOS arm64 only) +if(USE_METAL) + # Generate embedded shader header from .metal source via xxd + add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/sources/voxtral.c/voxtral_shaders_source.h + COMMAND xxd -i voxtral_shaders.metal > voxtral_shaders_source.h + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/sources/voxtral.c + DEPENDS sources/voxtral.c/voxtral_shaders.metal + COMMENT "Generating embedded Metal shaders header" + ) + list(APPEND VOXTRAL_SOURCES sources/voxtral.c/voxtral_metal.m) + set_source_files_properties(sources/voxtral.c/voxtral_metal.m PROPERTIES + COMPILE_FLAGS "-fobjc-arc" + ) +endif() + +add_library(govoxtral MODULE csrc/govoxtral.c ${VOXTRAL_SOURCES}) + +target_include_directories(govoxtral PRIVATE sources/voxtral.c csrc) + +target_compile_options(govoxtral PRIVATE -O3 -ffast-math) + +if(USE_METAL) + target_compile_definitions(govoxtral PRIVATE USE_BLAS USE_METAL ACCELERATE_NEW_LAPACK) + target_link_libraries(govoxtral PRIVATE + "-framework Accelerate" + "-framework Metal" + "-framework MetalPerformanceShaders" + "-framework MetalPerformanceShadersGraph" + "-framework Foundation" + "-framework AudioToolbox" + "-framework CoreFoundation" + m + ) + # Ensure the generated shader header is built before compiling + target_sources(govoxtral PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/sources/voxtral.c/voxtral_shaders_source.h + ) +elseif(USE_OPENBLAS) + # Try to find OpenBLAS; use it if available, otherwise fall back to pure C + find_package(BLAS) + if(BLAS_FOUND) + target_compile_definitions(govoxtral PRIVATE USE_BLAS USE_OPENBLAS) + target_link_libraries(govoxtral PRIVATE ${BLAS_LIBRARIES} m) + target_include_directories(govoxtral PRIVATE /usr/include/openblas) + else() + message(WARNING "OpenBLAS requested but not found, building without BLAS") + 
# Build rules for the voxtral backend.
#
# The Go binary (built with CGO disabled) loads one of several shared-library
# builds of voxtral.c at runtime. On x86_64 Linux three CPU variants are
# produced (avx, avx2, fallback); everywhere else a single "fallback" library
# is built with the compiler's default instruction set.
.NOTPARALLEL:

# The first rule in this file is the source checkout; make the backend binary
# the default goal so a bare `make` actually builds something.
.DEFAULT_GOAL := voxtral

.PHONY: build clean purge test all libgovoxtral-custom

CMAKE_ARGS?=
BUILD_TYPE?=
NATIVE?=true

GOCMD?=go
GO_TAGS?=
JOBS?=$(shell nproc --ignore=1 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)

# voxtral.c version
VOXTRAL_REPO?=https://github.com/antirez/voxtral.c
VOXTRAL_VERSION?=8f810dd23c44be5453cb46c92216a3eaab46e85f

# Detect OS and CPU architecture
UNAME_S := $(shell uname -s)
UNAME_M := $(shell uname -m)

# Non-empty only on 64-bit x86 hosts. The -mavx*/-mno-avx*/-march=x86-64
# switches below are x86-only and are rejected by GCC on other architectures
# (e.g. the linux/arm64 image build).
IS_X86_64 := $(filter x86_64 amd64,$(UNAME_M))

# Shared library extension
ifeq ($(UNAME_S),Darwin)
  SO_EXT=dylib
else
  SO_EXT=so
endif

SO_TARGET?=libgovoxtral.$(SO_EXT)

CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

ifeq ($(NATIVE),false)
ifneq ($(UNAME_S),Darwin)
ifneq ($(IS_X86_64),)
  CMAKE_ARGS+=-DCMAKE_C_FLAGS="-march=x86-64"
endif
endif
endif

ifeq ($(BUILD_TYPE),cublas)
  CMAKE_ARGS+=-DUSE_OPENBLAS=OFF
else ifeq ($(BUILD_TYPE),hipblas)
  CMAKE_ARGS+=-DUSE_OPENBLAS=OFF
else ifeq ($(BUILD_TYPE),metal)
  CMAKE_ARGS+=-DUSE_OPENBLAS=OFF -DUSE_METAL=ON
else ifeq ($(UNAME_S),Darwin)
  # Default on macOS: use Accelerate (no OpenBLAS needed)
  CMAKE_ARGS+=-DUSE_OPENBLAS=OFF
else
  CMAKE_ARGS+=-DUSE_OPENBLAS=ON
endif

# CPU variants only make sense on x86_64 Linux; other Linux hosts and every
# other OS build a single fallback library.
ifeq ($(UNAME_S),Linux)
ifneq ($(IS_X86_64),)
VARIANT_TARGETS = libgovoxtral-avx.so libgovoxtral-avx2.so libgovoxtral-fallback.so
else
VARIANT_TARGETS = libgovoxtral-fallback.so
endif
else ifeq ($(UNAME_S),Darwin)
VARIANT_TARGETS = libgovoxtral-fallback.dylib
else
VARIANT_TARGETS = libgovoxtral-fallback.so
endif

# Extra CMake arguments for the non-Darwin fallback build: explicitly switch
# the SIMD extensions off on x86_64, add nothing on other architectures.
ifneq ($(IS_X86_64),)
FALLBACK_CMAKE_ARGS=-DCMAKE_C_FLAGS='-mno-avx -mno-avx2 -mno-avx512f -mno-fma -mno-f16c'
else
FALLBACK_CMAKE_ARGS=
endif

# Check out voxtral.c at the pinned commit.
sources/voxtral.c:
	mkdir -p sources/voxtral.c
	cd sources/voxtral.c && \
	git init && \
	git remote add origin $(VOXTRAL_REPO) && \
	git fetch origin && \
	git checkout $(VOXTRAL_VERSION) && \
	git submodule update --init --recursive --depth 1 --single-branch

voxtral: main.go govoxtral.go $(VARIANT_TARGETS)
	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voxtral ./

package: voxtral
	bash package.sh

build: package

clean: purge
	rm -rf libgovoxtral*.so libgovoxtral*.dylib package sources/voxtral.c voxtral

purge:
	rm -rf build*

# x86 SIMD variants (only listed in VARIANT_TARGETS on x86_64 Linux)
ifeq ($(UNAME_S),Linux)
libgovoxtral-avx.so: sources/voxtral.c
	$(MAKE) purge
	$(info Building voxtral: avx)
	SO_TARGET=libgovoxtral-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DCMAKE_C_FLAGS='-mavx -mno-avx2 -mno-avx512f -mno-fma -mno-f16c'" $(MAKE) libgovoxtral-custom
	rm -rfv build*

libgovoxtral-avx2.so: sources/voxtral.c
	$(MAKE) purge
	$(info Building voxtral: avx2)
	SO_TARGET=libgovoxtral-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DCMAKE_C_FLAGS='-mavx -mavx2 -mfma -mf16c'" $(MAKE) libgovoxtral-custom
	rm -rfv build*
endif

# Build fallback variant
ifeq ($(UNAME_S),Darwin)
libgovoxtral-fallback.dylib: sources/voxtral.c
	$(MAKE) purge
	$(info Building voxtral: darwin fallback)
	SO_TARGET=libgovoxtral-fallback.dylib NATIVE=true $(MAKE) libgovoxtral-custom
	rm -rfv build*
else
libgovoxtral-fallback.so: sources/voxtral.c
	$(MAKE) purge
	$(info Building voxtral: fallback)
	SO_TARGET=libgovoxtral-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) $(FALLBACK_CMAKE_ARGS)" $(MAKE) libgovoxtral-custom
	rm -rfv build*
endif

# Configure and build one library in its own build directory, then move the
# result next to the Makefile under the name given by SO_TARGET.
libgovoxtral-custom: CMakeLists.txt csrc/govoxtral.c csrc/govoxtral.h
	mkdir -p build-$(SO_TARGET) && \
	cd build-$(SO_TARGET) && \
	cmake .. $(CMAKE_ARGS) && \
	cmake --build . --config Release -j$(JOBS) && \
	cd .. && \
	(mv build-$(SO_TARGET)/libgovoxtral.so ./$(SO_TARGET) 2>/dev/null || \
	 mv build-$(SO_TARGET)/libgovoxtral.dylib ./$(SO_TARGET) 2>/dev/null)

test: voxtral
	@echo "Running voxtral tests..."
	bash test.sh
	@echo "voxtral tests completed."

all: voxtral package
free_result(void); + +#endif /* GOVOXTRAL_H */ diff --git a/backend/go/voxtral/govoxtral.go b/backend/go/voxtral/govoxtral.go new file mode 100644 index 000000000..e5d40aa6b --- /dev/null +++ b/backend/go/voxtral/govoxtral.go @@ -0,0 +1,60 @@ +package main + +import ( + "fmt" + "os" + "strings" + + "github.com/mudler/LocalAI/pkg/grpc/base" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + "github.com/mudler/LocalAI/pkg/utils" +) + +var ( + CppLoadModel func(modelDir string) int + CppTranscribe func(wavPath string) string + CppFreeResult func() +) + +type Voxtral struct { + base.SingleThread +} + +func (v *Voxtral) Load(opts *pb.ModelOptions) error { + if ret := CppLoadModel(opts.ModelFile); ret != 0 { + return fmt.Errorf("failed to load Voxtral model from %s", opts.ModelFile) + } + return nil +} + +func (v *Voxtral) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) { + dir, err := os.MkdirTemp("", "voxtral") + if err != nil { + return pb.TranscriptResult{}, err + } + defer os.RemoveAll(dir) + + convertedPath := dir + "/converted.wav" + + if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil { + return pb.TranscriptResult{}, err + } + + result := strings.Clone(CppTranscribe(convertedPath)) + CppFreeResult() + + text := strings.TrimSpace(result) + + segments := []*pb.TranscriptSegment{} + if text != "" { + segments = append(segments, &pb.TranscriptSegment{ + Id: 0, + Text: text, + }) + } + + return pb.TranscriptResult{ + Segments: segments, + Text: text, + }, nil +} diff --git a/backend/go/voxtral/main.go b/backend/go/voxtral/main.go new file mode 100644 index 000000000..bc4ca6f72 --- /dev/null +++ b/backend/go/voxtral/main.go @@ -0,0 +1,53 @@ +package main + +// Note: this is started internally by LocalAI and a server is allocated for each model +import ( + "flag" + "os" + "runtime" + + "github.com/ebitengine/purego" + grpc "github.com/mudler/LocalAI/pkg/grpc" +) + +var ( + addr = flag.String("addr", "localhost:50051", "the 
address to connect to") +) + +type LibFuncs struct { + FuncPtr any + Name string +} + +func main() { + // Get library name from environment variable, default to fallback + libName := os.Getenv("VOXTRAL_LIBRARY") + if libName == "" { + if runtime.GOOS == "darwin" { + libName = "./libgovoxtral-fallback.dylib" + } else { + libName = "./libgovoxtral-fallback.so" + } + } + + gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL) + if err != nil { + panic(err) + } + + libFuncs := []LibFuncs{ + {&CppLoadModel, "load_model"}, + {&CppTranscribe, "transcribe"}, + {&CppFreeResult, "free_result"}, + } + + for _, lf := range libFuncs { + purego.RegisterLibFunc(lf.FuncPtr, gosd, lf.Name) + } + + flag.Parse() + + if err := grpc.StartServer(*addr, &Voxtral{}); err != nil { + panic(err) + } +} diff --git a/backend/go/voxtral/package.sh b/backend/go/voxtral/package.sh new file mode 100644 index 000000000..8465a36da --- /dev/null +++ b/backend/go/voxtral/package.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Script to copy the appropriate libraries based on architecture + +set -e + +CURDIR=$(dirname "$(realpath $0)") +REPO_ROOT="${CURDIR}/../../.." + +# Create lib directory +mkdir -p $CURDIR/package/lib + +cp -avf $CURDIR/voxtral $CURDIR/package/ +cp -fv $CURDIR/libgovoxtral-*.so $CURDIR/package/ 2>/dev/null || true +cp -fv $CURDIR/libgovoxtral-*.dylib $CURDIR/package/ 2>/dev/null || true +cp -fv $CURDIR/run.sh $CURDIR/package/ + +# Detect architecture and copy appropriate libraries +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + # x86_64 architecture + echo "Detected x86_64 architecture, copying x86_64 libraries..." 
+ cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 + # OpenBLAS if available + if [ -f /usr/lib/x86_64-linux-gnu/libopenblas.so.0 ]; then + cp -arfLv /usr/lib/x86_64-linux-gnu/libopenblas.so.0 $CURDIR/package/lib/ + fi +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + # ARM64 architecture + echo "Detected ARM64 architecture, copying ARM64 libraries..." + cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so + cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 + # OpenBLAS if available + if [ -f /usr/lib/aarch64-linux-gnu/libopenblas.so.0 ]; then + cp -arfLv /usr/lib/aarch64-linux-gnu/libopenblas.so.0 $CURDIR/package/lib/ + fi +elif [ $(uname -s) = "Darwin" ]; then + echo "Detected Darwin — system frameworks linked dynamically, no bundled libs 
#!/bin/bash
# Launcher for the packaged voxtral backend.
#
# Picks the shared-library variant to load (exported as VOXTRAL_LIBRARY),
# puts the bundled lib/ directory on the loader search path, and execs the
# backend binary - through the bundled lib/ld.so when one is present.
# All arguments are forwarded to the binary unchanged.
set -ex

# Get the absolute current dir where the script is located
CURDIR=$(dirname "$(realpath "$0")")

cd /

echo "CPU info:"
if [ "$(uname)" != "Darwin" ]; then
    grep -e "model\sname" /proc/cpuinfo | head -1
    grep -e "flags" /proc/cpuinfo | head -1
fi

if [ "$(uname)" = "Darwin" ]; then
    # macOS: single dylib variant (Metal or Accelerate)
    LIBRARY="$CURDIR/libgovoxtral-fallback.dylib"
    # ${VAR:+:$VAR} avoids a trailing ':' (an empty search-path entry, i.e.
    # the current directory) when the variable is unset.
    export DYLD_LIBRARY_PATH="$CURDIR/lib${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"
else
    LIBRARY="$CURDIR/libgovoxtral-fallback.so"

    if grep -q -e "\savx\s" /proc/cpuinfo ; then
        echo "CPU: AVX found OK"
        if [ -e "$CURDIR/libgovoxtral-avx.so" ]; then
            LIBRARY="$CURDIR/libgovoxtral-avx.so"
        fi
    fi

    # Checked after AVX so that AVX2 wins when both are available.
    if grep -q -e "\savx2\s" /proc/cpuinfo ; then
        echo "CPU: AVX2 found OK"
        if [ -e "$CURDIR/libgovoxtral-avx2.so" ]; then
            LIBRARY="$CURDIR/libgovoxtral-avx2.so"
        fi
    fi

    export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
fi

export VOXTRAL_LIBRARY="$LIBRARY"

# If there is a lib/ld.so, use it (Linux only)
if [ -f "$CURDIR/lib/ld.so" ]; then
    echo "Using lib/ld.so"
    echo "Using library: $LIBRARY"
    exec "$CURDIR/lib/ld.so" "$CURDIR/voxtral" "$@"
fi

echo "Using library: $LIBRARY"
exec "$CURDIR/voxtral" "$@"
#!/bin/bash
set -e

CURDIR=$(dirname "$(realpath "$0")")

echo "Running voxtral backend tests..."

# Inputs:
#   - VOXTRAL_MODEL_DIR: directory holding consolidated.safetensors,
#     params.json and tekken.json. Defaults to ./voxtral-model; any of those
#     files missing from the directory is downloaded from Hugging Face.
#   - VOXTRAL_BINARY: path to the voxtral binary, read by the Go tests
#     (they default to ./voxtral).

cd "$CURDIR"
export VOXTRAL_MODEL_DIR="${VOXTRAL_MODEL_DIR:-./voxtral-model}"

MODEL_ID="mistralai/Voxtral-Mini-4B-Realtime-2602"
BASE_URL="https://huggingface.co/${MODEL_ID}/resolve/main"

# Files to download
FILES=(
    "consolidated.safetensors"
    "params.json"
    "tekken.json"
)

if [ ! -d "$VOXTRAL_MODEL_DIR" ]; then
    echo "Creating voxtral-model directory for tests..."
    mkdir -p "$VOXTRAL_MODEL_DIR"
fi

echo "Model: ${MODEL_ID}"
echo ""

# Check every file on every run, so a directory left incomplete by an earlier
# interrupted run is topped up instead of being treated as ready.
for file in "${FILES[@]}"; do
    dest="${VOXTRAL_MODEL_DIR}/${file}"
    if [ -f "${dest}" ]; then
        echo "  [skip] ${file} (already exists)"
    else
        echo "  [download] ${file}..."
        # --fail: treat HTTP errors as failures instead of saving the error
        # page as a model file. Download to a .part file and rename only on
        # success so a partial download is never mistaken for a complete one.
        curl --fail -L -o "${dest}.part" "${BASE_URL}/${file}" --progress-bar || {
            rm -f "${dest}.part"
            echo "  [error] failed to download ${file}" >&2
            exit 1
        }
        mv "${dest}.part" "${dest}"
        echo "  [done] ${file}"
    fi
done

# Run Go tests
go test -v -timeout 300s ./...

echo "All voxtral tests passed."
diff --git a/backend/go/voxtral/voxtral_test.go b/backend/go/voxtral/voxtral_test.go new file mode 100644 index 000000000..018b332a3 --- /dev/null +++ b/backend/go/voxtral/voxtral_test.go @@ -0,0 +1,201 @@ +package main + +import ( + "context" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +const ( + testAddr = "localhost:50051" + sampleAudio = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav" + startupWait = 5 * time.Second +) + +func skipIfNoModel(t *testing.T) string { + t.Helper() + modelDir := os.Getenv("VOXTRAL_MODEL_DIR") + if modelDir == "" { + t.Skip("VOXTRAL_MODEL_DIR not set, skipping test (set to voxtral model directory)") + } + if _, err := os.Stat(filepath.Join(modelDir, "consolidated.safetensors")); os.IsNotExist(err) { + t.Skipf("Model file not found in %s, skipping", modelDir) + } + return modelDir +} + +func startServer(t *testing.T) *exec.Cmd { + t.Helper() + binary := os.Getenv("VOXTRAL_BINARY") + if binary == "" { + binary = "./voxtral" + } + if _, err := os.Stat(binary); os.IsNotExist(err) { + t.Skipf("Backend binary not found at %s, skipping", binary) + } + cmd := exec.Command(binary, "--addr", testAddr) + cmd.Stdout = os.Stderr + cmd.Stderr = os.Stderr + if err := cmd.Start(); err != nil { + t.Fatalf("Failed to start server: %v", err) + } + time.Sleep(startupWait) + return cmd +} + +func stopServer(cmd *exec.Cmd) { + if cmd != nil && cmd.Process != nil { + cmd.Process.Kill() + cmd.Wait() + } +} + +func dialGRPC(t *testing.T) *grpc.ClientConn { + t.Helper() + conn, err := grpc.Dial(testAddr, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithDefaultCallOptions( + grpc.MaxCallRecvMsgSize(50*1024*1024), + grpc.MaxCallSendMsgSize(50*1024*1024), + ), + ) + if err != nil { + t.Fatalf("Failed to dial gRPC: %v", err) + } + 
return conn +} + +func downloadFile(url, dest string) error { + resp, err := http.Get(url) + if err != nil { + return fmt.Errorf("HTTP GET failed: %w", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return fmt.Errorf("bad status: %s", resp.Status) + } + f, err := os.Create(dest) + if err != nil { + return err + } + defer f.Close() + _, err = io.Copy(f, resp.Body) + return err +} + +func TestServerHealth(t *testing.T) { + cmd := startServer(t) + defer stopServer(cmd) + + conn := dialGRPC(t) + defer conn.Close() + + client := pb.NewBackendClient(conn) + resp, err := client.Health(context.Background(), &pb.HealthMessage{}) + if err != nil { + t.Fatalf("Health check failed: %v", err) + } + if string(resp.Message) != "OK" { + t.Fatalf("Expected OK, got %s", string(resp.Message)) + } +} + +func TestLoadModel(t *testing.T) { + modelDir := skipIfNoModel(t) + cmd := startServer(t) + defer stopServer(cmd) + + conn := dialGRPC(t) + defer conn.Close() + + client := pb.NewBackendClient(conn) + resp, err := client.LoadModel(context.Background(), &pb.ModelOptions{ + ModelFile: modelDir, + }) + if err != nil { + t.Fatalf("LoadModel failed: %v", err) + } + if !resp.Success { + t.Fatalf("LoadModel returned failure: %s", resp.Message) + } +} + +func TestAudioTranscription(t *testing.T) { + modelDir := skipIfNoModel(t) + + tmpDir, err := os.MkdirTemp("", "voxtral-test") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tmpDir) + + // Download sample audio — JFK "ask not what your country can do for you" clip + audioFile := filepath.Join(tmpDir, "sample.wav") + t.Log("Downloading sample audio...") + if err := downloadFile(sampleAudio, audioFile); err != nil { + t.Fatalf("Failed to download sample audio: %v", err) + } + + cmd := startServer(t) + defer stopServer(cmd) + + conn := dialGRPC(t) + defer conn.Close() + + client := pb.NewBackendClient(conn) + + // Load model + loadResp, err := client.LoadModel(context.Background(), &pb.ModelOptions{ + 
# Gallery template for the voxtral speech-to-text backend; the concrete image
# entries merge this anchor and override name/uri/mirrors.
- &voxtral
  name: "voxtral"
  alias: "voxtral"
  license: mit
  description: |
    Voxtral Realtime 4B Pure C speech-to-text inference engine
  urls:
    # Same upstream repository the backend Makefile (VOXTRAL_REPO) and the
    # dependency-bump workflow use.
    - https://github.com/antirez/voxtral.c
  tags:
    - audio-transcription
    - CPU
    - Metal
  capabilities:
    default: "cpu-voxtral"
    metal-darwin-arm64: "metal-voxtral"
<<: *voxtral + name: "cpu-voxtral" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voxtral" + mirrors: + - localai/localai-backends:latest-cpu-voxtral +- !!merge <<: *voxtral + name: "cpu-voxtral-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voxtral" + mirrors: + - localai/localai-backends:master-cpu-voxtral +- !!merge <<: *voxtral + name: "metal-voxtral" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voxtral" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-voxtral +- !!merge <<: *voxtral + name: "metal-voxtral-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voxtral" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-voxtral diff --git a/docs/content/features/backends.md b/docs/content/features/backends.md index 50821e992..a63461e7a 100644 --- a/docs/content/features/backends.md +++ b/docs/content/features/backends.md @@ -122,3 +122,4 @@ LocalAI supports various types of backends: - **Diffusion Backends**: For image generation - **TTS Backends**: For text-to-speech conversion - **Whisper Backends**: For speech-to-text conversion +- **Sound Generation Backends**: For music and audio generation (e.g., ACE-Step) \ No newline at end of file