mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-13 08:03:18 -05:00)

Compare commits: v3.11.0 ... fix/step-f (35 commits)
Commits (newest first; author and date columns were empty in this capture):
1f0110368d, 2fd026e958, 08718b656e, 7121b189f7, f6c80a6987, 4a4d65f8e8, 2858e71606, 088205339c, 8616397d59, 1698f92bd0, 02c95a57ae, 2ab6be1d0c, 9d78ec1bd8, b10b85de52, 1479bee894, cff972094c, 79a25f7ae9, 7270a98ce5, 0ee92317ec, 743d2d1947, df04843f34, 780877d1d0, 08eeed61f4, 5207ff84dc, 4ade2e61ab, 818be98314, 056c438452, 0c040beb59, bf5a1dd840, f44200bec8, 3b1b08efd6, 3d8791067f, da8207b73b, aa9ca401fa, e43c0c3ffc
.env (3 changes)
@@ -26,6 +26,9 @@
 ## Disables COMPEL (Diffusers)
 # COMPEL=0

+## Disables SD_EMBED (Diffusers)
+# SD_EMBED=0
+
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true
.github/workflows/backend.yml (vendored, 18 changes)
@@ -1674,6 +1674,20 @@ jobs:
          dockerfile: "./backend/Dockerfile.golang"
          context: "./"
          ubuntu-version: '2404'
+        # voxtral
+        - build-type: ''
+          cuda-major-version: ""
+          cuda-minor-version: ""
+          platforms: 'linux/amd64,linux/arm64'
+          tag-latest: 'auto'
+          tag-suffix: '-cpu-voxtral'
+          runs-on: 'ubuntu-latest'
+          base-image: "ubuntu:24.04"
+          skip-drivers: 'false'
+          backend: "voxtral"
+          dockerfile: "./backend/Dockerfile.golang"
+          context: "./"
+          ubuntu-version: '2404'
         #silero-vad
         - build-type: ''
           cuda-major-version: ""
@@ -1945,6 +1959,10 @@ jobs:
          tag-suffix: "-metal-darwin-arm64-whisper"
          build-type: "metal"
          lang: "go"
+        - backend: "voxtral"
+          tag-suffix: "-metal-darwin-arm64-voxtral"
+          build-type: "metal"
+          lang: "go"
         - backend: "vibevoice"
           tag-suffix: "-metal-darwin-arm64-vibevoice"
           build-type: "mps"
.github/workflows/bump_deps.yaml (vendored, 4 changes)
@@ -30,6 +30,10 @@ jobs:
          variable: "PIPER_VERSION"
          branch: "master"
          file: "backend/go/piper/Makefile"
+        - repository: "antirez/voxtral.c"
+          variable: "VOXTRAL_VERSION"
+          branch: "main"
+          file: "backend/go/voxtral/Makefile"
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
.github/workflows/test-extra.yml (vendored, 31 changes)
@@ -361,3 +361,34 @@ jobs:
       run: |
         make --jobs=5 --output-sync=target -C backend/python/voxcpm
         make --jobs=5 --output-sync=target -C backend/python/voxcpm test
+  tests-voxtral:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Clone
+      uses: actions/checkout@v6
+      with:
+        submodules: true
+    - name: Dependencies
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y build-essential cmake curl libopenblas-dev ffmpeg
+    - name: Setup Go
+      uses: actions/setup-go@v5
+    # You can test your matrix by printing the current Go version
+    - name: Display Go version
+      run: go version
+    - name: Proto Dependencies
+      run: |
+        # Install protoc
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+        go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+        go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+        PATH="$PATH:$HOME/go/bin" make protogen-go
+    - name: Build voxtral
+      run: |
+        make --jobs=5 --output-sync=target -C backend/go/voxtral
+    - name: Test voxtral
+      run: |
+        make --jobs=5 --output-sync=target -C backend/go/voxtral test
Makefile (6 changes)
@@ -1,5 +1,5 @@
 # Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/voxtral

 GOCMD=go
 GOTEST=$(GOCMD) test
@@ -453,6 +453,7 @@ BACKEND_HUGGINGFACE = huggingface|golang|.|false|true
 BACKEND_SILERO_VAD = silero-vad|golang|.|false|true
 BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|true
 BACKEND_WHISPER = whisper|golang|.|false|true
+BACKEND_VOXTRAL = voxtral|golang|.|false|true

 # Python backends with root context
 BACKEND_RERANKERS = rerankers|python|.|false|true
@@ -506,6 +507,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
 $(eval $(call generate-docker-build-target,$(BACKEND_SILERO_VAD)))
 $(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML)))
 $(eval $(call generate-docker-build-target,$(BACKEND_WHISPER)))
+$(eval $(call generate-docker-build-target,$(BACKEND_VOXTRAL)))
 $(eval $(call generate-docker-build-target,$(BACKEND_RERANKERS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_TRANSFORMERS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_OUTETTS)))
@@ -533,7 +535,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_ACE_STEP)))
 docker-save-%: backend-images
	docker save local-ai-backend:$* -o backend-images/$*.tar

-docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step
+docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral

 ########################################################
 ### Mock Backend for E2E Tests
@@ -20,7 +20,7 @@ RUN apt-get update && \
     build-essential \
     git ccache \
     ca-certificates \
-    make cmake wget \
+    make cmake wget libopenblas-dev \
     curl unzip \
     libssl-dev && \
     apt-get clean && \
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=8872ad2125336d209a9911a82101f80095a9831d
+LLAMA_VERSION?=338085c69e486b7155e5b03d7b5087e02c0e2528
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
@@ -294,6 +294,76 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
     return data;
 }

+static bool template_uses_arguments_items_filter(const std::string & template_src) {
+    return template_src.find("arguments|items") != std::string::npos ||
+           template_src.find("arguments | items") != std::string::npos ||
+           template_src.find("arguments| items") != std::string::npos ||
+           template_src.find("arguments |items") != std::string::npos;
+}
+
+static void normalize_tool_call_arguments_for_template(
+        json & messages,
+        const std::string & template_src,
+        const char * request_name)
+{
+    if (!messages.is_array() || !template_uses_arguments_items_filter(template_src)) {
+        return;
+    }
+
+    size_t converted = 0;
+    size_t failed = 0;
+
+    for (auto & message : messages) {
+        if (!message.is_object() || !message.contains("tool_calls") || !message["tool_calls"].is_array()) {
+            continue;
+        }
+
+        for (auto & tool_call : message["tool_calls"]) {
+            if (!tool_call.is_object() || !tool_call.contains("function") || !tool_call["function"].is_object()) {
+                continue;
+            }
+
+            auto & function = tool_call["function"];
+            if (!function.contains("arguments")) {
+                continue;
+            }
+
+            auto & arguments = function["arguments"];
+            if (!arguments.is_string()) {
+                continue;
+            }
+
+            const std::string args_str = arguments.get<std::string>();
+            if (args_str.empty()) {
+                arguments = json::object();
+                converted++;
+                continue;
+            }
+
+            try {
+                json parsed_args = json::parse(args_str);
+                if (parsed_args.is_object()) {
+                    arguments = parsed_args;
+                    converted++;
+                }
+            } catch (const json::parse_error &) {
+                failed++;
+            }
+        }
+    }
+
+    if (converted > 0) {
+        SRV_INF("[TOOLS DEBUG] %s: Converted %zu tool call argument strings to JSON objects for arguments|items template compatibility\n",
+                request_name,
+                converted);
+    }
+    if (failed > 0) {
+        SRV_WRN("[TOOLS DEBUG] %s: Failed to parse %zu tool call argument strings as JSON for arguments|items template compatibility\n",
+                request_name,
+                failed);
+    }
+}
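Why this normalization exists: some chat templates pipe tool-call arguments through Jinja's `items` filter, which only works on a mapping, while OpenAI-style requests serialize `function.arguments` as a JSON-encoded string. The helper above parses those strings into objects before templating. A minimal Go sketch of the same transformation, for illustration only (the message shape is hypothetical, not LocalAI's actual types):

package main

import (
	"encoding/json"
	"fmt"
)

// normalizeArgs parses JSON-encoded "arguments" strings into objects so a
// template using `arguments|items` can iterate over them.
func normalizeArgs(msg map[string]any) {
	calls, _ := msg["tool_calls"].([]any)
	for _, c := range calls {
		cm, ok := c.(map[string]any)
		if !ok {
			continue
		}
		fn, ok := cm["function"].(map[string]any)
		if !ok {
			continue
		}
		s, ok := fn["arguments"].(string)
		if !ok {
			continue // already an object, or missing
		}
		if s == "" {
			fn["arguments"] = map[string]any{} // empty string -> empty object
			continue
		}
		var obj map[string]any
		if err := json.Unmarshal([]byte(s), &obj); err == nil {
			fn["arguments"] = obj
		} // on a parse error the string is left as-is (the C++ code counts it as failed)
	}
}

func main() {
	msg := map[string]any{
		"role": "assistant",
		"tool_calls": []any{
			map[string]any{"function": map[string]any{
				"name":      "get_weather",
				"arguments": `{"city":"Rome"}`,
			}},
		},
	}
	normalizeArgs(msg)
	out, _ := json.Marshal(msg)
	fmt.Println(string(out)) // "arguments" is now an object, not a string
}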

 const std::vector<ggml_type> kv_cache_types = {
     GGML_TYPE_F32,
@@ -1255,6 +1325,11 @@ public:
             body_json["add_generation_prompt"] = data["add_generation_prompt"];
         }

+        if (body_json.contains("messages") && ctx_server.impl->chat_params.tmpls) {
+            const auto template_src = common_chat_templates_source(ctx_server.impl->chat_params.tmpls.get());
+            normalize_tool_call_arguments_for_template(body_json["messages"], template_src, "PredictStream");
+        }
+
         // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
         SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());

@@ -1986,6 +2061,11 @@ public:
             body_json["add_generation_prompt"] = data["add_generation_prompt"];
         }

+        if (body_json.contains("messages") && ctx_server.impl->chat_params.tmpls) {
+            const auto template_src = common_chat_templates_source(ctx_server.impl->chat_params.tmpls.get());
+            normalize_tool_call_arguments_for_template(body_json["messages"], template_src, "Predict");
+        }
+
         // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
         SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());
backend/go/stablediffusion-ggml/.gitignore (vendored, 2 changes)
@@ -2,5 +2,5 @@ package/
 sources/
 .cache/
 build/
-libgosd.so
+*.so
 stablediffusion-ggml
@@ -66,15 +66,18 @@ sources/stablediffusion-ggml.cpp:
	git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
	git submodule update --init --recursive --depth 1 --single-branch

-libgosd.so: sources/stablediffusion-ggml.cpp CMakeLists.txt gosd.cpp gosd.h
-	mkdir -p build && \
-	cd build && \
-	cmake .. $(CMAKE_ARGS) && \
-	cmake --build . --config Release -j$(JOBS) && \
-	cd .. && \
-	mv build/libgosd.so ./
+# Detect OS
+UNAME_S := $(shell uname -s)

-stablediffusion-ggml: main.go gosd.go libgosd.so
+# Only build CPU variants on Linux
+ifeq ($(UNAME_S),Linux)
+VARIANT_TARGETS = libgosd-avx.so libgosd-avx2.so libgosd-avx512.so libgosd-fallback.so
+else
+# On non-Linux (e.g., Darwin), build only fallback variant
+VARIANT_TARGETS = libgosd-fallback.so
+endif
+
+stablediffusion-ggml: main.go gosd.go $(VARIANT_TARGETS)
	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o stablediffusion-ggml ./

 package: stablediffusion-ggml
@@ -82,5 +85,46 @@ package: stablediffusion-ggml

 build: package

-clean:
-	rm -rf libgosd.so build stablediffusion-ggml package sources
+clean: purge
+	rm -rf libgosd*.so stablediffusion-ggml package sources
+
+purge:
+	rm -rf build*
+
+# Build all variants (Linux only)
+ifeq ($(UNAME_S),Linux)
+libgosd-avx.so: sources/stablediffusion-ggml.cpp
+	$(MAKE) purge
+	$(info ${GREEN}I stablediffusion-ggml build info:avx${RESET})
+	SO_TARGET=libgosd-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgosd-custom
+	rm -rfv build*
+
+libgosd-avx2.so: sources/stablediffusion-ggml.cpp
+	$(MAKE) purge
+	$(info ${GREEN}I stablediffusion-ggml build info:avx2${RESET})
+	SO_TARGET=libgosd-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgosd-custom
+	rm -rfv build*
+
+libgosd-avx512.so: sources/stablediffusion-ggml.cpp
+	$(MAKE) purge
+	$(info ${GREEN}I stablediffusion-ggml build info:avx512${RESET})
+	SO_TARGET=libgosd-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgosd-custom
+	rm -rfv build*
+endif
+
+# Build fallback variant (all platforms)
+libgosd-fallback.so: sources/stablediffusion-ggml.cpp
+	$(MAKE) purge
+	$(info ${GREEN}I stablediffusion-ggml build info:fallback${RESET})
+	SO_TARGET=libgosd-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgosd-custom
+	rm -rfv build*
+
+libgosd-custom: CMakeLists.txt gosd.cpp gosd.h
+	mkdir -p build-$(SO_TARGET) && \
+	cd build-$(SO_TARGET) && \
+	cmake .. $(CMAKE_ARGS) && \
+	cmake --build . --config Release -j$(JOBS) && \
+	cd .. && \
+	mv build-$(SO_TARGET)/libgosd.so ./$(SO_TARGET)
+
+all: stablediffusion-ggml package
@@ -2,6 +2,7 @@ package main

 import (
 	"flag"
+	"os"

 	"github.com/ebitengine/purego"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
@@ -17,7 +18,13 @@ type LibFuncs struct {
 }

 func main() {
-	gosd, err := purego.Dlopen("./libgosd.so", purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	// Get library name from environment variable, default to fallback
+	libName := os.Getenv("SD_LIBRARY")
+	if libName == "" {
+		libName = "./libgosd-fallback.so"
+	}
+
+	gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
 	if err != nil {
 		panic(err)
 	}
@@ -11,7 +11,7 @@ REPO_ROOT="${CURDIR}/../../.."
 # Create lib directory
 mkdir -p $CURDIR/package/lib

-cp -avf $CURDIR/libgosd.so $CURDIR/package/
+cp -avf $CURDIR/libgosd-*.so $CURDIR/package/
 cp -avf $CURDIR/stablediffusion-ggml $CURDIR/package/
 cp -fv $CURDIR/run.sh $CURDIR/package/
@@ -1,14 +1,52 @@
 #!/bin/bash
 set -ex

 # Get the absolute current dir where the script is located
 CURDIR=$(dirname "$(realpath $0)")

 cd /

+echo "CPU info:"
+if [ "$(uname)" != "Darwin" ]; then
+    grep -e "model\sname" /proc/cpuinfo | head -1
+    grep -e "flags" /proc/cpuinfo | head -1
+fi
+
+LIBRARY="$CURDIR/libgosd-fallback.so"
+
+if [ "$(uname)" != "Darwin" ]; then
+    if grep -q -e "\savx\s" /proc/cpuinfo ; then
+        echo "CPU: AVX found OK"
+        if [ -e $CURDIR/libgosd-avx.so ]; then
+            LIBRARY="$CURDIR/libgosd-avx.so"
+        fi
+    fi
+
+    if grep -q -e "\savx2\s" /proc/cpuinfo ; then
+        echo "CPU: AVX2 found OK"
+        if [ -e $CURDIR/libgosd-avx2.so ]; then
+            LIBRARY="$CURDIR/libgosd-avx2.so"
+        fi
+    fi
+
+    # Check avx 512
+    if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
+        echo "CPU: AVX512F found OK"
+        if [ -e $CURDIR/libgosd-avx512.so ]; then
+            LIBRARY="$CURDIR/libgosd-avx512.so"
+        fi
+    fi
+fi
+
 export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+export SD_LIBRARY=$LIBRARY

 # If there is a lib/ld.so, use it
 if [ -f $CURDIR/lib/ld.so ]; then
     echo "Using lib/ld.so"
+    echo "Using library: $LIBRARY"
     exec $CURDIR/lib/ld.so $CURDIR/stablediffusion-ggml "$@"
 fi

-exec $CURDIR/stablediffusion-ggml "$@"
+echo "Using library: $LIBRARY"
+exec $CURDIR/stablediffusion-ggml "$@"
backend/go/voxtral/.gitignore (vendored, new file, 9 lines)
@@ -0,0 +1,9 @@
+.cache/
+sources/
+build/
+build-*/
+package/
+voxtral
+*.so
+*.dylib
+compile_commands.json
backend/go/voxtral/CMakeLists.txt (new file, 84 lines)
@@ -0,0 +1,84 @@
+cmake_minimum_required(VERSION 3.12)
+
+if(USE_METAL)
+    project(govoxtral LANGUAGES C OBJC)
+else()
+    project(govoxtral LANGUAGES C)
+endif()
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+# Workaround: CMake + GCC linker depfile generation fails for MODULE libraries
+set(CMAKE_C_LINKER_DEPFILE_SUPPORTED FALSE)
+
+# Build voxtral.c as a library
+set(VOXTRAL_SOURCES
+    sources/voxtral.c/voxtral.c
+    sources/voxtral.c/voxtral_kernels.c
+    sources/voxtral.c/voxtral_audio.c
+    sources/voxtral.c/voxtral_encoder.c
+    sources/voxtral.c/voxtral_decoder.c
+    sources/voxtral.c/voxtral_tokenizer.c
+    sources/voxtral.c/voxtral_safetensors.c
+)
+
+# Metal GPU acceleration (macOS arm64 only)
+if(USE_METAL)
+    # Generate embedded shader header from .metal source via xxd
+    add_custom_command(
+        OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/sources/voxtral.c/voxtral_shaders_source.h
+        COMMAND xxd -i voxtral_shaders.metal > voxtral_shaders_source.h
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/sources/voxtral.c
+        DEPENDS sources/voxtral.c/voxtral_shaders.metal
+        COMMENT "Generating embedded Metal shaders header"
+    )
+    list(APPEND VOXTRAL_SOURCES sources/voxtral.c/voxtral_metal.m)
+    set_source_files_properties(sources/voxtral.c/voxtral_metal.m PROPERTIES
+        COMPILE_FLAGS "-fobjc-arc"
+    )
+endif()
+
+add_library(govoxtral MODULE csrc/govoxtral.c ${VOXTRAL_SOURCES})
+
+target_include_directories(govoxtral PRIVATE sources/voxtral.c csrc)
+
+target_compile_options(govoxtral PRIVATE -O3 -ffast-math)
+
+if(USE_METAL)
+    target_compile_definitions(govoxtral PRIVATE USE_BLAS USE_METAL ACCELERATE_NEW_LAPACK)
+    target_link_libraries(govoxtral PRIVATE
+        "-framework Accelerate"
+        "-framework Metal"
+        "-framework MetalPerformanceShaders"
+        "-framework MetalPerformanceShadersGraph"
+        "-framework Foundation"
+        "-framework AudioToolbox"
+        "-framework CoreFoundation"
+        m
+    )
+    # Ensure the generated shader header is built before compiling
+    target_sources(govoxtral PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/sources/voxtral.c/voxtral_shaders_source.h
+    )
+elseif(USE_OPENBLAS)
+    # Try to find OpenBLAS; use it if available, otherwise fall back to pure C
+    find_package(BLAS)
+    if(BLAS_FOUND)
+        target_compile_definitions(govoxtral PRIVATE USE_BLAS USE_OPENBLAS)
+        target_link_libraries(govoxtral PRIVATE ${BLAS_LIBRARIES} m)
+        target_include_directories(govoxtral PRIVATE /usr/include/openblas)
+    else()
+        message(WARNING "OpenBLAS requested but not found, building without BLAS")
+        target_link_libraries(govoxtral PRIVATE m)
+    endif()
+elseif(APPLE)
+    # macOS without Metal: use Accelerate framework
+    target_compile_definitions(govoxtral PRIVATE USE_BLAS ACCELERATE_NEW_LAPACK)
+    target_link_libraries(govoxtral PRIVATE "-framework Accelerate" m)
+else()
+    target_link_libraries(govoxtral PRIVATE m)
+endif()
+
+set_property(TARGET govoxtral PROPERTY C_STANDARD 11)
+set_target_properties(govoxtral PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
backend/go/voxtral/Makefile (new file, 107 lines)
@@ -0,0 +1,107 @@
+.NOTPARALLEL:
+
+CMAKE_ARGS?=
+BUILD_TYPE?=
+NATIVE?=true
+
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc --ignore=1 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+
+# voxtral.c version
+VOXTRAL_REPO?=https://github.com/antirez/voxtral.c
+VOXTRAL_VERSION?=c9e8773a2042d67c637fc492c8a655c485354080
+
+# Detect OS
+UNAME_S := $(shell uname -s)
+
+# Shared library extension
+ifeq ($(UNAME_S),Darwin)
+SO_EXT=dylib
+else
+SO_EXT=so
+endif
+
+SO_TARGET?=libgovoxtral.$(SO_EXT)
+
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
+
+ifeq ($(NATIVE),false)
+ifneq ($(UNAME_S),Darwin)
+CMAKE_ARGS+=-DCMAKE_C_FLAGS="-march=x86-64"
+endif
+endif
+
+ifeq ($(BUILD_TYPE),cublas)
+CMAKE_ARGS+=-DUSE_OPENBLAS=OFF
+else ifeq ($(BUILD_TYPE),hipblas)
+CMAKE_ARGS+=-DUSE_OPENBLAS=OFF
+else ifeq ($(BUILD_TYPE),metal)
+CMAKE_ARGS+=-DUSE_OPENBLAS=OFF -DUSE_METAL=ON
+else ifeq ($(UNAME_S),Darwin)
+# Default on macOS: use Accelerate (no OpenBLAS needed)
+CMAKE_ARGS+=-DUSE_OPENBLAS=OFF
+else
+CMAKE_ARGS+=-DUSE_OPENBLAS=ON
+endif
+
+# Single library target
+ifeq ($(UNAME_S),Darwin)
+VARIANT_TARGETS = libgovoxtral.dylib
+else
+VARIANT_TARGETS = libgovoxtral.so
+endif
+
+sources/voxtral.c:
+	mkdir -p sources/voxtral.c
+	cd sources/voxtral.c && \
+	git init && \
+	git remote add origin $(VOXTRAL_REPO) && \
+	git fetch origin && \
+	git checkout $(VOXTRAL_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+voxtral: main.go govoxtral.go $(VARIANT_TARGETS)
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voxtral ./
+
+package: voxtral
+	bash package.sh
+
+build: package
+
+clean: purge
+	rm -rf libgovoxtral.so libgovoxtral.dylib package sources/voxtral.c voxtral
+
+purge:
+	rm -rf build*
+
+# Build single library
+ifeq ($(UNAME_S),Darwin)
+libgovoxtral.dylib: sources/voxtral.c
+	$(MAKE) purge
+	$(info Building voxtral: darwin)
+	SO_TARGET=libgovoxtral.dylib NATIVE=true $(MAKE) libgovoxtral-custom
+	rm -rfv build*
+else
+libgovoxtral.so: sources/voxtral.c
+	$(MAKE) purge
+	$(info Building voxtral)
+	SO_TARGET=libgovoxtral.so $(MAKE) libgovoxtral-custom
+	rm -rfv build*
+endif
+
+libgovoxtral-custom: CMakeLists.txt csrc/govoxtral.c csrc/govoxtral.h
+	mkdir -p build-$(SO_TARGET) && \
+	cd build-$(SO_TARGET) && \
+	cmake .. $(CMAKE_ARGS) && \
+	cmake --build . --config Release -j$(JOBS) && \
+	cd .. && \
+	(mv build-$(SO_TARGET)/libgovoxtral.so ./$(SO_TARGET) 2>/dev/null || \
+	 mv build-$(SO_TARGET)/libgovoxtral.dylib ./$(SO_TARGET) 2>/dev/null)
+
+test: voxtral
+	@echo "Running voxtral tests..."
+	bash test.sh
+	@echo "voxtral tests completed."
+
+all: voxtral package
backend/go/voxtral/csrc/govoxtral.c (new file, 62 lines)
@@ -0,0 +1,62 @@
+#include "govoxtral.h"
+#include "voxtral.h"
+#include "voxtral_audio.h"
+#ifdef USE_METAL
+#include "voxtral_metal.h"
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+static vox_ctx_t *ctx = NULL;
+static char *last_result = NULL;
+static int metal_initialized = 0;
+
+int load_model(const char *model_dir) {
+    if (ctx != NULL) {
+        vox_free(ctx);
+        ctx = NULL;
+    }
+
+#ifdef USE_METAL
+    if (!metal_initialized) {
+        vox_metal_init();
+        metal_initialized = 1;
+    }
+#endif
+
+    ctx = vox_load(model_dir);
+    if (ctx == NULL) {
+        fprintf(stderr, "error: failed to load voxtral model from %s\n", model_dir);
+        return 1;
+    }
+
+    return 0;
+}
+
+const char *transcribe(const char *wav_path) {
+    if (ctx == NULL) {
+        fprintf(stderr, "error: model not loaded\n");
+        return "";
+    }
+
+    if (last_result != NULL) {
+        free(last_result);
+        last_result = NULL;
+    }
+
+    last_result = vox_transcribe(ctx, wav_path);
+    if (last_result == NULL) {
+        fprintf(stderr, "error: transcription failed for %s\n", wav_path);
+        return "";
+    }
+
+    return last_result;
+}
+
+void free_result(void) {
+    if (last_result != NULL) {
+        free(last_result);
+        last_result = NULL;
+    }
+}
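Note the ownership contract of this C API: `transcribe` returns a buffer owned by the C side, which is invalidated on the next `transcribe` call or by `free_result`. The Go caller therefore has to copy the string before releasing it, which is why `govoxtral.go` below wraps the call in `strings.Clone`. A minimal sketch of the expected call sequence from Go (paths are illustrative; the `Cpp*` bindings are the ones registered in this backend's `main.go`):

// Sketch only: assumes CppLoadModel/CppTranscribe/CppFreeResult have been
// bound with purego as in this backend's main.go.
if ret := CppLoadModel("/models/voxtral"); ret != 0 {
	panic("voxtral model load failed")
}
text := strings.Clone(CppTranscribe("/tmp/audio.wav")) // copy before freeing
CppFreeResult()                                        // invalidates the C-owned buffer
fmt.Println(text)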
backend/go/voxtral/csrc/govoxtral.h (new file, 8 lines)
@@ -0,0 +1,8 @@
+#ifndef GOVOXTRAL_H
+#define GOVOXTRAL_H
+
+extern int load_model(const char *model_dir);
+extern const char *transcribe(const char *wav_path);
+extern void free_result(void);
+
+#endif /* GOVOXTRAL_H */
backend/go/voxtral/govoxtral.go (new file, 60 lines)
@@ -0,0 +1,60 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/utils"
+)
+
+var (
+	CppLoadModel  func(modelDir string) int
+	CppTranscribe func(wavPath string) string
+	CppFreeResult func()
+)
+
+type Voxtral struct {
+	base.SingleThread
+}
+
+func (v *Voxtral) Load(opts *pb.ModelOptions) error {
+	if ret := CppLoadModel(opts.ModelFile); ret != 0 {
+		return fmt.Errorf("failed to load Voxtral model from %s", opts.ModelFile)
+	}
+	return nil
+}
+
+func (v *Voxtral) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
+	dir, err := os.MkdirTemp("", "voxtral")
+	if err != nil {
+		return pb.TranscriptResult{}, err
+	}
+	defer os.RemoveAll(dir)
+
+	convertedPath := dir + "/converted.wav"
+
+	if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil {
+		return pb.TranscriptResult{}, err
+	}
+
+	result := strings.Clone(CppTranscribe(convertedPath))
+	CppFreeResult()
+
+	text := strings.TrimSpace(result)
+
+	segments := []*pb.TranscriptSegment{}
+	if text != "" {
+		segments = append(segments, &pb.TranscriptSegment{
+			Id:   0,
+			Text: text,
+		})
+	}
+
+	return pb.TranscriptResult{
+		Segments: segments,
+		Text:     text,
+	}, nil
+}
backend/go/voxtral/main.go (new file, 53 lines)
@@ -0,0 +1,53 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+import (
+	"flag"
+	"os"
+	"runtime"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+type LibFuncs struct {
+	FuncPtr any
+	Name    string
+}
+
+func main() {
+	// Get library name from environment variable, default to fallback
+	libName := os.Getenv("VOXTRAL_LIBRARY")
+	if libName == "" {
+		if runtime.GOOS == "darwin" {
+			libName = "./libgovoxtral.dylib"
+		} else {
+			libName = "./libgovoxtral.so"
+		}
+	}
+
+	gosd, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(err)
+	}
+
+	libFuncs := []LibFuncs{
+		{&CppLoadModel, "load_model"},
+		{&CppTranscribe, "transcribe"},
+		{&CppFreeResult, "free_result"},
+	}
+
+	for _, lf := range libFuncs {
+		purego.RegisterLibFunc(lf.FuncPtr, gosd, lf.Name)
+	}
+
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &Voxtral{}); err != nil {
+		panic(err)
+	}
+}
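`purego.RegisterLibFunc` fills each Go function variable with a trampoline to the named C symbol, so the Go signatures must be compatible with the declarations in `govoxtral.h`. A self-contained sketch of the same pattern with made-up library and symbol names:

package main

import (
	"fmt"

	"github.com/ebitengine/purego"
)

// Hypothetical binding: C `int32_t add(int32_t a, int32_t b)` from libmath.so.
var add func(a, b int32) int32

func main() {
	lib, err := purego.Dlopen("./libmath.so", purego.RTLD_NOW|purego.RTLD_GLOBAL)
	if err != nil {
		panic(err)
	}
	purego.RegisterLibFunc(&add, lib, "add")
	fmt.Println(add(2, 3)) // 5
}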
backend/go/voxtral/package.sh (new file, 68 lines)
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+REPO_ROOT="${CURDIR}/../../.."
+
+# Create lib directory
+mkdir -p $CURDIR/package/lib
+
+cp -avf $CURDIR/voxtral $CURDIR/package/
+cp -fv $CURDIR/libgovoxtral-*.so $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/libgovoxtral-*.dylib $CURDIR/package/ 2>/dev/null || true
+cp -fv $CURDIR/run.sh $CURDIR/package/
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+    # OpenBLAS if available
+    if [ -f /usr/lib/x86_64-linux-gnu/libopenblas.so.0 ]; then
+        cp -arfLv /usr/lib/x86_64-linux-gnu/libopenblas.so.0 $CURDIR/package/lib/
+    fi
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+    # OpenBLAS if available
+    if [ -f /usr/lib/aarch64-linux-gnu/libopenblas.so.0 ]; then
+        cp -arfLv /usr/lib/aarch64-linux-gnu/libopenblas.so.0 $CURDIR/package/lib/
+    fi
+elif [ $(uname -s) = "Darwin" ]; then
+    echo "Detected Darwin: system frameworks linked dynamically, no bundled libs needed"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries based on BUILD_TYPE
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
+ls -liah $CURDIR/package/
+ls -liah $CURDIR/package/lib/
backend/go/voxtral/run.sh (new file, 49 lines)
@@ -0,0 +1,49 @@
+#!/bin/bash
+set -ex
+
+# Get the absolute current dir where the script is located
+CURDIR=$(dirname "$(realpath $0)")
+
+cd /
+
+echo "CPU info:"
+if [ "$(uname)" != "Darwin" ]; then
+    grep -e "model\sname" /proc/cpuinfo | head -1
+    grep -e "flags" /proc/cpuinfo | head -1
+fi
+
+if [ "$(uname)" = "Darwin" ]; then
+    # macOS: single dylib variant (Metal or Accelerate)
+    LIBRARY="$CURDIR/libgovoxtral-fallback.dylib"
+    export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
+else
+    LIBRARY="$CURDIR/libgovoxtral-fallback.so"
+
+    if grep -q -e "\savx\s" /proc/cpuinfo ; then
+        echo "CPU: AVX found OK"
+        if [ -e $CURDIR/libgovoxtral-avx.so ]; then
+            LIBRARY="$CURDIR/libgovoxtral-avx.so"
+        fi
+    fi
+
+    if grep -q -e "\savx2\s" /proc/cpuinfo ; then
+        echo "CPU: AVX2 found OK"
+        if [ -e $CURDIR/libgovoxtral-avx2.so ]; then
+            LIBRARY="$CURDIR/libgovoxtral-avx2.so"
+        fi
+    fi
+
+    export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+fi
+
+export VOXTRAL_LIBRARY=$LIBRARY
+
+# If there is a lib/ld.so, use it (Linux only)
+if [ -f $CURDIR/lib/ld.so ]; then
+    echo "Using lib/ld.so"
+    echo "Using library: $LIBRARY"
+    exec $CURDIR/lib/ld.so $CURDIR/voxtral "$@"
+fi
+
+echo "Using library: $LIBRARY"
+exec $CURDIR/voxtral "$@"
backend/go/voxtral/test.sh (new file, 48 lines)
@@ -0,0 +1,48 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+echo "Running voxtral backend tests..."
+
+# The test requires:
+# - VOXTRAL_MODEL_DIR: path to directory containing consolidated.safetensors + tekken.json
+# - VOXTRAL_BINARY: path to the voxtral binary (defaults to ./voxtral)
+#
+# Tests that require the model will be skipped if VOXTRAL_MODEL_DIR is not set.
+
+cd "$CURDIR"
+export VOXTRAL_MODEL_DIR="${VOXTRAL_MODEL_DIR:-./voxtral-model}"
+
+if [ ! -d "$VOXTRAL_MODEL_DIR" ]; then
+    echo "Creating voxtral-model directory for tests..."
+    mkdir -p "$VOXTRAL_MODEL_DIR"
+    MODEL_ID="mistralai/Voxtral-Mini-4B-Realtime-2602"
+    echo "Model: ${MODEL_ID}"
+    echo ""
+
+    # Files to download
+    FILES=(
+        "consolidated.safetensors"
+        "params.json"
+        "tekken.json"
+    )
+
+    BASE_URL="https://huggingface.co/${MODEL_ID}/resolve/main"
+
+    for file in "${FILES[@]}"; do
+        dest="${VOXTRAL_MODEL_DIR}/${file}"
+        if [ -f "${dest}" ]; then
+            echo "  [skip] ${file} (already exists)"
+        else
+            echo "  [download] ${file}..."
+            curl -L -o "${dest}" "${BASE_URL}/${file}" --progress-bar
+            echo "  [done] ${file}"
+        fi
+    done
+fi
+
+# Run Go tests
+go test -v -timeout 300s ./...
+
+echo "All voxtral tests passed."
backend/go/voxtral/voxtral_test.go (new file, 201 lines)
@@ -0,0 +1,201 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"testing"
+	"time"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+)
+
+const (
+	testAddr    = "localhost:50051"
+	sampleAudio = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
+	startupWait = 5 * time.Second
+)
+
+func skipIfNoModel(t *testing.T) string {
+	t.Helper()
+	modelDir := os.Getenv("VOXTRAL_MODEL_DIR")
+	if modelDir == "" {
+		t.Skip("VOXTRAL_MODEL_DIR not set, skipping test (set to voxtral model directory)")
+	}
+	if _, err := os.Stat(filepath.Join(modelDir, "consolidated.safetensors")); os.IsNotExist(err) {
+		t.Skipf("Model file not found in %s, skipping", modelDir)
+	}
+	return modelDir
+}
+
+func startServer(t *testing.T) *exec.Cmd {
+	t.Helper()
+	binary := os.Getenv("VOXTRAL_BINARY")
+	if binary == "" {
+		binary = "./voxtral"
+	}
+	if _, err := os.Stat(binary); os.IsNotExist(err) {
+		t.Skipf("Backend binary not found at %s, skipping", binary)
+	}
+	cmd := exec.Command(binary, "--addr", testAddr)
+	cmd.Stdout = os.Stderr
+	cmd.Stderr = os.Stderr
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("Failed to start server: %v", err)
+	}
+	time.Sleep(startupWait)
+	return cmd
+}
+
+func stopServer(cmd *exec.Cmd) {
+	if cmd != nil && cmd.Process != nil {
+		cmd.Process.Kill()
+		cmd.Wait()
+	}
+}
+
+func dialGRPC(t *testing.T) *grpc.ClientConn {
+	t.Helper()
+	conn, err := grpc.Dial(testAddr,
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+		grpc.WithDefaultCallOptions(
+			grpc.MaxCallRecvMsgSize(50*1024*1024),
+			grpc.MaxCallSendMsgSize(50*1024*1024),
+		),
+	)
+	if err != nil {
+		t.Fatalf("Failed to dial gRPC: %v", err)
+	}
+	return conn
+}
+
+func downloadFile(url, dest string) error {
+	resp, err := http.Get(url)
+	if err != nil {
+		return fmt.Errorf("HTTP GET failed: %w", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("bad status: %s", resp.Status)
+	}
+	f, err := os.Create(dest)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	_, err = io.Copy(f, resp.Body)
+	return err
+}
+
+func TestServerHealth(t *testing.T) {
+	cmd := startServer(t)
+	defer stopServer(cmd)
+
+	conn := dialGRPC(t)
+	defer conn.Close()
+
+	client := pb.NewBackendClient(conn)
+	resp, err := client.Health(context.Background(), &pb.HealthMessage{})
+	if err != nil {
+		t.Fatalf("Health check failed: %v", err)
+	}
+	if string(resp.Message) != "OK" {
+		t.Fatalf("Expected OK, got %s", string(resp.Message))
+	}
+}
+
+func TestLoadModel(t *testing.T) {
+	modelDir := skipIfNoModel(t)
+	cmd := startServer(t)
+	defer stopServer(cmd)
+
+	conn := dialGRPC(t)
+	defer conn.Close()
+
+	client := pb.NewBackendClient(conn)
+	resp, err := client.LoadModel(context.Background(), &pb.ModelOptions{
+		ModelFile: modelDir,
+	})
+	if err != nil {
+		t.Fatalf("LoadModel failed: %v", err)
+	}
+	if !resp.Success {
+		t.Fatalf("LoadModel returned failure: %s", resp.Message)
+	}
+}
+
+func TestAudioTranscription(t *testing.T) {
+	modelDir := skipIfNoModel(t)
+
+	tmpDir, err := os.MkdirTemp("", "voxtral-test")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(tmpDir)
+
+	// Download the sample audio (an English ASR sample clip that contains the word "big")
+	audioFile := filepath.Join(tmpDir, "sample.wav")
+	t.Log("Downloading sample audio...")
+	if err := downloadFile(sampleAudio, audioFile); err != nil {
+		t.Fatalf("Failed to download sample audio: %v", err)
+	}
+
+	cmd := startServer(t)
+	defer stopServer(cmd)
+
+	conn := dialGRPC(t)
+	defer conn.Close()
+
+	client := pb.NewBackendClient(conn)
+
+	// Load model
+	loadResp, err := client.LoadModel(context.Background(), &pb.ModelOptions{
+		ModelFile: modelDir,
+	})
+	if err != nil {
+		t.Fatalf("LoadModel failed: %v", err)
+	}
+	if !loadResp.Success {
+		t.Fatalf("LoadModel returned failure: %s", loadResp.Message)
+	}
+
+	// Transcribe
+	transcriptResp, err := client.AudioTranscription(context.Background(), &pb.TranscriptRequest{
+		Dst: audioFile,
+	})
+	if err != nil {
+		t.Fatalf("AudioTranscription failed: %v", err)
+	}
+	if transcriptResp == nil {
+		t.Fatal("AudioTranscription returned nil")
+	}
+
+	t.Logf("Transcribed text: %s", transcriptResp.Text)
+	t.Logf("Number of segments: %d", len(transcriptResp.Segments))
+
+	if transcriptResp.Text == "" {
+		t.Fatal("Transcription returned empty text")
+	}
+
+	allText := strings.ToLower(transcriptResp.Text)
+	for _, seg := range transcriptResp.Segments {
+		allText += " " + strings.ToLower(seg.Text)
+	}
+	t.Logf("All text: %s", allText)
+
+	if !strings.Contains(allText, "big") {
+		t.Errorf("Expected 'big' in transcription, got: %s", allText)
+	}
+
+	// The sample audio should contain recognizable speech
+	if len(allText) < 10 {
+		t.Errorf("Transcription too short: %q", allText)
+	}
+}
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=941bdabbe4561bc6de68981aea01bc5ab05781c5
+WHISPER_CPP_VERSION?=764482c3175d9c3bc6089c1ec84df7d1b9537d83
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -88,19 +88,19 @@ ifeq ($(UNAME_S),Linux)
 libgowhisper-avx.so: sources/whisper.cpp
	$(MAKE) purge
	$(info ${GREEN}I whisper build info:avx${RESET})
-	SO_TARGET=libgowhisper-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) libgowhisper-custom
+	SO_TARGET=libgowhisper-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgowhisper-custom
	rm -rfv build*

 libgowhisper-avx2.so: sources/whisper.cpp
	$(MAKE) purge
	$(info ${GREEN}I whisper build info:avx2${RESET})
-	SO_TARGET=libgowhisper-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) libgowhisper-custom
+	SO_TARGET=libgowhisper-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgowhisper-custom
	rm -rfv build*

 libgowhisper-avx512.so: sources/whisper.cpp
	$(MAKE) purge
	$(info ${GREEN}I whisper build info:avx512${RESET})
-	SO_TARGET=libgowhisper-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) libgowhisper-custom
+	SO_TARGET=libgowhisper-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgowhisper-custom
	rm -rfv build*
 endif

@@ -108,7 +108,7 @@ endif
 libgowhisper-fallback.so: sources/whisper.cpp
	$(MAKE) purge
	$(info ${GREEN}I whisper build info:fallback${RESET})
-	SO_TARGET=libgowhisper-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) libgowhisper-custom
+	SO_TARGET=libgowhisper-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgowhisper-custom
	rm -rfv build*

 libgowhisper-custom: CMakeLists.txt gowhisper.cpp gowhisper.h
@@ -56,6 +56,21 @@
       nvidia-cuda-12: "cuda12-whisper"
       nvidia-l4t-cuda-12: "nvidia-l4t-arm64-whisper"
       nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-whisper"
+- &voxtral
+  name: "voxtral"
+  alias: "voxtral"
+  license: mit
+  description: |
+    Voxtral Realtime 4B Pure C speech-to-text inference engine
+  urls:
+  - https://github.com/mudler/voxtral.c
+  tags:
+  - audio-transcription
+  - CPU
+  - Metal
+  capabilities:
+    default: "cpu-voxtral"
+    metal-darwin-arm64: "metal-voxtral"
 - &stablediffusionggml
   name: "stablediffusion-ggml"
   alias: "stablediffusion-ggml"
@@ -2594,3 +2609,24 @@
       uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-pocket-tts"
       mirrors:
       - localai/localai-backends:master-metal-darwin-arm64-pocket-tts
+## voxtral
+- !!merge <<: *voxtral
+  name: "cpu-voxtral"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voxtral"
+  mirrors:
+  - localai/localai-backends:latest-cpu-voxtral
+- !!merge <<: *voxtral
+  name: "cpu-voxtral-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voxtral"
+  mirrors:
+  - localai/localai-backends:master-cpu-voxtral
+- !!merge <<: *voxtral
+  name: "metal-voxtral"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voxtral"
+  mirrors:
+  - localai/localai-backends:latest-metal-darwin-arm64-voxtral
+- !!merge <<: *voxtral
+  name: "metal-voxtral-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voxtral"
+  mirrors:
+  - localai/localai-backends:master-metal-darwin-arm64-voxtral
@@ -115,6 +115,7 @@ Available pipelines: AnimateDiffPipeline, AnimateDiffVideoToVideoPipeline, ...
 | Variable | Default | Description |
 |----------|---------|-------------|
 | `COMPEL` | `0` | Enable Compel for prompt weighting |
+| `SD_EMBED` | `0` | Enable sd_embed for prompt weighting |
 | `XPU` | `0` | Enable Intel XPU support |
 | `CLIPSKIP` | `1` | Enable CLIP skip support |
 | `SAFETENSORS` | `1` | Use safetensors format |
@@ -40,6 +40,7 @@ from compel import Compel, ReturnedEmbeddingsType
 from optimum.quanto import freeze, qfloat8, quantize
 from transformers import T5EncoderModel
 from safetensors.torch import load_file
+from sd_embed.embedding_funcs import get_weighted_text_embeddings_sd15, get_weighted_text_embeddings_sdxl, get_weighted_text_embeddings_sd3, get_weighted_text_embeddings_flux1

 # Import LTX-2 specific utilities
 from diffusers.pipelines.ltx2.export_utils import encode_video as ltx2_encode_video
@@ -47,6 +48,7 @@ from diffusers import LTX2VideoTransformer3DModel, GGUFQuantizationConfig

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 COMPEL = os.environ.get("COMPEL", "0") == "1"
+SD_EMBED = os.environ.get("SD_EMBED", "0") == "1"
 XPU = os.environ.get("XPU", "0") == "1"
 CLIPSKIP = os.environ.get("CLIPSKIP", "1") == "1"
 SAFETENSORS = os.environ.get("SAFETENSORS", "1") == "1"
@@ -737,6 +739,51 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             kwargs["prompt_embeds"] = conditioning
             kwargs["pooled_prompt_embeds"] = pooled
             # pass the kwargs dictionary to the self.pipe method
             image = self.pipe(
                 guidance_scale=self.cfg_scale,
                 **kwargs
             ).images[0]
+        elif SD_EMBED:
+            if self.PipelineType == "StableDiffusionPipeline":
+                (
+                    kwargs["prompt_embeds"],
+                    kwargs["negative_prompt_embeds"],
+                ) = get_weighted_text_embeddings_sd15(
+                    pipe = self.pipe,
+                    prompt = prompt,
+                    neg_prompt = request.negative_prompt if hasattr(request, 'negative_prompt') else None,
+                )
+            if self.PipelineType == "StableDiffusionXLPipeline":
+                (
+                    kwargs["prompt_embeds"],
+                    kwargs["negative_prompt_embeds"],
+                    kwargs["pooled_prompt_embeds"],
+                    kwargs["negative_pooled_prompt_embeds"],
+                ) = get_weighted_text_embeddings_sdxl(
+                    pipe = self.pipe,
+                    prompt = prompt,
+                    neg_prompt = request.negative_prompt if hasattr(request, 'negative_prompt') else None
+                )
+            if self.PipelineType == "StableDiffusion3Pipeline":
+                (
+                    kwargs["prompt_embeds"],
+                    kwargs["negative_prompt_embeds"],
+                    kwargs["pooled_prompt_embeds"],
+                    kwargs["negative_pooled_prompt_embeds"],
+                ) = get_weighted_text_embeddings_sd3(
+                    pipe = self.pipe,
+                    prompt = prompt,
+                    neg_prompt = request.negative_prompt if hasattr(request, 'negative_prompt') else None
+                )
+            if self.PipelineType == "FluxTransformer2DModel":
+                (
+                    kwargs["prompt_embeds"],
+                    kwargs["pooled_prompt_embeds"],
+                ) = get_weighted_text_embeddings_flux1(
+                    pipe = self.pipe,
+                    prompt = prompt,
+                )
+
+            image = self.pipe(
+                guidance_scale=self.cfg_scale,
+                **kwargs
@@ -5,6 +5,7 @@ transformers
 torchvision==0.22.1
 accelerate
 compel
+git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 torch==2.7.1

@@ -5,6 +5,7 @@ transformers
 torchvision
 accelerate
 compel
+git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 torch

@@ -5,6 +5,7 @@ transformers
 torchvision
 accelerate
 compel
+git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 torch

@@ -8,6 +8,7 @@ opencv-python
 transformers
 accelerate
 compel
+git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 optimum-quanto

@@ -4,6 +4,7 @@ git+https://github.com/huggingface/diffusers
 transformers
 accelerate
 compel
+git+https://github.com/xhinker/sd_embed
 peft
 optimum-quanto
 numpy<2

@@ -4,6 +4,7 @@ git+https://github.com/huggingface/diffusers
 transformers
 accelerate
 compel
+git+https://github.com/xhinker/sd_embed
 peft
 optimum-quanto
 numpy<2

@@ -5,6 +5,7 @@ opencv-python
 transformers
 accelerate
 compel
+git+https://github.com/xhinker/sd_embed
 peft
 sentencepiece
 optimum-quanto
@@ -10,7 +10,11 @@ import sys
 import os
 import backend_pb2
 import backend_pb2_grpc
-import moonshine_onnx
+from moonshine_voice import (
+    Transcriber,
+    get_model_for_language,
+    load_wav_file,
+)

 import grpc

@@ -25,16 +29,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
     """
     BackendServicer is the class that implements the gRPC service
     """
+    def __init__(self):
+        self.transcriber = None
+        self.model_name = None
+
     def Health(self, request, context):
         return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

     def LoadModel(self, request, context):
         try:
             print("Preparing models, please wait", file=sys.stderr)
             # Store the model name for use in transcription
             # Model name format: e.g., "moonshine/tiny"
             self.model_name = request.Model
+            print(f"Model name set to: {self.model_name}", file=sys.stderr)
+
+            # Default values
+            language = "en"
+            model_arch = None
+
+            # Parse options from request
+            options = request.Options
+            self.options = {}
+
+            # The options are a list of strings in this form optname:optvalue
+            for opt in options:
+                if ":" not in opt:
+                    continue
+                key, value = opt.split(":", 1)
+                self.options[key] = value
+
+            print(f"Options: {self.options}", file=sys.stderr)
+
+            # Extract language and model_arch from options
+            if "language" in self.options:
+                language = self.options["language"]
+            if "model_arch" in self.options:
+                model_arch = self.options["model_arch"]
+
+            # Get the model path and architecture
+            model_path, model_arch = get_model_for_language(language, model_arch)
+            print(f"Loading model: {model_path} with architecture: {model_arch} for language: {language}", file=sys.stderr)
+
+            # Initialize the transcriber
+            self.transcriber = Transcriber(model_path=model_path, model_arch=model_arch)
+            print("Model loaded successfully", file=sys.stderr)
         except Exception as err:
             return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
         return backend_pb2.Result(message="Model loaded successfully", success=True)

@@ -43,33 +80,44 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         resultSegments = []
         text = ""
         try:
-            # moonshine_onnx.transcribe returns a list of strings
-            transcriptions = moonshine_onnx.transcribe(request.dst, self.model_name)
+            if self.transcriber is None:
+                raise Exception("Model not loaded. Call LoadModel first.")
+
+            # Load the audio file
+            audio_data, sample_rate = load_wav_file(request.dst)
+            print(f"Loaded audio file: {request.dst} with sample rate: {sample_rate}", file=sys.stderr)
+
+            # Transcribe without streaming
+            transcript = self.transcriber.transcribe_without_streaming(
+                audio_data, sample_rate=sample_rate, flags=0
+            )
+
+            # Process transcript lines
+            full_text_parts = []
+            for idx, line in enumerate(transcript.lines):
+                line_text = line.text.strip()
+                full_text_parts.append(line_text)
+
+                # Create segment with timing information
+                start_ms = int(line.start_time * 1000)
+                end_ms = int((line.start_time + line.duration) * 1000)
+
+                resultSegments.append(backend_pb2.TranscriptSegment(
+                    id=idx,
+                    start=start_ms,
+                    end=end_ms,
+                    text=line_text
+                ))
+
+                print(f"Segment {idx}: [{line.start_time:.2f}s - {line.start_time + line.duration:.2f}s] {line_text}", file=sys.stderr)

-            # Combine all transcriptions into a single text
-            if isinstance(transcriptions, list):
-                text = " ".join(transcriptions)
-                # Create segments for each transcription in the list
-                for id, trans in enumerate(transcriptions):
-                    # Since moonshine doesn't provide timing info, we'll create a single segment
-                    # with id and text, using approximate timing
-                    resultSegments.append(backend_pb2.TranscriptSegment(
-                        id=id,
-                        start=0,
-                        end=0,
-                        text=trans
-                    ))
-            else:
-                # Handle case where it's not a list (shouldn't happen, but be safe)
-                text = str(transcriptions)
-                resultSegments.append(backend_pb2.TranscriptSegment(
-                    id=0,
-                    start=0,
-                    end=0,
-                    text=text
-                ))
+            text = " ".join(full_text_parts)

         except Exception as err:
             print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
+            import traceback
+            traceback.print_exc()
             return backend_pb2.TranscriptResult(segments=[], text="")

         return backend_pb2.TranscriptResult(segments=resultSegments, text=text)
@@ -1,4 +1,4 @@
grpcio==1.71.0
protobuf
grpcio-tools
useful-moonshine-onnx@git+https://git@github.com/moonshine-ai/moonshine.git#subdirectory=moonshine-onnx
moonshine-voice
@@ -1,4 +1,4 @@
grpcio==1.71.0
protobuf
grpcio-tools
useful-moonshine-onnx@git+https://git@github.com/moonshine-ai/moonshine.git#subdirectory=moonshine-onnx
moonshine-voice
@@ -112,7 +112,7 @@ class TestBackendServicer(unittest.TestCase):
        self.assertGreaterEqual(len(transcript_response.segments), 0)

        # Verify the transcription contains the expected text
        expected_text = "This is the micro machine man presenting the most midget miniature"
        expected_text = "This is the micro machine man"
        self.assertIn(
            expected_text.lower(),
            transcript_response.text.lower(),
@@ -1,3 +1,4 @@
setuptools
grpcio==1.76.0
protobuf
certifi
@@ -76,42 +76,35 @@ func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
	}
}

// TODO: either in the next PR or the next commit, I want to merge these down into a single function that looks at the first few characters of the file to determine if we need to deserialize to []BackendConfig or BackendConfig
func readMultipleModelConfigsFromFile(file string, opts ...ConfigLoaderOption) ([]*ModelConfig, error) {
	c := &[]*ModelConfig{}
// readModelConfigsFromFile reads a config file that may contain either a single
// ModelConfig or an array of ModelConfigs. It tries to unmarshal as an array first,
// then falls back to a single config if that fails.
func readModelConfigsFromFile(file string, opts ...ConfigLoaderOption) ([]*ModelConfig, error) {
	f, err := os.ReadFile(file)
	if err != nil {
		return nil, fmt.Errorf("readMultipleModelConfigsFromFile cannot read config file %q: %w", file, err)
	}
	if err := yaml.Unmarshal(f, c); err != nil {
		return nil, fmt.Errorf("readMultipleModelConfigsFromFile cannot unmarshal config file %q: %w", file, err)
		return nil, fmt.Errorf("readModelConfigsFromFile cannot read config file %q: %w", file, err)
	}

	for _, cc := range *c {
		cc.modelConfigFile = file
		cc.SetDefaults(opts...)
	// Try to unmarshal as array first
	var configs []*ModelConfig
	if err := yaml.Unmarshal(f, &configs); err == nil && len(configs) > 0 {
		for _, cc := range configs {
			cc.modelConfigFile = file
			cc.SetDefaults(opts...)
		}
		return configs, nil
	}

	return *c, nil
}

func readModelConfigFromFile(file string, opts ...ConfigLoaderOption) (*ModelConfig, error) {
	lo := &LoadOptions{}
	lo.Apply(opts...)

	// Fall back to single config
	c := &ModelConfig{}
	f, err := os.ReadFile(file)
	if err != nil {
		return nil, fmt.Errorf("readModelConfigFromFile cannot read config file %q: %w", file, err)
	}
	if err := yaml.Unmarshal(f, c); err != nil {
		return nil, fmt.Errorf("readModelConfigFromFile cannot unmarshal config file %q: %w", file, err)
		return nil, fmt.Errorf("readModelConfigsFromFile cannot unmarshal config file %q: %w", file, err)
	}

	c.SetDefaults(opts...)

	c.modelConfigFile = file
	return c, nil
	c.SetDefaults(opts...)

	return []*ModelConfig{c}, nil
}

// Load a config file for a model
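The array-first, single-fallback decoding above is easiest to see in isolation. A minimal, self-contained sketch of the same technique — using gopkg.in/yaml.v3 and a hypothetical Entry type rather than LocalAI's internal ModelConfig — behaves like this:

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// Entry is a hypothetical stand-in for ModelConfig.
type Entry struct {
	Name string `yaml:"name"`
}

// readEntries tries to decode a YAML sequence first; a single mapping
// cannot unmarshal into a slice, so on failure it falls back to decoding
// one document and wrapping it in a one-element slice.
func readEntries(data []byte) ([]*Entry, error) {
	var list []*Entry
	if err := yaml.Unmarshal(data, &list); err == nil && len(list) > 0 {
		return list, nil
	}
	single := &Entry{}
	if err := yaml.Unmarshal(data, single); err != nil {
		return nil, fmt.Errorf("cannot unmarshal config: %w", err)
	}
	return []*Entry{single}, nil
}

func main() {
	for _, in := range [][]byte{
		[]byte("- name: a\n- name: b\n"), // array form  -> 2 entries
		[]byte("name: only\n"),           // single form -> 1 entry
	} {
		entries, err := readEntries(in)
		fmt.Println(len(entries), err)
	}
}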
@@ -163,7 +156,7 @@ func (bcl *ModelConfigLoader) LoadModelConfigFileByNameDefaultOptions(modelName
func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, opts ...ConfigLoaderOption) error {
	bcl.Lock()
	defer bcl.Unlock()
	c, err := readMultipleModelConfigsFromFile(file, opts...)
	c, err := readModelConfigsFromFile(file, opts...)
	if err != nil {
		return fmt.Errorf("cannot load config file: %w", err)
	}
@@ -181,11 +174,18 @@ func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, op
func (bcl *ModelConfigLoader) ReadModelConfig(file string, opts ...ConfigLoaderOption) error {
	bcl.Lock()
	defer bcl.Unlock()
	c, err := readModelConfigFromFile(file, opts...)
	configs, err := readModelConfigsFromFile(file, opts...)
	if err != nil {
		return fmt.Errorf("ReadModelConfig cannot read config file %q: %w", file, err)
	}
	if len(configs) == 0 {
		return fmt.Errorf("ReadModelConfig: no configs found in file %q", file)
	}
	if len(configs) > 1 {
		xlog.Warn("ReadModelConfig: read more than one config from file, only using first", "file", file, "configs", len(configs))
	}

	c := configs[0]
	if valid, err := c.Validate(); valid {
		bcl.configs[c.Name] = *c
	} else {
@@ -375,15 +375,23 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
			strings.HasPrefix(file.Name(), ".") {
			continue
		}
		c, err := readModelConfigFromFile(filepath.Join(path, file.Name()), opts...)

		filePath := filepath.Join(path, file.Name())

		// Read config(s) - handles both single and array formats
		configs, err := readModelConfigsFromFile(filePath, opts...)
		if err != nil {
			xlog.Error("LoadModelConfigsFromPath cannot read config file", "error", err, "File Name", file.Name())
			continue
		}
		if valid, validationErr := c.Validate(); valid {
			bcl.configs[c.Name] = *c
		} else {
			xlog.Error("config is not valid", "error", validationErr, "Name", c.Name)

		// Validate and store each config
		for _, c := range configs {
			if valid, validationErr := c.Validate(); valid {
				bcl.configs[c.Name] = *c
			} else {
				xlog.Error("config is not valid", "error", validationErr, "Name", c.Name)
			}
		}
	}
@@ -25,7 +25,8 @@ known_usecases:
- COMPLETION
`)
			Expect(err).ToNot(HaveOccurred())
			config, err := readModelConfigFromFile(tmp.Name())
			configs, err := readModelConfigsFromFile(tmp.Name())
			config := configs[0]
			Expect(err).To(BeNil())
			Expect(config).ToNot(BeNil())
			valid, err := config.Validate()
@@ -43,7 +44,8 @@ backend: "foo-bar"
parameters:
  model: "foo-bar"`)
			Expect(err).ToNot(HaveOccurred())
			config, err := readModelConfigFromFile(tmp.Name())
			configs, err := readModelConfigsFromFile(tmp.Name())
			config := configs[0]
			Expect(err).To(BeNil())
			Expect(config).ToNot(BeNil())
			// two configs in config.yaml
@@ -62,7 +64,8 @@ parameters:
			defer os.Remove(tmp.Name())
			_, err = io.Copy(tmp, resp.Body)
			Expect(err).To(BeNil())
			config, err = readModelConfigFromFile(tmp.Name())
			configs, err = readModelConfigsFromFile(tmp.Name())
			config = configs[0]
			Expect(err).To(BeNil())
			Expect(config).ToNot(BeNil())
			// two configs in config.yaml
@@ -188,7 +191,8 @@ mcp:
  }
}`)
			Expect(err).ToNot(HaveOccurred())
			config, err := readModelConfigFromFile(tmp.Name())
			configs, err := readModelConfigsFromFile(tmp.Name())
			config := configs[0]
			Expect(err).To(BeNil())
			Expect(config).ToNot(BeNil())
			valid, err := config.Validate()
@@ -218,7 +222,8 @@ mcp:
  }
}`)
			Expect(err).ToNot(HaveOccurred())
			config, err := readModelConfigFromFile(tmp.Name())
			configs, err := readModelConfigsFromFile(tmp.Name())
			config := configs[0]
			Expect(err).To(BeNil())
			Expect(config).ToNot(BeNil())
			valid, err := config.Validate()
@@ -16,7 +16,7 @@ var _ = Describe("Test cases for config related functions", func() {
	Context("Test Read configuration functions", func() {
		configFile = os.Getenv("CONFIG_FILE")
		It("Test readConfigFile", func() {
			config, err := readMultipleModelConfigsFromFile(configFile)
			config, err := readModelConfigsFromFile(configFile)
			Expect(err).To(BeNil())
			Expect(config).ToNot(BeNil())
			// two configs in config.yaml
@@ -109,10 +109,10 @@ func API(application *application.Application) (*echo.Echo, error) {
			res := c.Response()
			err := next(c)

			// Fix for #7989: Reduce log verbosity of Web UI polling
			// If the path is /api/operations and the request was successful (200),
			// Fix for #7989: Reduce log verbosity of Web UI polling and resources API
			// If the path is /api/operations or /api/resources and the request was successful (200),
			// we log it at DEBUG level (hidden by default) instead of INFO.
			if req.URL.Path == "/api/operations" && res.Status == 200 {
			if (req.URL.Path == "/api/operations" || req.URL.Path == "/api/resources") && res.Status == 200 {
				xlog.Debug("HTTP request", "method", req.Method, "path", req.URL.Path, "status", res.Status)
			} else {
				xlog.Info("HTTP request", "method", req.Method, "path", req.URL.Path, "status", res.Status)
@@ -336,6 +336,7 @@ var _ = Describe("API test", func() {
					Name: "bert",
					URL:  bertEmbeddingsURL,
				},
				Overrides: map[string]interface{}{"backend": "llama-cpp"},
			},
			{
				Metadata: gallery.Metadata{
@@ -953,7 +954,8 @@ parameters:
			It("returns the models list", func() {
				models, err := client.ListModels(context.TODO())
				Expect(err).ToNot(HaveOccurred())
				Expect(len(models.Models)).To(Equal(7)) // If "config.yaml" should be included, this should be 8?
				// A model called "bert" can be present in the model directory depending on the order of the tests
				Expect(len(models.Models)).To(BeNumerically(">=", 8))
			})
			It("can generate completions via ggml", func() {
				if runtime.GOOS != "linux" {
@@ -23,10 +23,15 @@ import (
	"github.com/mudler/LocalAI/core/backend"

	model "github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/LocalAI/pkg/utils"
	"github.com/mudler/xlog"
)

func downloadFile(url string) (string, error) {
	if err := utils.ValidateExternalURL(url); err != nil {
		return "", fmt.Errorf("URL validation failed: %w", err)
	}

	// Get the data
	resp, err := http.Get(url)
	if err != nil {
@@ -32,13 +32,26 @@ import (
)

const (
	// XXX: Presently it seems all ASR/VAD backends use 16Khz. If a backend uses 24Khz then it will likely still work, but have reduced performance
	localSampleRate  = 16000
	remoteSampleRate = 24000
	defaultRemoteSampleRate = 24000
)

// A model can be "emulated", that is: transcribe audio to text -> feed text to the LLM -> generate audio as result
// If the model instead supports audio-to-audio, we will use the specific gRPC calls

// LockedWebsocket wraps a websocket connection with a mutex for safe concurrent writes
type LockedWebsocket struct {
	*websocket.Conn
	sync.Mutex
}

func (l *LockedWebsocket) WriteMessage(messageType int, data []byte) error {
	l.Lock()
	defer l.Unlock()
	return l.Conn.WriteMessage(messageType, data)
}
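The wrapper matters because Go websocket implementations (gorilla/websocket among them) allow at most one concurrent writer per connection; with the VAD loop and the event handler both emitting events, unsynchronized writes could interleave frames. Usage stays transparent, as in this sketch (assuming an already-upgraded conn):

	ws := &LockedWebsocket{Conn: conn}
	// Both goroutines write through the same connection; the embedded
	// mutex in WriteMessage serializes the frames.
	go func() { _ = ws.WriteMessage(websocket.TextMessage, []byte(`{"type":"a"}`)) }()
	go func() { _ = ws.WriteMessage(websocket.TextMessage, []byte(`{"type":"b"}`)) }()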
// Session represents a single WebSocket connection and its state
type Session struct {
	ID string
@@ -58,7 +71,8 @@ type Session struct {
	DefaultConversationID string
	ModelInterface        Model
	// The pipeline model config or the config for an any-to-any model
	ModelConfig *config.ModelConfig
	ModelConfig     *config.ModelConfig
	InputSampleRate int
}

func (s *Session) FromClient(session *types.SessionUnion) {
@@ -162,7 +176,8 @@ func Realtime(application *application.Application) echo.HandlerFunc {
}

func registerRealtime(application *application.Application, model string) func(c *websocket.Conn) {
	return func(c *websocket.Conn) {
	return func(conn *websocket.Conn) {
		c := &LockedWebsocket{Conn: conn}

		evaluator := application.TemplatesEvaluator()
		xlog.Debug("Realtime WebSocket connection established", "address", c.RemoteAddr().String(), "model", model)
@@ -183,14 +198,13 @@ func registerRealtime(application *application.Application, model string) func(c
		}

		sttModel := cfg.Pipeline.Transcription
		ttsModel := cfg.Pipeline.TTS

		sessionID := generateSessionID()
		session := &Session{
			ID:                sessionID,
			TranscriptionOnly: false,
			Model:             model,
			Voice:             ttsModel,
			Voice:             cfg.TTSConfig.Voice,
			ModelConfig:       cfg,
			TurnDetection: &types.TurnDetectionUnion{
				ServerVad: &types.ServerVad{
@@ -203,7 +217,8 @@ func registerRealtime(application *application.Application, model string) func(c
			InputAudioTranscription: &types.AudioTranscription{
				Model: sttModel,
			},
			Conversations: make(map[string]*Conversation),
			Conversations:   make(map[string]*Conversation),
			InputSampleRate: defaultRemoteSampleRate,
		}

		// Create a default conversation
@@ -383,7 +398,36 @@ func registerRealtime(application *application.Application, model string) func(c

			case types.ConversationItemCreateEvent:
				xlog.Debug("recv", "message", string(msg))
				sendNotImplemented(c, "conversation.item.create")
				// Add the item to the conversation
				item := e.Item
				// Ensure IDs are present
				if item.User != nil && item.User.ID == "" {
					item.User.ID = generateItemID()
				}
				if item.Assistant != nil && item.Assistant.ID == "" {
					item.Assistant.ID = generateItemID()
				}
				if item.System != nil && item.System.ID == "" {
					item.System.ID = generateItemID()
				}
				if item.FunctionCall != nil && item.FunctionCall.ID == "" {
					item.FunctionCall.ID = generateItemID()
				}
				if item.FunctionCallOutput != nil && item.FunctionCallOutput.ID == "" {
					item.FunctionCallOutput.ID = generateItemID()
				}

				conversation.Lock.Lock()
				conversation.Items = append(conversation.Items, &item)
				conversation.Lock.Unlock()

				sendEvent(c, types.ConversationItemAddedEvent{
					ServerEventBase: types.ServerEventBase{
						EventID: e.EventID,
					},
					PreviousItemID: e.PreviousItemID,
					Item:           item,
				})

			case types.ConversationItemDeleteEvent:
				sendError(c, "not_implemented", "Deleting items not implemented", "", "event_TODO")
@@ -429,7 +473,34 @@ func registerRealtime(application *application.Application, model string) func(c

			case types.ResponseCreateEvent:
				xlog.Debug("recv", "message", string(msg))
				sendNotImplemented(c, "response.create")

				// Handle optional items to add to context
				if len(e.Response.Input) > 0 {
					conversation.Lock.Lock()
					for _, item := range e.Response.Input {
						// Ensure IDs are present
						if item.User != nil && item.User.ID == "" {
							item.User.ID = generateItemID()
						}
						if item.Assistant != nil && item.Assistant.ID == "" {
							item.Assistant.ID = generateItemID()
						}
						if item.System != nil && item.System.ID == "" {
							item.System.ID = generateItemID()
						}
						if item.FunctionCall != nil && item.FunctionCall.ID == "" {
							item.FunctionCall.ID = generateItemID()
						}
						if item.FunctionCallOutput != nil && item.FunctionCallOutput.ID == "" {
							item.FunctionCallOutput.ID = generateItemID()
						}

						conversation.Items = append(conversation.Items, &item)
					}
					conversation.Lock.Unlock()
				}

				go triggerResponse(session, conversation, c, &e.Response)

			case types.ResponseCancelEvent:
				xlog.Debug("recv", "message", string(msg))
@@ -456,7 +527,7 @@ func registerRealtime(application *application.Application, model string) func(c
}

// Helper function to send events to the client
func sendEvent(c *websocket.Conn, event types.ServerEvent) {
func sendEvent(c *LockedWebsocket, event types.ServerEvent) {
	eventBytes, err := json.Marshal(event)
	if err != nil {
		xlog.Error("failed to marshal event", "error", err)
@@ -468,7 +539,7 @@ func sendEvent(c *websocket.Conn, event types.ServerEvent) {
}

// Helper function to send errors to the client
func sendError(c *websocket.Conn, code, message, param, eventID string) {
func sendError(c *LockedWebsocket, code, message, param, eventID string) {
	errorEvent := types.ErrorEvent{
		ServerEventBase: types.ServerEventBase{
			EventID: eventID,
@@ -485,7 +556,7 @@ func sendError(c *websocket.Conn, code, message, param, eventID string) {
	sendEvent(c, errorEvent)
}

func sendNotImplemented(c *websocket.Conn, message string) {
func sendNotImplemented(c *LockedWebsocket, message string) {
	sendError(c, "not_implemented", message, "", "event_TODO")
}
@@ -530,6 +601,12 @@ func updateTransSession(session *Session, update *types.SessionUnion, cl *config
		session.TurnDetection = update.Transcription.Audio.Input.TurnDetection
	}

	if update.Transcription.Audio.Input.Format != nil && update.Transcription.Audio.Input.Format.PCM != nil {
		if update.Transcription.Audio.Input.Format.PCM.Rate > 0 {
			session.InputSampleRate = update.Transcription.Audio.Input.Format.PCM.Rate
		}
	}

	return nil
}

@@ -557,13 +634,13 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
			session.InputAudioTranscription = &types.AudioTranscription{}
		}
		session.InputAudioTranscription.Model = cfg.Pipeline.Transcription
		session.Voice = cfg.Pipeline.TTS
		session.Voice = cfg.TTSConfig.Voice
		session.Model = rt.Model
		session.ModelConfig = cfg
	}

	if rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "" {
		xlog.Warn("Ignoring voice setting; not implemented", "voice", rt.Audio.Output.Voice)
		session.Voice = string(rt.Audio.Output.Voice)
	}

	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
@@ -583,6 +660,12 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
		session.TurnDetection = rt.Audio.Input.TurnDetection
	}

	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Format != nil && rt.Audio.Input.Format.PCM != nil {
		if rt.Audio.Input.Format.PCM.Rate > 0 {
			session.InputSampleRate = rt.Audio.Input.Format.PCM.Rate
		}
	}

	if rt.Instructions != "" {
		session.Instructions = rt.Instructions
	}
@@ -599,7 +682,7 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode

// handleVAD is a goroutine that listens for audio data from the client,
// runs VAD on the audio data, and commits utterances to the conversation
func handleVAD(session *Session, conv *Conversation, c *websocket.Conn, done chan struct{}) {
func handleVAD(session *Session, conv *Conversation, c *LockedWebsocket, done chan struct{}) {
	vadContext, cancel := context.WithCancel(context.Background())
	go func() {
		<-done
@@ -628,12 +711,12 @@ func handleVAD(session *Session, conv *Conversation, c *websocket.Conn, done cha
			session.AudioBufferLock.Unlock()

			aints := sound.BytesToInt16sLE(allAudio)
			if len(aints) == 0 || len(aints) < int(silenceThreshold)*remoteSampleRate {
			if len(aints) == 0 || len(aints) < int(silenceThreshold)*session.InputSampleRate {
				continue
			}

			// Resample from 24kHz to 16kHz
			aints = sound.ResampleInt16(aints, remoteSampleRate, localSampleRate)
			// Resample from InputSampleRate to 16kHz
			aints = sound.ResampleInt16(aints, session.InputSampleRate, localSampleRate)

			segments, err := runVAD(vadContext, session, aints)
			if err != nil {
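For readers unfamiliar with sound.ResampleInt16: a sample-rate conversion of this kind reduces, in its simplest form, to linear interpolation between neighboring samples. The sketch below illustrates the idea under that assumption — it is not LocalAI's actual implementation, and production resamplers also apply low-pass filtering to avoid aliasing when downsampling:

// resampleInt16 converts PCM from srcRate to dstRate by linear
// interpolation (illustrative sketch only).
func resampleInt16(in []int16, srcRate, dstRate int) []int16 {
	if srcRate == dstRate || len(in) == 0 {
		return in
	}
	n := int(int64(len(in)) * int64(dstRate) / int64(srcRate))
	out := make([]int16, n)
	for i := range out {
		pos := float64(i) * float64(srcRate) / float64(dstRate)
		j := int(pos)
		frac := pos - float64(j)
		s0 := float64(in[j])
		s1 := s0
		if j+1 < len(in) {
			s1 = float64(in[j+1])
		}
		out[i] = int16(s0 + (s1-s0)*frac)
	}
	return out
}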
@@ -713,7 +796,7 @@ func handleVAD(session *Session, conv *Conversation, c *websocket.Conn, done cha
	}
}

func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, c *websocket.Conn) {
func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Conversation, c *LockedWebsocket) {
	if len(utt) == 0 {
		return
	}
@@ -746,6 +829,10 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
	tr, err := session.ModelInterface.Transcribe(ctx, f.Name(), session.InputAudioTranscription.Language, false, false, session.InputAudioTranscription.Prompt)
	if err != nil {
		sendError(c, "transcription_failed", err.Error(), "", "event_TODO")
		return
	} else if tr == nil {
		sendError(c, "transcription_failed", "transcribe result is nil", "", "event_TODO")
		return
	}

	transcript = tr.Text
@@ -791,11 +878,10 @@ func runVAD(ctx context.Context, session *Session, adata []int16) ([]schema.VADS
}

// Function to generate a response based on the conversation
func generateResponse(session *Session, utt []byte, transcript string, conv *Conversation, c *websocket.Conn, mt int) {
func generateResponse(session *Session, utt []byte, transcript string, conv *Conversation, c *LockedWebsocket, mt int) {
	xlog.Debug("Generating realtime response...")

	config := session.ModelInterface.PredictConfig()

	// Create user message item
	item := types.MessageItemUnion{
		User: &types.MessageItemUser{
			ID: generateItemID(),
@@ -817,33 +903,73 @@ func generateResponse(session *Session, utt []byte, transcript string, conv *Con
		Item: item,
	})

	triggerResponse(session, conv, c, nil)
}

func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, overrides *types.ResponseCreateParams) {
	config := session.ModelInterface.PredictConfig()

	// Default values
	tools := session.Tools
	toolChoice := session.ToolChoice
	instructions := session.Instructions
	// Overrides
	if overrides != nil {
		if overrides.Tools != nil {
			tools = overrides.Tools
		}
		if overrides.ToolChoice != nil {
			toolChoice = overrides.ToolChoice
		}
		if overrides.Instructions != "" {
			instructions = overrides.Instructions
		}
	}

	var conversationHistory schema.Messages
	conversationHistory = append(conversationHistory, schema.Message{
		Role:          string(types.MessageRoleSystem),
		StringContent: session.Instructions,
		Content:       session.Instructions,
		StringContent: instructions,
		Content:       instructions,
	})

	imgIndex := 0
	conv.Lock.Lock()
	for _, item := range conv.Items {
		if item.User != nil {
			msg := schema.Message{
				Role: string(types.MessageRoleUser),
			}
			textContent := ""
			nrOfImgsInMessage := 0
			for _, content := range item.User.Content {
				switch content.Type {
				case types.MessageContentTypeInputText:
					conversationHistory = append(conversationHistory, schema.Message{
						Role:          string(types.MessageRoleUser),
						StringContent: content.Text,
						Content:       content.Text,
					})
					textContent += content.Text
				case types.MessageContentTypeInputAudio:
					conversationHistory = append(conversationHistory, schema.Message{
						Role:          string(types.MessageRoleUser),
						StringContent: content.Transcript,
						Content:       content.Transcript,
						StringAudios:  []string{content.Audio},
					})
					textContent += content.Transcript
				case types.MessageContentTypeInputImage:
					msg.StringImages = append(msg.StringImages, content.ImageURL)
					imgIndex++
					nrOfImgsInMessage++
				}
			}
			if nrOfImgsInMessage > 0 {
				templated, err := templates.TemplateMultiModal(config.TemplateConfig.Multimodal, templates.MultiModalOptions{
					TotalImages:     imgIndex,
					ImagesInMessage: nrOfImgsInMessage,
				}, textContent)
				if err != nil {
					xlog.Warn("Failed to apply multimodal template", "error", err)
					templated = textContent
				}
				msg.StringContent = templated
				msg.Content = templated
			} else {
				msg.StringContent = textContent
				msg.Content = textContent
			}
			conversationHistory = append(conversationHistory, msg)
		} else if item.Assistant != nil {
			for _, content := range item.Assistant.Content {
				switch content.Type {
@@ -874,6 +1000,11 @@ func generateResponse(session *Session, utt []byte, transcript string, conv *Con
	}
	conv.Lock.Unlock()

	var images []string
	for _, m := range conversationHistory {
		images = append(images, m.StringImages...)
	}

	responseID := generateUniqueID()
	sendEvent(c, types.ResponseCreatedEvent{
		ServerEventBase: types.ServerEventBase{},
@@ -884,15 +1015,15 @@ func generateResponse(session *Session, utt []byte, transcript string, conv *Con
		},
	})

	predFunc, err := session.ModelInterface.Predict(context.TODO(), conversationHistory, nil, nil, nil, nil, session.Tools, session.ToolChoice, nil, nil, nil)
	predFunc, err := session.ModelInterface.Predict(context.TODO(), conversationHistory, images, nil, nil, nil, tools, toolChoice, nil, nil, nil)
	if err != nil {
		sendError(c, "inference_failed", fmt.Sprintf("backend error: %v", err), "", item.Assistant.ID)
		sendError(c, "inference_failed", fmt.Sprintf("backend error: %v", err), "", "") // item.Assistant.ID is unknown here
		return
	}

	pred, err := predFunc()
	if err != nil {
		sendError(c, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", item.Assistant.ID)
		sendError(c, "prediction_failed", fmt.Sprintf("backend error: %v", err), "", "")
		return
	}
@@ -1006,7 +1137,16 @@ func generateResponse(session *Session, utt []byte, transcript string, conv *Con
		sendError(c, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID)
		return
	}
	audioString := base64.StdEncoding.EncodeToString(audioBytes)

	// Strip WAV header (44 bytes) to get raw PCM data
	// The OpenAI Realtime API expects raw PCM, not WAV files
	const wavHeaderSize = 44
	pcmData := audioBytes
	if len(audioBytes) > wavHeaderSize {
		pcmData = audioBytes[wavHeaderSize:]
	}

	audioString := base64.StdEncoding.EncodeToString(pcmData)

	sendEvent(c, types.ResponseOutputAudioTranscriptDeltaEvent{
		ServerEventBase: types.ServerEventBase{},
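One caveat worth noting: the fixed 44-byte offset is only correct for a canonical PCM WAV header; encoders that emit extra chunks (LIST/INFO metadata, fact chunks) place the data payload later in the file. A more defensive sketch — a hypothetical helper, not what the handler above does — walks the RIFF chunks instead:

package main

import (
	"encoding/binary"
	"fmt"
)

// findWAVData returns the raw PCM payload of a RIFF/WAVE buffer by
// walking its chunks rather than assuming a 44-byte header.
func findWAVData(b []byte) ([]byte, error) {
	if len(b) < 12 || string(b[0:4]) != "RIFF" || string(b[8:12]) != "WAVE" {
		return nil, fmt.Errorf("not a RIFF/WAVE buffer")
	}
	off := 12
	for off+8 <= len(b) {
		id := string(b[off : off+4])
		size := int(binary.LittleEndian.Uint32(b[off+4 : off+8]))
		body := off + 8
		if id == "data" {
			if body+size > len(b) {
				size = len(b) - body // tolerate truncated files
			}
			return b[body : body+size], nil
		}
		off = body + size + (size & 1) // chunks are 2-byte aligned
	}
	return nil, fmt.Errorf("no data chunk found")
}

func main() {
	// Build a 44-byte canonical header followed by 4 bytes of PCM.
	hdr := make([]byte, 44)
	copy(hdr[0:4], "RIFF")
	copy(hdr[8:12], "WAVE")
	copy(hdr[12:16], "fmt ")
	binary.LittleEndian.PutUint32(hdr[16:20], 16)
	copy(hdr[36:40], "data")
	binary.LittleEndian.PutUint32(hdr[40:44], 4)
	pcm, err := findWAVData(append(hdr, 1, 2, 3, 4))
	fmt.Println(len(pcm), err) // 4 <nil>
}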
@@ -1131,7 +1271,6 @@ func generateResponse(session *Session, utt []byte, transcript string, conv *Con
			Status: types.ResponseStatusCompleted,
		},
	})

}

// Helper functions to generate unique IDs
@@ -26,6 +26,7 @@ const (
	MessageContentTypeTranscript  MessageContentType = "transcript"
	MessageContentTypeInputText   MessageContentType = "input_text"
	MessageContentTypeInputAudio  MessageContentType = "input_audio"
	MessageContentTypeInputImage  MessageContentType = "input_image"
	MessageContentTypeOutputText  MessageContentType = "output_text"
	MessageContentTypeOutputAudio MessageContentType = "output_audio"
)
@@ -1026,10 +1026,11 @@ parameters:
                    if (!config.name) {
                        throw new Error('Model name is required');
                    }
                    if (!config.backend) {
                    const isPipeline = config.pipeline && (config.pipeline.vad || config.pipeline.transcription || config.pipeline.tts || config.pipeline.llm);
                    if (!isPipeline && !config.backend) {
                        throw new Error('Backend is required');
                    }
                    if (!config.parameters || !config.parameters.model) {
                    if (!isPipeline && (!config.parameters || !config.parameters.model)) {
                        throw new Error('Model file/path is required in parameters.model');
                    }
@@ -1041,7 +1042,6 @@ parameters:

            async saveConfig() {
                try {
                    // Validate before saving
                    const yamlContent = this.yamlEditor.getValue();
                    const config = jsyaml.load(yamlContent);

@@ -1052,13 +1052,13 @@ parameters:
                    if (!config.name) {
                        throw new Error('Model name is required');
                    }
                    if (!config.backend) {
                    const isPipeline = config.pipeline && (config.pipeline.vad || config.pipeline.transcription || config.pipeline.tts || config.pipeline.llm);
                    if (!isPipeline && !config.backend) {
                        throw new Error('Backend is required');
                    }
                    if (!config.parameters || !config.parameters.model) {
                    if (!isPipeline && (!config.parameters || !config.parameters.model)) {
                        throw new Error('Model file/path is required in parameters.model');
                    }

                    const endpoint = this.isEditMode ? `/models/edit/{{.ModelName}}` : '/models/import';

                    const response = await fetch(endpoint, {
@@ -676,6 +676,7 @@ func (s *AgentJobService) executeJobInternal(job schema.Job, task schema.Task, c
	job.Status = schema.JobStatusRunning
	job.StartedAt = &now
	s.jobs.Set(job.ID, job)
	xlog.Info("Job started", "job_id", job.ID, "task_id", job.TaskID)

	// Load model config
	modelConfig, err := s.configLoader.LoadModelConfigFileByNameDefaultOptions(task.Model, s.appConfig)
@@ -980,6 +981,7 @@ func (s *AgentJobService) executeJobInternal(job schema.Job, task schema.Task, c
	job.Result = f.LastMessage().Content
	job.CompletedAt = &completedAt
	s.jobs.Set(job.ID, job)
	xlog.Info("Job completed", "job_id", job.ID, "status", job.Status)

	// Save to file (async)
	go func() {
@@ -122,3 +122,4 @@ LocalAI supports various types of backends:
- **Diffusion Backends**: For image generation
- **TTS Backends**: For text-to-speech conversion
- **Whisper Backends**: For speech-to-text conversion
- **Sound Generation Backends**: For music and audio generation (e.g., ACE-Step)
@@ -14,7 +14,7 @@ LocalAI to ease out installations of models provide a way to preload models on s


{{% notice note %}}
The models in this gallery are not directly maintained by LocalAI. If you find a model that is not working, please open an issue on the model gallery repository.
The models in this gallery are not directly maintained by LocalAI. If you find a model that is not working, please open an issue on the [main LocalAI repository](https://github.com/mudler/LocalAI/issues).
{{% /notice %}}

{{% notice note %}}
@@ -1,3 +1,3 @@
{
    "version": "v3.10.1"
    "version": "v3.11.0"
}
@@ -1,4 +1,239 @@
|
||||
---
|
||||
- name: nemo-parakeet-tdt-0.6b
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
NVIDIA NeMo Parakeet TDT 0.6B v3 is an automatic speech recognition (ASR) model from NVIDIA's NeMo toolkit. Parakeet models are state-of-the-art ASR models trained on large-scale English audio data.
|
||||
urls:
|
||||
- https://huggingface.co/nvidia/parakeet-tdt-0.6b-v3
|
||||
- https://github.com/NVIDIA/NeMo
|
||||
tags:
|
||||
- stt
|
||||
- speech-to-text
|
||||
- asr
|
||||
- nvidia
|
||||
- nemo
|
||||
- parakeet
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: nemo
|
||||
known_usecases:
|
||||
- transcript
|
||||
parameters:
|
||||
model: nvidia/parakeet-tdt-0.6b-v3
|
||||
- name: voxtral-mini-4b-realtime
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
Voxtral Mini 4B Realtime is a speech-to-text model from Mistral AI. It is a 4B parameter model optimized for fast, accurate audio transcription with low latency, making it ideal for real-time applications. The model uses the Voxtral architecture for efficient audio processing.
|
||||
urls:
|
||||
- https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602
|
||||
- https://github.com/antirez/voxtral.c
|
||||
tags:
|
||||
- stt
|
||||
- speech-to-text
|
||||
- audio-transcription
|
||||
- cpu
|
||||
- metal
|
||||
- mistral
|
||||
overrides:
|
||||
backend: voxtral
|
||||
known_usecases:
|
||||
- transcript
|
||||
parameters:
|
||||
model: voxtral-model
|
||||
files:
|
||||
- filename: voxtral-model/consolidated.safetensors
|
||||
uri: https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main/consolidated.safetensors
|
||||
sha256: 263f178fe752c90a2ae58f037a95ed092db8b14768b0978b8c48f66979c8345d
|
||||
- filename: voxtral-model/params.json
|
||||
uri: https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main/params.json
|
||||
- filename: voxtral-model/tekken.json
|
||||
uri: https://huggingface.co/mistralai/Voxtral-Mini-4B-Realtime-2602/resolve/main/tekken.json
|
||||
sha256: 8434af1d39eba99f0ef46cf1450bf1a63fa941a26933a1ef5dbbf4adf0d00e44
|
||||
- name: moonshine-tiny
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
Moonshine Tiny is a lightweight speech-to-text model optimized for fast transcription. It is designed for efficient on-device ASR with high accuracy relative to its size.
|
||||
urls:
|
||||
- https://github.com/moonshine-ai/moonshine
|
||||
tags:
|
||||
- stt
|
||||
- speech-to-text
|
||||
- asr
|
||||
- audio-transcription
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: moonshine
|
||||
known_usecases:
|
||||
- transcript
|
||||
parameters:
|
||||
model: moonshine/tiny
|
||||
- name: whisperx-tiny
|
||||
license: mit
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
WhisperX Tiny is a fast and accurate speech recognition model with speaker diarization capabilities. Built on OpenAI's Whisper with additional features for alignment and speaker segmentation.
|
||||
urls:
|
||||
- https://github.com/m-bain/whisperX
|
||||
tags:
|
||||
- stt
|
||||
- speech-to-text
|
||||
- asr
|
||||
- audio-transcription
|
||||
- speaker-diarization
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: whisperx
|
||||
known_usecases:
|
||||
- transcript
|
||||
parameters:
|
||||
model: tiny
|
||||
- name: voxcpm-1.5
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
VoxCPM 1.5 is an end-to-end text-to-speech (TTS) model from ModelBest. It features zero-shot voice cloning and high-quality speech synthesis capabilities.
|
||||
urls:
|
||||
- https://huggingface.co/openbmb/VoxCPM1.5
|
||||
tags:
|
||||
- tts
|
||||
- text-to-speech
|
||||
- voice-cloning
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: voxcpm
|
||||
known_usecases:
|
||||
- tts
|
||||
parameters:
|
||||
model: openbmb/VoxCPM1.5
|
||||
- name: neutts-air
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
NeuTTS Air is the world's first super-realistic, on-device TTS speech language model with instant voice cloning. Built on a 0.5B LLM backbone, it brings natural-sounding speech, real-time performance, and speaker cloning to local devices.
|
||||
urls:
|
||||
- https://github.com/neuphonic/neutts-air
|
||||
tags:
|
||||
- tts
|
||||
- text-to-speech
|
||||
- voice-cloning
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: neutts
|
||||
known_usecases:
|
||||
- tts
|
||||
- name: vllm-omni-z-image-turbo
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
Z-Image-Turbo via vLLM-Omni - A distilled version of Z-Image optimized for speed with only 8 NFEs. Offers sub-second inference latency on enterprise-grade H800 GPUs and fits within 16GB VRAM. Excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.
|
||||
urls:
|
||||
- https://huggingface.co/Tongyi-MAI/Z-Image-Turbo
|
||||
tags:
|
||||
- text-to-image
|
||||
- image-generation
|
||||
- vllm-omni
|
||||
- z-image
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: vllm-omni
|
||||
known_usecases:
|
||||
- image_generation
|
||||
parameters:
|
||||
model: Tongyi-MAI/Z-Image-Turbo
|
||||
- name: vllm-omni-wan2.2-t2v
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
Wan2.2-T2V-A14B via vLLM-Omni - Text-to-video generation model from Wan-AI. Generates high-quality videos from text prompts using a 14B parameter diffusion model.
|
||||
urls:
|
||||
- https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers
|
||||
tags:
|
||||
- text-to-video
|
||||
- video-generation
|
||||
- vllm-omni
|
||||
- wan
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: vllm-omni
|
||||
known_usecases:
|
||||
- video_generation
|
||||
parameters:
|
||||
model: Wan-AI/Wan2.2-T2V-A14B-Diffusers
|
||||
- name: vllm-omni-wan2.2-i2v
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
Wan2.2-I2V-A14B via vLLM-Omni - Image-to-video generation model from Wan-AI. Generates high-quality videos from images using a 14B parameter diffusion model.
|
||||
urls:
|
||||
- https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
tags:
|
||||
- image-to-video
|
||||
- video-generation
|
||||
- vllm-omni
|
||||
- wan
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: vllm-omni
|
||||
known_usecases:
|
||||
- video_generation
|
||||
parameters:
|
||||
model: Wan-AI/Wan2.2-I2V-A14B-Diffusers
|
||||
- name: vllm-omni-qwen3-omni-30b
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
Qwen3-Omni-30B-A3B-Instruct via vLLM-Omni - A large multimodal model (30B active, 3B activated per token) from Alibaba Qwen team. Supports text, image, audio, and video understanding with text and speech output. Features native multimodal understanding across all modalities.
|
||||
urls:
|
||||
- https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Instruct
|
||||
tags:
|
||||
- llm
|
||||
- multimodal
|
||||
- vision
|
||||
- audio
|
||||
- video
|
||||
- vllm-omni
|
||||
- qwen3
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: vllm-omni
|
||||
known_usecases:
|
||||
- chat
|
||||
- multimodal
|
||||
parameters:
|
||||
model: Qwen/Qwen3-Omni-30B-A3B-Instruct
|
||||
- name: vllm-omni-qwen3-tts-custom-voice
|
||||
license: apache-2.0
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
description: |
|
||||
Qwen3-TTS-12Hz-1.7B-CustomVoice via vLLM-Omni - Text-to-speech model from Alibaba Qwen team with custom voice cloning capabilities. Generates natural-sounding speech with voice personalization.
|
||||
urls:
|
||||
- https://huggingface.co/Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
|
||||
tags:
|
||||
- tts
|
||||
- text-to-speech
|
||||
- voice-cloning
|
||||
- vllm-omni
|
||||
- qwen3
|
||||
- cpu
|
||||
- gpu
|
||||
overrides:
|
||||
backend: vllm-omni
|
||||
known_usecases:
|
||||
- tts
|
||||
parameters:
|
||||
model: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
|
||||
- name: "ace-step-turbo"
|
||||
license: mit
|
||||
tags:
|
||||
@@ -388,7 +623,7 @@
|
||||
files:
|
||||
- filename: llama-cpp/models/GLM-4.7-Flash-Q4_K_M.gguf
|
||||
uri: https://huggingface.co/unsloth/GLM-4.7-Flash-GGUF/resolve/main/GLM-4.7-Flash-Q4_K_M.gguf
|
||||
sha256: 73ba18480e06ccda453a26263c0e2be2bd86294e827b1812ddea2f88bba2d924
|
||||
sha256: 29837ed2c0fc5f51981adf8ac8083fcf80743c598381f13e9f06cbad0498b174
|
||||
- name: "qwen3-vl-reranker-8b"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
@@ -12398,6 +12633,311 @@
|
||||
- filename: llama-cpp/mmproj/mmproj-mistral-community_pixtral-12b-f16.gguf
|
||||
sha256: a0b21e5a3b0f9b0b604385c45bb841142e7a5ac7660fa6a397dbc87c66b2083e
|
||||
uri: huggingface://bartowski/mistral-community_pixtral-12b-GGUF/mmproj-mistral-community_pixtral-12b-f16.gguf
|
||||
- !!merge <<: *mistral03
|
||||
name: "mistralai_ministral-3-14b-instruct-2512-multimodal"
|
||||
urls:
|
||||
- https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512
|
||||
- https://huggingface.co/unsloth/Ministral-3-14B-Instruct-2512-GGUF
|
||||
description: |
|
||||
The largest model in the Ministral 3 family, Ministral 3 14B offers frontier capabilities and performance comparable to its larger Mistral Small 3.2 24B counterpart. A powerful and efficient language model with vision capabilities.
|
||||
|
||||
The Ministral 3 family is designed for edge deployment, capable of running on a wide range of hardware. Ministral 3 14B can even be deployed locally, capable of fitting in 24GB of VRAM in FP8, and less if further quantized.
|
||||
|
||||
Key Features:
|
||||
Ministral 3 14B consists of two main architectural components:
|
||||
|
||||
- 13.5B Language Model
|
||||
- 0.4B Vision Encoder
|
||||
|
||||
The Ministral 3 14B Instruct model offers the following capabilities:
|
||||
|
||||
- Vision: Enables the model to analyze images and provide insights based on visual content, in addition to text.
|
||||
- Multilingual: Supports dozens of languages, including English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Arabic.
|
||||
- System Prompt: Maintains strong adherence and support for system prompts.
|
||||
- Agentic: Offers best-in-class agentic capabilities with native function calling and JSON outputting.
|
||||
- Edge-Optimized: Delivers best-in-class performance at a small scale, deployable anywhere.
|
||||
- Apache 2.0 License: Open-source license allowing usage and modification for both commercial and non-commercial purposes.
|
||||
- Large Context Window: Supports a 256k context window.
|
||||
|
||||
This gallery entry includes mmproj for multimodality and uses Unsloth recommended defaults.
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- gpu
|
||||
- mistral
|
||||
- cpu
|
||||
- function-calling
|
||||
- multimodal
|
||||
overrides:
|
||||
context_size: 16384
|
||||
parameters:
|
||||
model: llama-cpp/models/mistralai_Ministral-3-14B-Instruct-2512-Q4_K_M.gguf
|
||||
temperature: 0.15
|
||||
mmproj: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-14B-Instruct-2512-f32.gguf
|
||||
files:
|
||||
- filename: llama-cpp/models/mistralai_Ministral-3-14B-Instruct-2512-Q4_K_M.gguf
|
||||
sha256: 76ce697c065f2e40f1e8e958118b02cab38e2c10a6015f7d7908036a292dc8c8
|
||||
uri: huggingface://unsloth/Ministral-3-14B-Instruct-2512-GGUF/Ministral-3-14B-Instruct-2512-Q4_K_M.gguf
|
||||
- filename: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-14B-Instruct-2512-f32.gguf
|
||||
sha256: 2740ba9e9b30b09be4282a9a9f617ec43dc47b89aed416cb09b5f698f90783b5
|
||||
uri: huggingface://unsloth/Ministral-3-14B-Instruct-2512-GGUF/mmproj-F32.gguf
|
||||
- !!merge <<: *mistral03
|
||||
name: "mistralai_ministral-3-14b-reasoning-2512-multimodal"
|
||||
urls:
|
||||
- https://huggingface.co/mistralai/Ministral-3-14B-Reasoning-2512
|
||||
- https://huggingface.co/unsloth/Ministral-3-14B-Reasoning-2512-GGUF
|
||||
description: |
|
||||
The largest model in the Ministral 3 family, Ministral 3 14B offers frontier capabilities and performance comparable to its larger Mistral Small 3.2 24B counterpart. A powerful and efficient language model with vision capabilities.
|
||||
|
||||
This model is the reasoning post-trained version, trained for reasoning tasks, making it ideal for math, coding and stem related use cases.
|
||||
|
||||
The Ministral 3 family is designed for edge deployment, capable of running on a wide range of hardware. Ministral 3 14B can even be deployed locally, capable of fitting in 32GB of VRAM in BF16, and less than 24GB of RAM/VRAM when quantized.
|
||||
|
||||
Key Features:
|
||||
Ministral 3 14B consists of two main architectural components:
|
||||
|
||||
|
||||
- 13.5B Language Model
|
||||
- 0.4B Vision Encoder
|
||||
|
||||
The Ministral 3 14B Reasoning model offers the following capabilities:
|
||||
|
||||
|
||||
- Vision: Enables the model to analyze images and provide insights based on visual content, in addition to text.
|
||||
- Multilingual: Supports dozens of languages, including English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Arabic.
|
||||
- System Prompt: Maintains strong adherence and support for system prompts.
|
||||
- Agentic: Offers best-in-class agentic capabilities with native function calling and JSON outputting.
|
||||
- Reasoning: Excels at complex, multi-step reasoning and dynamic problem-solving.
|
||||
- Edge-Optimized: Delivers best-in-class performance at a small scale, deployable anywhere.
|
||||
- Apache 2.0 License: Open-source license allowing usage and modification for both commercial and non-commercial purposes.
|
||||
- Large Context Window: Supports a 256k context window.
|
||||
|
||||
|
||||
This gallery entry includes mmproj for multimodality and uses Unsloth recommended defaults.
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- gpu
|
||||
- mistral
|
||||
- cpu
|
||||
- function-calling
|
||||
- multimodal
|
||||
overrides:
|
||||
context_size: 32768
|
||||
parameters:
|
||||
model: llama-cpp/models/mistralai_Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf
|
||||
temperature: 0.7
|
||||
top_p: 0.95
|
||||
mmproj: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-14B-Reasoning-2512-f32.gguf
|
||||
files:
|
||||
- filename: llama-cpp/models/mistralai_Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf
|
||||
sha256: f577390559b89ebdbfe52cc234ea334649c24e6003ffa4b6a2474c5e2a47aa17
|
||||
uri: huggingface://unsloth/Ministral-3-14B-Reasoning-2512-GGUF/Ministral-3-14B-Reasoning-2512-Q4_K_M.gguf
|
||||
- filename: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-14B-Reasoning-2512-f32.gguf
|
||||
sha256: 891bf262a032968f6e5b3d4e9ffc84cf6381890033c2f5204fbdf4817af4ab9b
|
||||
uri: huggingface://unsloth/Ministral-3-14B-Reasoning-2512-GGUF/mmproj-F32.gguf
|
||||
- !!merge <<: *mistral03
|
||||
name: "mistralai_ministral-3-8b-instruct-2512-multimodal"
|
||||
urls:
|
||||
- https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512
|
||||
- https://huggingface.co/unsloth/Ministral-3-8B-Instruct-2512-GGUF
|
||||
description: |
|
||||
A balanced model in the Ministral 3 family, Ministral 3 8B is a powerful, efficient tiny language model with vision capabilities.
|
||||
|
||||
The Ministral 3 family is designed for edge deployment, capable of running on a wide range of hardware. Ministral 3 8B can even be deployed locally, capable of fitting in 12GB of VRAM in FP8, and less if further quantized.
|
||||
|
||||
Key Features:
|
||||
Ministral 3 8B consists of two main architectural components:
|
||||
|
||||
- 8.4B Language Model
|
||||
- 0.4B Vision Encoder
|
||||
|
||||
The Ministral 3 8B Instruct model offers the following capabilities:
|
||||
|
||||
- Vision: Enables the model to analyze images and provide insights based on visual content, in addition to text.
|
||||
- Multilingual: Supports dozens of languages, including English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Arabic.
|
||||
- System Prompt: Maintains strong adherence and support for system prompts.
|
||||
- Agentic: Offers best-in-class agentic capabilities with native function calling and JSON outputting.
|
||||
- Edge-Optimized: Delivers best-in-class performance at a small scale, deployable anywhere.
|
||||
- Apache 2.0 License: Open-source license allowing usage and modification for both commercial and non-commercial purposes.
|
||||
- Large Context Window: Supports a 256k context window.
|
||||
|
||||
This gallery entry includes mmproj for multimodality and uses Unsloth recommended defaults.
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- gpu
|
||||
- mistral
|
||||
- cpu
|
||||
- function-calling
|
||||
- multimodal
|
||||
overrides:
|
||||
context_size: 16384
|
||||
parameters:
|
||||
model: llama-cpp/models/mistralai_Ministral-3-8B-Instruct-2512-Q4_K_M.gguf
|
||||
temperature: 0.15
|
||||
mmproj: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-8B-Instruct-2512-f32.gguf
|
||||
files:
|
||||
- filename: llama-cpp/models/mistralai_Ministral-3-8B-Instruct-2512-Q4_K_M.gguf
|
||||
sha256: 5dbc3647eb563b9f8d3c70ec3d906cce84b86bb35c5e0b8a36e7df3937ab7174
|
||||
uri: huggingface://unsloth/Ministral-3-8B-Instruct-2512-GGUF/Ministral-3-8B-Instruct-2512-Q4_K_M.gguf
|
||||
- filename: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-8B-Instruct-2512-f32.gguf
|
||||
sha256: 242d11ff65ef844b0aac4e28d4b1318813370608845f17b3ef5826fd7e7fd015
|
||||
uri: huggingface://unsloth/Ministral-3-8B-Instruct-2512-GGUF/mmproj-F32.gguf
|
||||
- !!merge <<: *mistral03
|
||||
name: "mistralai_ministral-3-8b-reasoning-2512-multimodal"
|
||||
urls:
|
||||
- https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512
|
||||
- https://huggingface.co/unsloth/Ministral-3-8B-Reasoning-2512-GGUF
|
||||
description: |
|
||||
A balanced model in the Ministral 3 family, Ministral 3 8B is a powerful, efficient tiny language model with vision capabilities.
|
||||
|
||||
This model is the reasoning post-trained version, trained for reasoning tasks, making it ideal for math, coding and stem related use cases.
|
||||
|
||||
The Ministral 3 family is designed for edge deployment, capable of running on a wide range of hardware. Ministral 3 8B can even be deployed locally, capable of fitting in 24GB of VRAM in BF16, and less than 12GB of RAM/VRAM when quantized.
|
||||
|
||||
Key Features:
|
||||
Ministral 3 8B consists of two main architectural components:
|
||||
|
||||
|
||||
- 8.4B Language Model
|
||||
- 0.4B Vision Encoder
|
||||
|
||||
The Ministral 3 8B Reasoning model offers the following capabilities:
|
||||
|
||||
|
||||
- Vision: Enables the model to analyze images and provide insights based on visual content, in addition to text.
|
||||
- Multilingual: Supports dozens of languages, including English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, Arabic.
|
||||
- System Prompt: Maintains strong adherence and support for system prompts.
|
||||
- Agentic: Offers best-in-class agentic capabilities with native function calling and JSON outputting.
|
||||
- Reasoning: Excels at complex, multi-step reasoning and dynamic problem-solving.
|
||||
- Edge-Optimized: Delivers best-in-class performance at a small scale, deployable anywhere.
|
||||
- Apache 2.0 License: Open-source license allowing usage and modification for both commercial and non-commercial purposes.
|
||||
- Large Context Window: Supports a 256k context window.
|
||||
|
||||
This gallery entry includes mmproj for multimodality and uses Unsloth recommended defaults.
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- gpu
|
||||
- mistral
|
||||
- cpu
|
||||
- function-calling
|
||||
- multimodal
|
||||
overrides:
|
||||
context_size: 32768
|
||||
parameters:
|
||||
model: llama-cpp/models/mistralai_Ministral-3-8B-Reasoning-2512-Q4_K_M.gguf
|
||||
temperature: 0.7
|
||||
top_p: 0.95
|
||||
mmproj: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-8B-Reasoning-2512-f32.gguf
|
||||
files:
|
||||
- filename: llama-cpp/models/mistralai_Ministral-3-8B-Reasoning-2512-Q4_K_M.gguf
|
||||
sha256: c3d1c5ab7406a0fc9d50ad2f0d15d34d5693db00bf953e8a9cd9a243b81cb1b2
|
||||
      uri: huggingface://unsloth/Ministral-3-8B-Reasoning-2512-GGUF/Ministral-3-8B-Reasoning-2512-Q4_K_M.gguf
    - filename: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-8B-Reasoning-2512-f32.gguf
      sha256: 92252621cb957949379ff81ee14b15887d37eade3845a6e937e571b98c2c84c2
      uri: huggingface://unsloth/Ministral-3-8B-Reasoning-2512-GGUF/mmproj-F32.gguf
- !!merge <<: *mistral03
  name: "mistralai_ministral-3-3b-instruct-2512-multimodal"
  urls:
    - https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512
    - https://huggingface.co/unsloth/Ministral-3-3B-Instruct-2512-GGUF
  description: |
    The smallest model in the Ministral 3 family, Ministral 3 3B is a powerful, efficient tiny language model with vision capabilities.

    The Ministral 3 family is designed for edge deployment, capable of running on a wide range of hardware. Ministral 3 3B can even be deployed locally, fitting in 8GB of VRAM in FP8, and less if further quantized.

    Key Features:
    Ministral 3 3B consists of two main architectural components:

    - 3.4B Language Model
    - 0.4B Vision Encoder

    The Ministral 3 3B Instruct model offers the following capabilities:

    - Vision: Enables the model to analyze images and provide insights based on visual content, in addition to text.
    - Multilingual: Supports dozens of languages, including English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, and Arabic.
    - System Prompt: Maintains strong adherence and support for system prompts.
    - Agentic: Offers best-in-class agentic capabilities with native function calling and JSON output.
    - Edge-Optimized: Delivers best-in-class performance at a small scale, deployable anywhere.
    - Apache 2.0 License: Open-source license allowing usage and modification for both commercial and non-commercial purposes.
    - Large Context Window: Supports a 256k context window.

    This gallery entry includes the mmproj for multimodality and uses Unsloth's recommended defaults.
  tags:
    - llm
    - gguf
    - gpu
    - mistral
    - cpu
    - function-calling
    - multimodal
  overrides:
    context_size: 16384
    parameters:
      model: llama-cpp/models/mistralai_Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
      temperature: 0.15
    mmproj: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-3B-Instruct-2512-f32.gguf
  files:
    - filename: llama-cpp/models/mistralai_Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
      sha256: fd46fc371ff0509bfa8657ac956b7de8534d7d9baaa4947975c0648c3aa397f4
      uri: huggingface://unsloth/Ministral-3-3B-Instruct-2512-GGUF/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
    - filename: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-3B-Instruct-2512-f32.gguf
      sha256: 57bb4e6f01166985ca2fc16061be4023fcb95cb8e60f445b8d0bf1ee30268636
      uri: huggingface://unsloth/Ministral-3-3B-Instruct-2512-GGUF/mmproj-F32.gguf
- !!merge <<: *mistral03
  name: "mistralai_ministral-3-3b-reasoning-2512-multimodal"
  urls:
    - https://huggingface.co/mistralai/Ministral-3-3B-Reasoning-2512
    - https://huggingface.co/unsloth/Ministral-3-3B-Reasoning-2512-GGUF
  description: |
    The smallest model in the Ministral 3 family, Ministral 3 3B is a powerful, efficient tiny language model with vision capabilities.

    This is the reasoning post-trained version, trained for reasoning tasks and ideal for math, coding, and STEM-related use cases.

    The Ministral 3 family is designed for edge deployment, capable of running on a wide range of hardware. Ministral 3 3B can even be deployed locally, fitting in 16GB of VRAM in BF16, and less than 8GB of RAM/VRAM when quantized.

    Key Features:
    Ministral 3 3B consists of two main architectural components:

    - 3.4B Language Model
    - 0.4B Vision Encoder

    The Ministral 3 3B Reasoning model offers the following capabilities:

    - Vision: Enables the model to analyze images and provide insights based on visual content, in addition to text.
    - Multilingual: Supports dozens of languages, including English, French, Spanish, German, Italian, Portuguese, Dutch, Chinese, Japanese, Korean, and Arabic.
    - System Prompt: Maintains strong adherence and support for system prompts.
    - Agentic: Offers best-in-class agentic capabilities with native function calling and JSON output.
    - Reasoning: Excels at complex, multi-step reasoning and dynamic problem-solving.
    - Edge-Optimized: Delivers best-in-class performance at a small scale, deployable anywhere.
    - Apache 2.0 License: Open-source license allowing usage and modification for both commercial and non-commercial purposes.
    - Large Context Window: Supports a 256k context window.

    This gallery entry includes the mmproj for multimodality and uses Unsloth's recommended defaults.
  tags:
    - llm
    - gguf
    - gpu
    - mistral
    - cpu
    - function-calling
    - multimodal
  overrides:
    context_size: 32768
    parameters:
      model: llama-cpp/models/mistralai_Ministral-3-3B-Reasoning-2512-Q4_K_M.gguf
      temperature: 0.7
      top_p: 0.95
    mmproj: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-3B-Reasoning-2512-f32.gguf
  files:
    - filename: llama-cpp/models/mistralai_Ministral-3-3B-Reasoning-2512-Q4_K_M.gguf
      sha256: a2648395d533b6d1408667d00e0b778f3823f3f3179ba371f89355f2e957e42e
      uri: huggingface://unsloth/Ministral-3-3B-Reasoning-2512-GGUF/Ministral-3-3B-Reasoning-2512-Q4_K_M.gguf
    - filename: llama-cpp/mmproj/mmproj-mistralai_Ministral-3-3B-Reasoning-2512-f32.gguf
      sha256: 8035a6a10dfc6250f50c62764fae3ac2ef6d693fc9252307c7093198aabba812
      uri: huggingface://unsloth/Ministral-3-3B-Reasoning-2512-GGUF/mmproj-F32.gguf
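Once installed, a multimodal entry like the instruct one above is exercised through LocalAI's OpenAI-compatible chat endpoint; the mmproj file is what enables the image_url content parts. A minimal sketch follows, assuming the default port, an illustrative image URL, and the model name from the entry above:

package main

import (
	"bytes"
	"fmt"
	"net/http"
)

func main() {
	// OpenAI-style vision request: a text part plus an image_url content part.
	body := []byte(`{
	  "model": "mistralai_ministral-3-3b-instruct-2512-multimodal",
	  "messages": [{
	    "role": "user",
	    "content": [
	      {"type": "text", "text": "Describe this image."},
	      {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}}
	    ]
	  }]
	}`)
	resp, err := http.Post("http://localhost:8080/v1/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}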
- &mudler
  url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
  name: "LocalAI-llama3-8b-function-call-v0.2"
@@ -12907,6 +13447,61 @@
    - filename: "phi-2-orange.Q4_0.gguf"
      sha256: "49cb710ae688e1b19b1b299087fa40765a0cd677e3afcc45e5f7ef6750975dcf"
      uri: "huggingface://TheBloke/phi-2-orange-GGUF/phi-2-orange.Q4_0.gguf"
- url: "github:mudler/LocalAI/gallery/phi-3-chat.yaml@master"
  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/652feb6b4e527bd115ffd6c8/YFwodyNe6LmUrzQNmrl-D.png
  license: mit
  tags:
    - llm
    - gguf
    - gpu
    - cpu
    - phi-3
  name: "npc-llm-3-8b"
  urls:
    - https://huggingface.co/Gigax/NPC-LLM-3_8B
    - https://huggingface.co/bartowski/NPC-LLM-3_8B-GGUF
  description: |
    NPC model fine-tuned from Phi-3 using LoRA.

    This model parses a text description of a game scene and outputs commands like:

    - say <player1> "Hello Adventurer, care to join me on a quest?"
    - greet <player1>
    - attack <player1>
    - Any other <action> <param> you add to the prompt! (We call these "skills"!)

    ⚠️ This model has been trained to overfit on a specific input prompt format. Follow it closely to reach optimal performance. ⚠️

    Input prompt

    Here's a sample input prompt showing the format the model has been trained on:

    - WORLD KNOWLEDGE: A vast open world full of mystery and adventure.
    - KNOWN LOCATIONS: Old Town
    - NPCS: John the Brave
    - CURRENT LOCATION: Old Town: A quiet and peaceful town.
    - CURRENT LOCATION ITEMS: Sword
    - LAST EVENTS:
      Aldren: Say Sword What a fine sword!
    - PROTAGONIST NAME: Aldren
    - PROTAGONIST PSYCHOLOGICAL PROFILE: Brave and curious
    - PROTAGONIST MEMORIES:
      Saved the village
      Lost a friend
    - PROTAGONIST PENDING QUESTS:
      Find the ancient artifact
      Defeat the evil warlock
    - PROTAGONIST ALLOWED ACTIONS:
      Attack <character> : Deliver a powerful blow
    Aldren:
  overrides:
    context_size: 4096
    parameters:
      model: NPC-LLM-3_8B-Q4_K_M.gguf
  files:
    - filename: NPC-LLM-3_8B-Q4_K_M.gguf
      uri: huggingface://bartowski/NPC-LLM-3_8B-GGUF/NPC-LLM-3_8B-Q4_K_M.gguf
      sha256: 5fcfb314566f0ae9364fe80237f96b12678aafbb8e82f90c6aece5ed2a6b83fd
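Because the model is trained to overfit on this exact layout, callers usually assemble the prompt programmatically rather than by hand. A hedged Go sketch follows; the struct and field names are illustrative inventions, only the rendered layout mirrors the sample above:

package main

import (
	"fmt"
	"strings"
)

// npcPrompt assembles the fixed-layout prompt documented in the model card.
type npcPrompt struct {
	WorldKnowledge string
	KnownLocations []string
	NPCs           []string
	Location       string
	LocationItems  []string
	LastEvents     []string
	Name           string
	Profile        string
	Memories       []string
	Quests         []string
	Actions        []string
}

func (p npcPrompt) String() string {
	var b strings.Builder
	fmt.Fprintf(&b, "- WORLD KNOWLEDGE: %s\n", p.WorldKnowledge)
	fmt.Fprintf(&b, "- KNOWN LOCATIONS: %s\n", strings.Join(p.KnownLocations, ", "))
	fmt.Fprintf(&b, "- NPCS: %s\n", strings.Join(p.NPCs, ", "))
	fmt.Fprintf(&b, "- CURRENT LOCATION: %s\n", p.Location)
	fmt.Fprintf(&b, "- CURRENT LOCATION ITEMS: %s\n", strings.Join(p.LocationItems, ", "))
	fmt.Fprintf(&b, "- LAST EVENTS:\n  %s\n", strings.Join(p.LastEvents, "\n  "))
	fmt.Fprintf(&b, "- PROTAGONIST NAME: %s\n", p.Name)
	fmt.Fprintf(&b, "- PROTAGONIST PSYCHOLOGICAL PROFILE: %s\n", p.Profile)
	fmt.Fprintf(&b, "- PROTAGONIST MEMORIES:\n  %s\n", strings.Join(p.Memories, "\n  "))
	fmt.Fprintf(&b, "- PROTAGONIST PENDING QUESTS:\n  %s\n", strings.Join(p.Quests, "\n  "))
	fmt.Fprintf(&b, "- PROTAGONIST ALLOWED ACTIONS:\n  %s\n", strings.Join(p.Actions, "\n  "))
	fmt.Fprintf(&b, "%s:", p.Name) // the trailing "Aldren:" cue the model completes
	return b.String()
}

func main() {
	p := npcPrompt{
		WorldKnowledge: "A vast open world full of mystery and adventure.",
		KnownLocations: []string{"Old Town"},
		NPCs:           []string{"John the Brave"},
		Location:       "Old Town: A quiet and peaceful town.",
		LocationItems:  []string{"Sword"},
		LastEvents:     []string{"Aldren: Say Sword What a fine sword!"},
		Name:           "Aldren",
		Profile:        "Brave and curious",
		Memories:       []string{"Saved the village", "Lost a friend"},
		Quests:         []string{"Find the ancient artifact", "Defeat the evil warlock"},
		Actions:        []string{"Attack <character> : Deliver a powerful blow"},
	}
	fmt.Println(p)
}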
### Internlm2
- name: "internlm2_5-7b-chat-1m"
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
go.mod (12 lines changed)
@@ -8,10 +8,11 @@ require (
	dario.cat/mergo v1.0.2
	fyne.io/fyne/v2 v2.7.2
	github.com/Masterminds/sprig/v3 v3.3.0
	github.com/alecthomas/kong v1.13.0
	github.com/anthropics/anthropic-sdk-go v1.20.0
	github.com/alecthomas/kong v1.14.0
	github.com/anthropics/anthropic-sdk-go v1.22.0
	github.com/charmbracelet/glamour v0.10.0
	github.com/containerd/containerd v1.7.30
	github.com/dhowden/tag v0.0.0-20240417053706-3d75831295e8
	github.com/ebitengine/purego v0.9.1
	github.com/emirpasic/gods/v2 v2.0.0-alpha
	github.com/fsnotify/fsnotify v1.9.0
@@ -23,7 +24,7 @@ require (
	github.com/gpustack/gguf-parser-go v0.23.1
	github.com/hpcloud/tail v1.0.0
	github.com/ipfs/go-log v1.0.5
	github.com/jaypipes/ghw v0.21.2
	github.com/jaypipes/ghw v0.22.0
	github.com/joho/godotenv v1.5.1
	github.com/klauspost/cpuid/v2 v2.3.0
	github.com/labstack/echo/v4 v4.15.0
@@ -37,9 +38,9 @@ require (
	github.com/mudler/go-processmanager v0.1.0
	github.com/mudler/memory v0.0.0-20251216220809-d1256471a6c2
	github.com/mudler/xlog v0.0.5
	github.com/onsi/ginkgo/v2 v2.28.0
	github.com/onsi/ginkgo/v2 v2.28.1
	github.com/onsi/gomega v1.39.1
	github.com/openai/openai-go/v3 v3.17.0
	github.com/openai/openai-go/v3 v3.19.0
	github.com/otiai10/copy v1.14.1
	github.com/otiai10/openaigo v1.7.0
	github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
@@ -66,7 +67,6 @@ require (
)

require (
	github.com/dhowden/tag v0.0.0-20240417053706-3d75831295e8 // indirect
	github.com/ghodss/yaml v1.0.0 // indirect
	github.com/labstack/gommon v0.4.2 // indirect
	github.com/swaggo/files/v2 v2.0.2 // indirect
go.sum (20 lines changed)
@@ -36,16 +36,16 @@ github.com/alecthomas/assert/v2 v2.11.0 h1:2Q9r3ki8+JYXvGsDyBXwH3LcJ+WK5D0gc5E8v
github.com/alecthomas/assert/v2 v2.11.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k=
github.com/alecthomas/chroma/v2 v2.14.0 h1:R3+wzpnUArGcQz7fCETQBzO5n9IMNi13iIs46aU4V9E=
github.com/alecthomas/chroma/v2 v2.14.0/go.mod h1:QolEbTfmUHIMVpBqxeDnNBj2uoeI4EbYP4i6n68SG4I=
github.com/alecthomas/kong v1.13.0 h1:5e/7XC3ugvhP1DQBmTS+WuHtCbcv44hsohMgcvVxSrA=
github.com/alecthomas/kong v1.13.0/go.mod h1:wrlbXem1CWqUV5Vbmss5ISYhsVPkBb1Yo7YKJghju2I=
github.com/alecthomas/kong v1.14.0 h1:gFgEUZWu2ZmZ+UhyZ1bDhuutbKN1nTtJTwh19Wsn21s=
github.com/alecthomas/kong v1.14.0/go.mod h1:wrlbXem1CWqUV5Vbmss5ISYhsVPkBb1Yo7YKJghju2I=
github.com/alecthomas/repr v0.5.2 h1:SU73FTI9D1P5UNtvseffFSGmdNci/O6RsqzeXJtP0Qs=
github.com/alecthomas/repr v0.5.2/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4=
github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y=
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c=
github.com/anthropics/anthropic-sdk-go v1.20.0 h1:KE6gQiAT1aBHMh3Dmp1WgqnyZZLJNo2oX3ka004oDLE=
github.com/anthropics/anthropic-sdk-go v1.20.0/go.mod h1:WTz31rIUHUHqai2UslPpw5CwXrQP3geYBioRV4WOLvE=
github.com/anthropics/anthropic-sdk-go v1.22.0 h1:sgo4Ob5pC5InKCi/5Ukn5t9EjPJ7KTMaKm5beOYt6rM=
github.com/anthropics/anthropic-sdk-go v1.22.0/go.mod h1:WTz31rIUHUHqai2UslPpw5CwXrQP3geYBioRV4WOLvE=
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8=
@@ -342,8 +342,8 @@ github.com/ipld/go-ipld-prime v0.21.0 h1:n4JmcpOlPDIxBcY037SVfpd1G+Sj1nKZah0m6QH
github.com/ipld/go-ipld-prime v0.21.0/go.mod h1:3RLqy//ERg/y5oShXXdx5YIp50cFGOanyMctpPjsvxQ=
github.com/jackpal/go-nat-pmp v1.0.2 h1:KzKSgb7qkJvOUTqYl9/Hg/me3pWgBmERKrTGD7BdWus=
github.com/jackpal/go-nat-pmp v1.0.2/go.mod h1:QPH045xvCAeXUZOxsnwmrtiCoxIr9eob+4orBN1SBKc=
github.com/jaypipes/ghw v0.21.2 h1:woW0lqNMPbYk59sur6thOVM8YFP9Hxxr8PM+JtpUrNU=
github.com/jaypipes/ghw v0.21.2/go.mod h1:GPrvwbtPoxYUenr74+nAnWbardIZq600vJDD5HnPsPE=
github.com/jaypipes/ghw v0.22.0 h1:v3G5E1Q7UO61xV15lls5a+2jkQNjM3Z3fE+KOWRt1j4=
github.com/jaypipes/ghw v0.22.0/go.mod h1:fUNUjMZ0cjahKo+/u+32m9FutIx53Nkbi0Ti0m7j5HY=
github.com/jaypipes/pcidb v1.1.1 h1:QmPhpsbmmnCwZmHeYAATxEaoRuiMAJusKYkUncMC0ro=
github.com/jaypipes/pcidb v1.1.1/go.mod h1:x27LT2krrUgjf875KxQXKB0Ha/YXLdZRVmw6hH0G7g8=
github.com/jbenet/go-temp-err-catcher v0.1.0 h1:zpb3ZH6wIE8Shj2sKS+khgRvf7T7RABoLk/+KKHggpk=
@@ -563,12 +563,12 @@ github.com/nxadm/tail v1.4.8 h1:nPr65rt6Y5JFSKQO7qToXr7pePgD6Gwiw05lkbyAQTE=
github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU=
github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE=
github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU=
github.com/onsi/ginkgo/v2 v2.28.0 h1:Rrf+lVLmtlBIKv6KrIGJCjyY8N36vDVcutbGJkyqjJc=
github.com/onsi/ginkgo/v2 v2.28.0/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
github.com/onsi/ginkgo/v2 v2.28.1 h1:S4hj+HbZp40fNKuLUQOYLDgZLwNUVn19N3Atb98NCyI=
github.com/onsi/ginkgo/v2 v2.28.1/go.mod h1:CLtbVInNckU3/+gC8LzkGUb9oF+e8W8TdUsxPwvdOgE=
github.com/onsi/gomega v1.39.1 h1:1IJLAad4zjPn2PsnhH70V4DKRFlrCzGBNrNaru+Vf28=
github.com/onsi/gomega v1.39.1/go.mod h1:hL6yVALoTOxeWudERyfppUcZXjMwIMLnuSfruD2lcfg=
github.com/openai/openai-go/v3 v3.17.0 h1:CfTkmQoItolSyW+bHOUF190KuX5+1Zv6MC0Gb4wAwy8=
github.com/openai/openai-go/v3 v3.17.0/go.mod h1:cdufnVK14cWcT9qA1rRtrXx4FTRsgbDPW7Ia7SS5cZo=
github.com/openai/openai-go/v3 v3.19.0 h1:xS/UQeSaNuL4bZjq28/rBrA4OZaq1BcYLBwQm9Vx8cI=
github.com/openai/openai-go/v3 v3.19.0/go.mod h1:cdufnVK14cWcT9qA1rRtrXx4FTRsgbDPW7Ia7SS5cZo=
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040=
pkg/utils/base64.go
@@ -21,6 +21,10 @@ var dataURIPattern = regexp.MustCompile(`^data:([^;]+);base64,`)
// GetContentURIAsBase64 checks if the string is a URL; if it is, it downloads the content in memory, encodes it in base64, and returns the base64 string. Otherwise it returns the string with any base64 data headers stripped.
func GetContentURIAsBase64(s string) (string, error) {
	if strings.HasPrefix(s, "http") || strings.HasPrefix(s, "https") {
		if err := ValidateExternalURL(s); err != nil {
			return "", fmt.Errorf("URL validation failed: %w", err)
		}

		// download the image
		resp, err := base64DownloadClient.Get(s)
		if err != nil {
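For context on how the new guard is reached, here is a minimal caller-side sketch (an illustration, not code from this diff): user-provided content references pass through GetContentURIAsBase64, and with this change remote URLs are SSRF-checked before any request is made.

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/utils"
)

func main() {
	// Illustrative inputs: one public URL, one internal address that the
	// new validation should now reject before any request is made.
	for _, src := range []string{
		"https://example.com/cat.png",
		"http://169.254.169.254/latest/meta-data/",
	} {
		b64, err := utils.GetContentURIAsBase64(src)
		if err != nil {
			fmt.Println(src, "->", err)
			continue
		}
		fmt.Println(src, "-> encoded", len(b64), "base64 chars")
	}
}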
pkg/utils/urlfetch.go (new file, 78 lines)
@@ -0,0 +1,78 @@
package utils

import (
	"fmt"
	"net"
	"net/url"
	"strings"
)

// ValidateExternalURL checks that the given URL does not point to a private,
// loopback, link-local, or otherwise internal network address. This prevents
// Server-Side Request Forgery (SSRF) attacks where a user-supplied URL could
// be used to probe internal services or cloud metadata endpoints.
func ValidateExternalURL(rawURL string) error {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}

	scheme := strings.ToLower(parsed.Scheme)
	if scheme != "http" && scheme != "https" {
		return fmt.Errorf("unsupported URL scheme: %s", scheme)
	}

	hostname := parsed.Hostname()
	if hostname == "" {
		return fmt.Errorf("URL has no hostname")
	}

	// Block well-known internal hostnames
	lower := strings.ToLower(hostname)
	if lower == "localhost" || strings.HasSuffix(lower, ".local") {
		return fmt.Errorf("requests to internal hosts are not allowed")
	}

	// Block cloud metadata service hostnames
	if lower == "metadata.google.internal" || lower == "instance-data" {
		return fmt.Errorf("requests to cloud metadata services are not allowed")
	}

	ips, err := net.LookupHost(hostname)
	if err != nil {
		return fmt.Errorf("failed to resolve hostname: %w", err)
	}

	for _, ipStr := range ips {
		ip := net.ParseIP(ipStr)
		if ip == nil {
			return fmt.Errorf("unable to parse resolved IP: %s", ipStr)
		}

		if !isPublicIP(ip) {
			return fmt.Errorf("requests to internal network addresses are not allowed")
		}
	}

	return nil
}

func isPublicIP(ip net.IP) bool {
	if ip.IsLoopback() ||
		ip.IsLinkLocalUnicast() ||
		ip.IsLinkLocalMulticast() ||
		ip.IsPrivate() ||
		ip.IsUnspecified() {
		return false
	}

	// Block IPv4-mapped IPv6 addresses that wrap private IPv4
	if ip4 := ip.To4(); ip4 != nil {
		return !ip4.IsLoopback() &&
			!ip4.IsLinkLocalUnicast() &&
			!ip4.IsPrivate() &&
			!ip4.IsUnspecified()
	}

	return true
}
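A sketch of the intended call pattern (an illustration, not code from this diff): validate first, then fetch. Two notes: resolve-then-fetch leaves a time-of-check/time-of-use gap (DNS rebinding), which pinning the dialer to the validated IPs would close; and Go's IsLoopback/IsPrivate already unwrap IPv4-mapped IPv6 addresses via To4, so the explicit To4 re-check in isPublicIP above is defensive rather than strictly required.

package main

import (
	"fmt"
	"io"
	"net/http"

	"github.com/mudler/LocalAI/pkg/utils"
)

// fetchUntrusted guards a user-supplied URL with ValidateExternalURL
// before performing the actual HTTP request.
func fetchUntrusted(rawURL string) ([]byte, error) {
	if err := utils.ValidateExternalURL(rawURL); err != nil {
		return nil, fmt.Errorf("refusing to fetch: %w", err)
	}
	resp, err := http.Get(rawURL)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	return io.ReadAll(resp.Body)
}

func main() {
	if _, err := fetchUntrusted("http://10.0.0.1/secret"); err != nil {
		fmt.Println(err) // refusing to fetch: requests to internal network addresses are not allowed
	}
}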
pkg/utils/urlfetch_test.go (new file, 99 lines)
@@ -0,0 +1,99 @@
package utils_test

import (
	. "github.com/mudler/LocalAI/pkg/utils"
	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

var _ = Describe("utils/urlfetch tests", func() {
	Context("ValidateExternalURL", func() {
		It("allows valid external HTTPS URLs", func() {
			err := ValidateExternalURL("https://example.com/image.png")
			Expect(err).To(BeNil())
		})

		It("allows valid external HTTP URLs", func() {
			err := ValidateExternalURL("http://example.com/image.png")
			Expect(err).To(BeNil())
		})

		It("blocks localhost", func() {
			err := ValidateExternalURL("http://localhost/secret")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("internal"))
		})

		It("blocks 127.0.0.1", func() {
			err := ValidateExternalURL("http://127.0.0.1/secret")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("internal"))
		})

		It("blocks private 10.x.x.x range", func() {
			err := ValidateExternalURL("http://10.0.0.1/secret")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("internal"))
		})

		It("blocks private 172.16.x.x range", func() {
			err := ValidateExternalURL("http://172.16.0.1/secret")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("internal"))
		})

		It("blocks private 192.168.x.x range", func() {
			err := ValidateExternalURL("http://192.168.1.1/secret")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("internal"))
		})

		It("blocks link-local 169.254.x.x (AWS metadata)", func() {
			err := ValidateExternalURL("http://169.254.169.254/latest/meta-data/")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("internal"))
		})

		It("blocks unsupported schemes", func() {
			err := ValidateExternalURL("ftp://example.com/file")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("unsupported URL scheme"))
		})

		It("blocks file:// scheme", func() {
			err := ValidateExternalURL("file:///etc/passwd")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("unsupported URL scheme"))
		})

		It("blocks URLs with no hostname", func() {
			err := ValidateExternalURL("http:///path")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("no hostname"))
		})

		It("blocks .local hostnames", func() {
			err := ValidateExternalURL("http://myservice.local/api")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("internal"))
		})

		It("blocks metadata.google.internal", func() {
			err := ValidateExternalURL("http://metadata.google.internal/computeMetadata/v1/")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("metadata"))
		})

		It("blocks 0.0.0.0", func() {
			err := ValidateExternalURL("http://0.0.0.0/")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("internal"))
		})

		It("blocks IPv6 loopback ::1", func() {
			err := ValidateExternalURL("http://[::1]/secret")
			Expect(err).ToNot(BeNil())
			Expect(err.Error()).To(ContainSubstring("internal"))
		})
	})
})
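One wiring detail worth flagging (an assumption, since the suite bootstrap is not part of this diff): Ginkgo specs like the above only run if the package has a suite entry point. If pkg/utils does not already have one, the conventional bootstrap looks like this:

package utils_test

import (
	"testing"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

// TestUtils hands control to Ginkgo, which discovers the Describe blocks
// registered in this package (including the urlfetch specs above).
func TestUtils(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "Utils test suite")
}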