Merge origin/master + pin-sync paged backend to 0ed235ea

master auto-bumped the stock llama-cpp pin 9d5d882d -> 0ed235ea and updated the
shared grpc-server.cpp. The paged backend's pin must track the stock pin (the
grpc-server.cpp is shared), so bump its LLAMA_VERSION to match. All 28 paged
patches apply clean on 0ed235ea (verified against a fresh upstream clone). The
bf16-tau state-serialization fix (patch 0026) is included. Bit-exact gate + full
grpc-server build verify on GPU/CI to follow.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-06-28 07:56:47 +00:00
95 changed files with 6339 additions and 487 deletions

View File

@@ -137,7 +137,7 @@ RUN <<EOT bash
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
apt-get install -y --no-install-recommends \
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
fi
apt-get clean && \
rm -rf /var/lib/apt/lists/*

View File

@@ -1,15 +1,6 @@
## Clip/LLaVA library for multimodal support — built locally from copied sources
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)
target_include_directories(myclip PUBLIC ../../common)
target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual)
endif()
## Multimodal support is provided by the in-tree `mtmd` library target
## (examples/mtmd/), which the grpc-server links and includes below. clip/llava
## were pruned upstream; the high-level mtmd_* / mtmd_helper_* API is used instead.
set(TARGET grpc-server)
set(CMAKE_CXX_STANDARD 17)
@@ -67,12 +58,16 @@ add_library(hw_grpc_proto
${hw_proto_hdrs} )
add_executable(${TARGET} grpc-server.cpp json.hpp)
target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
# mtmd public headers (mtmd.h / mtmd-helper.h) live in examples/mtmd/.
# Linking the mtmd target also propagates this include dir, but we add it
# explicitly for clarity.
target_include_directories(${TARGET} PRIVATE ../mtmd)
target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
absl::flags_parse
gRPC::${_REFLECTION}
gRPC::${_GRPC_GRPCPP}
protobuf::${_PROTOBUF_LIBPROTOBUF})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View File

@@ -1,5 +1,5 @@
IK_LLAMA_VERSION?=b84902d2ad27c34f989f23947200c4b91b1568fd
IK_LLAMA_VERSION?=f96eaddba8bed6a9a5e628bbf6a566775c70b49c
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
CMAKE_ARGS?=

View File

@@ -11,8 +11,8 @@
#include <memory>
#include <string>
#include <getopt.h>
#include "clip.h"
#include "llava.h"
#include "mtmd.h"
#include "mtmd-helper.h"
#include "log.h"
#include "common.h"
#include "json.hpp"
@@ -45,7 +45,9 @@ using backend::HealthMessage;
///// LLAMA.CPP server code below
using json = nlohmann::json;
// Match mtmd.h and ik_llama's server/common headers, which all use
// nlohmann::ordered_json; a plain nlohmann::json alias collides at global scope.
using json = nlohmann::ordered_json;
struct server_params
{
@@ -219,6 +221,11 @@ struct llama_client_slot
// multimodal
std::vector<slot_image> images;
// Full prompt with mtmd media markers (mtmd_default_marker()) substituted in
// place of the legacy [img-N] tags, covering the text up to and including the
// last image. The text after the last image is kept in params.input_suffix and
// decoded through the normal token path so the sampling loop is unchanged.
std::string mtmd_prompt;
// stats
size_t sent_count = 0;
@@ -252,14 +259,14 @@ struct llama_client_slot
for (slot_image & img : images)
{
free(img.image_embedding);
if (img.img_data) {
clip_image_u8_free(img.img_data);
if (img.bitmap) {
mtmd_bitmap_free(img.bitmap);
img.bitmap = nullptr;
}
img.prefix_prompt = "";
}
images.clear();
mtmd_prompt = "";
}
bool has_budget(gpt_params &global_params) {
@@ -396,46 +403,13 @@ struct llama_metrics {
}
};
struct llava_embd_batch {
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
};
struct llama_server_context
{
llama_model *model = nullptr;
llama_context *ctx = nullptr;
const llama_vocab * vocab = nullptr;
clip_ctx *clp_ctx = nullptr;
mtmd_context *mctx = nullptr;
gpt_params params;
@@ -491,11 +465,6 @@ struct llama_server_context
if (!params.mmproj.path.empty()) {
multimodal = true;
LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_model_load(params.mmproj.path.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) {
LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
return false;
}
if (params.n_ctx < 2048) { // request larger context for the image embedding
params.n_ctx = 2048;
@@ -512,10 +481,24 @@ struct llama_server_context
}
if (multimodal) {
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
const int n_embd_llm = llama_model_n_embd(model);
if (n_embd_clip != n_embd_llm) {
LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
// mtmd_init_from_file requires the already-loaded text model, so it must
// run AFTER llama_init_from_gpt_params. It validates the projector
// against the model internally and returns nullptr on dim mismatch, so
// the explicit clip_n_mmproj_embd check is no longer needed.
mtmd_context_params mparams = mtmd_context_params_default();
mparams.use_gpu = params.mmproj_use_gpu;
mparams.print_timings = false;
mparams.n_threads = params.n_threads_mtmd != -1 ? params.n_threads_mtmd
: params.n_threads_batch != -1 ? params.n_threads_batch
: params.n_threads;
mparams.verbosity = GGML_LOG_LEVEL_INFO;
mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED
: LLAMA_FLASH_ATTN_TYPE_DISABLED;
mparams.image_min_tokens = params.image_min_tokens;
mparams.image_max_tokens = params.image_max_tokens;
mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
if (mctx == nullptr) {
LOG_ERR("unable to load multimodal projector: %s", params.mmproj.path.c_str());
llama_free(ctx);
llama_free_model(model);
return false;
@@ -865,8 +848,8 @@ struct llama_server_context
slot_image img_sl;
img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
img_sl.bitmap = mtmd_helper_bitmap_init_from_buf(mctx, image_buffer.data(), image_buffer.size());
if (img_sl.bitmap == nullptr)
{
LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
__func__,
@@ -879,50 +862,74 @@ struct llama_server_context
{"slot_id", slot->id},
{"img_sl_id", img_sl.id}
});
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
// process prompt
// example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
// Translate the legacy [img-N] tags into mtmd media markers, in
// order, and collect the matching bitmaps in marker order so they
// line up with the markers passed to mtmd_tokenize(). The text after
// the last image stays in input_suffix and is decoded through the
// normal token path, so the sampling loop is unchanged.
// example: system prompt [img-102] user [img-103] describe [img-134]
if (slot->images.size() > 0 && !slot->prompt.is_array())
{
const std::string marker = mtmd_default_marker();
std::string prompt = slot->prompt.get<std::string>();
size_t pos = 0, begin_prefix = 0;
std::string built_prompt;
std::vector<slot_image> ordered;
size_t pos = 0, copy_from = 0;
std::string pattern = "[img-";
while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
size_t end_prefix = pos;
pos += pattern.length();
size_t end_pos = prompt.find(']', pos);
if (end_pos != std::string::npos)
{
std::string image_id = prompt.substr(pos, end_pos - pos);
try
{
int img_id = std::stoi(image_id);
bool found = false;
for (slot_image &img : slot->images)
{
if (img.id == img_id) {
found = true;
img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
begin_prefix = end_pos + 1;
break;
}
}
if (!found) {
LOG("ERROR: Image with id: %i, not found.\n", img_id);
slot->images.clear();
return false;
}
} catch (const std::invalid_argument& e) {
LOG("Invalid image number id in prompt\n");
slot->images.clear();
return false;
auto free_images = [&]() {
for (slot_image &img : slot->images) {
if (img.bitmap) {
mtmd_bitmap_free(img.bitmap);
img.bitmap = nullptr;
}
}
slot->images.clear();
};
while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
size_t tag_begin = pos;
pos += pattern.length();
size_t end_pos = prompt.find(']', pos);
if (end_pos == std::string::npos) {
break;
}
std::string image_id = prompt.substr(pos, end_pos - pos);
try
{
int img_id = std::stoi(image_id);
bool found = false;
for (slot_image &img : slot->images)
{
if (img.id == img_id) {
found = true;
// text before this tag, then the media marker
built_prompt += prompt.substr(copy_from, tag_begin - copy_from);
built_prompt += marker;
copy_from = end_pos + 1;
ordered.push_back(img);
break;
}
}
if (!found) {
LOG("ERROR: Image with id: %i, not found.\n", img_id);
free_images();
return false;
}
} catch (const std::invalid_argument& e) {
LOG("Invalid image number id in prompt\n");
free_images();
return false;
}
pos = end_pos + 1;
}
// bitmaps are consumed in marker order by mtmd_tokenize()
slot->images = ordered;
slot->mtmd_prompt = built_prompt;
slot->prompt = "";
slot->params.input_suffix = prompt.substr(begin_prefix);
slot->params.input_suffix = prompt.substr(copy_from);
slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
}
}
@@ -1176,21 +1183,10 @@ struct llama_server_context
bool process_images(llama_client_slot &slot) const
{
for (slot_image &img : slot.images)
{
if (!img.request_encode_image)
{
continue;
}
if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
LOG("Error processing the given image");
return false;
}
img.request_encode_image = false;
}
// With the mtmd pipeline, image encoding is no longer eager: the bitmaps
// are tokenized and encoded together with the surrounding text inside
// ingest_images() via mtmd_tokenize() + mtmd_helper_eval_chunks(). This
// just reports whether the slot carries any images to process.
return slot.images.size() > 0;
}
@@ -1435,69 +1431,70 @@ struct llama_server_context
}
}
// for multiple images processing
// Tokenize the multimodal prompt (text interleaved with media markers) together
// with the slot's bitmaps, then decode the resulting chunks into the llama
// context via the high-level mtmd helper. The helper runs llama_decode() on the
// text chunks and mtmd_encode() + llama_decode() on the image chunks, handling
// batching and any pre/post decode setup (e.g. non-causal attention for gemma3).
// Advances slot.n_past by the number of positions consumed, then leaves the
// post-image suffix tokens in `batch` so the normal decode + sampling loop
// produces the first generated token.
bool ingest_images(llama_client_slot &slot, int n_batch)
{
int image_idx = 0;
while (image_idx < (int) slot.images.size())
if (mctx == nullptr)
{
slot_image &img = slot.images[image_idx];
LOG("%s : multimodal context is not initialized\n", __func__);
return false;
}
// process prefix prompt
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
{
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
llama_batch batch_view = {
n_tokens,
batch.token + i,
nullptr,
batch.pos + i,
batch.n_seq_id + i,
batch.seq_id + i,
batch.logits + i,
};
if (llama_decode(ctx, batch_view))
{
LOG("%s : failed to eval\n", __func__);
return false;
}
}
// bitmaps stay owned by slot.images (freed on reset()); pass non-owning ptrs
std::vector<const mtmd_bitmap *> bitmaps;
bitmaps.reserve(slot.images.size());
for (const slot_image &img : slot.images)
{
bitmaps.push_back(img.bitmap);
}
// process image with llm
for (int i = 0; i < img.image_tokens; i += n_batch)
{
int n_eval = img.image_tokens - i;
if (n_eval > n_batch)
{
n_eval = n_batch;
}
mtmd_input_text inp_txt;
inp_txt.text = slot.mtmd_prompt.c_str();
inp_txt.add_special = add_bos_token;
inp_txt.parse_special = true;
const int n_embd = llama_model_n_embd(model);
float * embd = img.image_embedding + i * n_embd;
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
if (llama_decode(ctx, llava_batch.batch))
{
LOG("%s : failed to eval image\n", __func__);
return false;
}
slot.n_past += n_eval;
}
image_idx++;
mtmd::input_chunks chunks(mtmd_input_chunks_init());
int32_t res = mtmd_tokenize(mctx,
chunks.ptr.get(),
&inp_txt,
bitmaps.data(),
bitmaps.size());
if (res != 0)
{
LOG("%s : failed to tokenize multimodal prompt, res = %d\n", __func__, res);
return false;
}
common_batch_clear(batch);
const llama_pos start_pos = (llama_pos) system_tokens.size() + slot.n_past;
llama_pos new_n_past = start_pos;
if (mtmd_helper_eval_chunks(mctx,
ctx,
chunks.ptr.get(),
start_pos,
slot.id,
n_batch,
/*logits_last=*/ false,
&new_n_past) != 0)
{
LOG("%s : failed to eval multimodal chunks\n", __func__);
return false;
}
slot.n_past += (int32_t) (new_n_past - start_pos);
// append prefix of next image
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
slot.params.input_suffix : // no more images, then process suffix prompt
(json)(slot.images[image_idx].prefix_prompt);
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
for (int i = 0; i < (int) append_tokens.size(); ++i)
{
common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
slot.n_past += 1;
}
// queue the post-image suffix text for the normal decode + sampling path
common_batch_clear(batch);
std::vector<llama_token> suffix_tokens = tokenize(slot.params.input_suffix, false);
for (llama_token tok : suffix_tokens)
{
common_batch_add(batch, tok, system_tokens.size() + slot.n_past, { slot.id }, false);
slot.n_past += 1;
}
return true;
@@ -1884,8 +1881,11 @@ struct llama_server_context
const bool has_images = process_images(slot);
// process the prefix of first image
std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
// For the multimodal path the whole pre-image / inter-image text is
// tokenized and decoded inside ingest_images() via mtmd, so no prefix
// tokens are queued here; the post-image suffix is appended by
// ingest_images() for the normal decode + sampling loop.
std::vector<llama_token> prefix_tokens = has_images ? std::vector<llama_token>() : prompt_tokens;
int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;

View File

@@ -1,11 +0,0 @@
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2494,7 +2494,7 @@
}
new_data = work.data();
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr, nullptr);
} else {
new_type = cur->type;
new_data = cur->data;

View File

@@ -17,28 +17,9 @@ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -r utils.hpp llama.cpp/examples/grpc-server/
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/examples/grpc-server/
## Copy clip/llava files for multimodal support (built as myclip library)
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
# Prepend llama.h include to llava.h
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
# Copy clip-impl.h if it exists
if [ -f llama.cpp/examples/llava/clip-impl.h ]; then
cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
fi
# Copy stb_image.h
if [ -f llama.cpp/vendor/stb/stb_image.h ]; then
cp -rfv llama.cpp/vendor/stb/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
elif [ -f llama.cpp/common/stb_image.h ]; then
cp -rfv llama.cpp/common/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
fi
## Fix API compatibility in llava.cpp (llama_n_embd -> llama_model_n_embd)
if [ -f llama.cpp/examples/grpc-server/llava.cpp ]; then
sed -i 's/llama_n_embd(/llama_model_n_embd(/g' llama.cpp/examples/grpc-server/llava.cpp
fi
## Multimodal support is provided by the `mtmd` library target (examples/mtmd/),
## which the grpc-server links and includes directly. No source copy is needed:
## clip/llava were pruned upstream and the high-level mtmd_* API is used instead.
set +e
if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then

View File

@@ -11,9 +11,12 @@
#include "json.hpp"
#include "clip.h"
#include "mtmd.h"
using json = nlohmann::json;
// mtmd.h and ik_llama's entire server/common stack (chat.h, server-common.h,
// server-task.h, ...) declare `using json = nlohmann::ordered_json`, so match it
// here: a plain `nlohmann::json` alias collides with mtmd.h's at global scope.
using json = nlohmann::ordered_json;
extern bool server_verbose;
@@ -111,13 +114,12 @@ struct slot_image
{
int32_t id;
bool request_encode_image = false;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 * img_data;
std::string prefix_prompt; // before of this image
// mtmd bitmap (image/audio) decoded from the request buffer. Owned by the
// slot; freed via mtmd_bitmap_free() on reset. The high-level mtmd pipeline
// (mtmd_tokenize + mtmd_helper_eval_chunks) consumes these directly, so the
// legacy eager-encode fields (embedding/tokens) and per-image prefix prompt
// are no longer needed.
mtmd_bitmap * bitmap = nullptr;
};
// completion token output with probabilities

View File

@@ -49,7 +49,7 @@
# helpers that the refactor pulled into the headers grpc-server.cpp includes.
# Therefore a PIN_SYNC must pass the FULL grpc-server build/link on CI, not only
# the bit-exact gate. See README section 7 + .agents/llama-cpp-localai-paged-backend.md.
LLAMA_VERSION?=9d5d882d8cd0f0a9283d87ed5e6fe3ee0d925fb1
LLAMA_VERSION?=0ed235ea2c17a19fc8238668653946721ed136fd
CMAKE_ARGS?=
BUILD_TYPE?=

View File

@@ -4,7 +4,7 @@
# (backend/cpp/llama-cpp-localai-paged) does NOT inherit this pin: it owns its
# own LLAMA_VERSION because its vendored patch series would break on a naive
# bump and is advanced only by the manual PIN_SYNC process.
LLAMA_VERSION?=9d5d882d8cd0f0a9283d87ed5e6fe3ee0d925fb1
LLAMA_VERSION?=0ed235ea2c17a19fc8238668653946721ed136fd
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=
@@ -161,11 +161,11 @@ llama-cpp-grpc: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target ggml-rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
llama-cpp-rpc-server: llama-cpp-grpc
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/ggml-rpc-server llama-cpp-rpc-server
llama.cpp:
mkdir -p llama.cpp

View File

@@ -30,6 +30,19 @@
#define LOCALAI_HAS_SERVER_SCHEMA 1
#include "server-schema.cpp"
#endif
// server-stream.cpp exists only in llama.cpp after the upstream refactor that
// added the SSE stream-resumption layer (stream_session/stream_pipe_producer).
// server-context.cpp calls into it (spipe->cleanup(), stream_aware_should_stop,
// stream_session_attach_pipe), so its definitions must be part of this
// translation unit or the link fails with "undefined reference to
// stream_pipe_producer::cleanup()". The file is self-contained (its only
// external symbols come from server-common, already pulled in above) and the
// http route-handler factories it also defines are unused here but harmless.
// __has_include keeps the source compatible with older pins/forks that predate
// the split.
#if __has_include("server-stream.cpp")
#include "server-stream.cpp"
#endif
#include "server-context.cpp"
// LocalAI

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# CrispASR version (release tag)
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
CRISPASR_VERSION?=8f1218141b792b8868861c1af17ba1e361b05dc0
CRISPASR_VERSION?=6514c9da00b03a2f0f1b49a43fae4f3a01a41844
SO_TARGET?=libgocrispasr.so
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

18
backend/go/face-detect/.gitignore vendored Normal file
View File

@@ -0,0 +1,18 @@
# Fetched upstream sources
sources/
# CMake build directories
build*/
# build artifacts staged in-tree by the Makefile (cp from sources/) or
# symlinked for local dev; the real sources live in face-detect.cpp upstream.
*.so
*.so.*
facedetect_capi.h
compile_commands.json
# Compiled backend binary
face-detect-grpc
# Packaging output
package/

View File

@@ -0,0 +1,110 @@
# face-detect backend Makefile.
#
# Upstream pin lives below as FACEDETECT_VERSION?=06914b0... (.github/bump_deps.sh
# can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
# convention).
#
# Local dev shortcut: if you already have an out-of-tree face-detect.cpp build,
# symlink the .so + header into this directory and skip the clone/cmake steps:
#
# ln -sf /path/to/face-detect.cpp/build-shared/libfacedetect.so .
# ln -sf /path/to/face-detect.cpp/include/facedetect_capi.h .
# go build -o face-detect-grpc .
#
# The default target below does the proper clone-at-pin + cmake build so CI does
# not need a side-checkout.
FACEDETECT_VERSION?=06914b077d52f90d5421299138e7be6bdd06b5e8
FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp
GOCMD?=go
GO_TAGS?=
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
BUILD_TYPE?=
NATIVE?=false
# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
# Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
# so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
# only system libs (libstdc++/libgomp/libc) the runtime image already provides.
# The vendored jpeg symbols are hidden via -Wl,--exclude-libs,ALL on the C++
# side, so only the facedetect_capi_* surface is exported.
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DFACEDETECT_SHARED=ON -DFACEDETECT_BUILD_CLI=OFF -DFACEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif
# face-detect.cpp gates its GGML backends behind FACEDETECT_GGML_* options and
# does set(GGML_CUDA ${FACEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
# -DGGML_CUDA=ON is overwritten back to OFF. Forward the FACEDETECT_GGML_*
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, SCRFD 2.3x
# vs torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
ifeq ($(CUDA_MAJOR_VERSION),13)
ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
endif
endif
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DFACEDETECT_GGML_HIP=ON
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DFACEDETECT_GGML_VULKAN=ON
else ifeq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DFACEDETECT_GGML_METAL=ON
endif
.PHONY: face-detect-grpc package build clean purge test all
all: face-detect-grpc
# Clone the upstream face-detect.cpp source at the pinned commit. Directory acts
# as the target so make only re-clones when missing. After a FACEDETECT_VERSION
# bump, run 'make purge && make' to refetch.
sources/face-detect.cpp:
mkdir -p sources/face-detect.cpp
cd sources/face-detect.cpp && \
git init -q && \
git remote add origin $(FACEDETECT_REPO) && \
git fetch --depth 1 origin $(FACEDETECT_VERSION) && \
git checkout FETCH_HEAD && \
git submodule update --init --recursive --depth 1 --single-branch
# Build the shared lib + header out-of-tree, then stage them next to the Go
# sources so purego.Dlopen("libfacedetect.so") and the cgo-less build both pick
# them up.
libfacedetect.so: sources/face-detect.cpp
cmake -B sources/face-detect.cpp/build-shared -S sources/face-detect.cpp $(CMAKE_ARGS)
cmake --build sources/face-detect.cpp/build-shared --config Release -j$(JOBS) --target facedetect
cp -fv sources/face-detect.cpp/build-shared/libfacedetect.so* ./ 2>/dev/null || true
cp -fv sources/face-detect.cpp/include/facedetect_capi.h ./
face-detect-grpc: libfacedetect.so main.go gofacedetect.go options.go
CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o face-detect-grpc .
package: face-detect-grpc
bash package.sh
build: package
# Test target. The embed/detect/verify/analyze smoke specs are gated on
# FACEDETECT_BACKEND_TEST_MODEL + FACEDETECT_BACKEND_TEST_IMAGE; without them the
# heavy specs auto-skip and only the pure-Go parsing specs run.
test:
LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
clean: purge
rm -rf libfacedetect.so* facedetect_capi.h package face-detect-grpc
purge:
rm -rf sources/face-detect.cpp

View File

@@ -0,0 +1,431 @@
package main
import (
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/xlog"
)
// purego-bound entry points from libfacedetect.so. Names match
// facedetect_capi.h exactly so a `nm libfacedetect.so | grep facedetect_capi`
// is enough to spot drift.
//
// The opaque ctx and the malloc'd char*/float* return values are declared as
// uintptr so we get the raw pointer back and can release it via the matching
// capi free function. purego's native string/[]float32 returns would copy and
// forget the original pointer, leaking the C-owned buffer on every call.
var (
CppAbiVersion func() int32
CppLoad func(ggufPath string) uintptr
CppFree func(ctx uintptr)
CppLastError func(ctx uintptr) string
CppFreeString func(s uintptr)
CppFreeVec func(v uintptr)
CppEmbedPath func(ctx uintptr, imagePath string, outVec, outDim unsafe.Pointer) int32
CppEmbedRGB func(ctx uintptr, rgb []byte, width, height int32, outVec, outDim unsafe.Pointer) int32
CppDetectJSON func(ctx uintptr, imagePath string) uintptr
CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, antiSpoof int32, outDistance, outVerified unsafe.Pointer) int32
CppAnalyzeJSON func(ctx uintptr, imagePath string) uintptr
)
// FaceDetect implements the face-recognition (biometric) subset of the Backend
// gRPC service over libfacedetect.so. The C side keeps a single loaded model
// pack plus a per-ctx last-error buffer and is not reentrant, so
// base.SingleThread serializes every call.
type FaceDetect struct {
base.SingleThread
opts loadOptions
ctxPtr uintptr
}
func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
model := opts.ModelFile
if model == "" {
model = opts.ModelPath
}
if !filepath.IsAbs(model) && opts.ModelPath != "" {
model = filepath.Join(opts.ModelPath, model)
}
if model == "" {
return errors.New("face-detect: ModelFile is required")
}
f.opts = parseOptions(opts.Options)
if f.opts.modelName == "" {
f.opts.modelName = filepath.Base(model)
}
// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
// one backend process per model and serves requests concurrently, so the
// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
// FACEDETECT_THREADS is read by the engine at backend construction, so it
// must be set before the capi load. A non-positive Threads means "unset":
// leave the env alone so the engine keeps its sane default.
threads := opts.Threads
if threads > 0 {
if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
}
xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
}
xlog.Info("face-detect: loading model", "model", model,
"verify_threshold", f.opts.verifyThreshold, "abi", CppAbiVersion())
ctx := CppLoad(model)
if ctx == 0 {
// The last-error buffer lives on the ctx that was never returned, so
// surface the path the operator tried to load instead.
return fmt.Errorf("face-detect: facedetect_capi_load failed for %q", model)
}
f.ctxPtr = ctx
return nil
}
// Embeddings returns the L2-normalized ArcFace embedding of the primary face in
// the supplied image. Mirroring the Python face backend, the image is read from
// Images[0] as a base64 payload; materializeImage decodes it to a temp file so
// the path-based C-API can run its own decode (cv2.imread parity). The gRPC
// server wraps the returned slice in an EmbeddingResult.
func (f *FaceDetect) Embeddings(req *pb.PredictOptions) ([]float32, error) {
if f.ctxPtr == 0 {
return nil, errors.New("face-detect: model not loaded")
}
if len(req.Images) == 0 || req.Images[0] == "" {
return nil, errors.New("face-detect: Embedding requires Images[0] to be a base64 image")
}
path, cleanup, err := materializeImage(req.Images[0])
if err != nil {
return nil, err
}
defer cleanup()
return f.embedPath(path)
}
func (f *FaceDetect) embedPath(path string) ([]float32, error) {
var vec uintptr
var dim int32
rc := CppEmbedPath(f.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
if rc != 0 || vec == 0 || dim <= 0 {
return nil, f.lastErr("embed", path)
}
defer CppFreeVec(vec)
// Copy out of the C-owned malloc'd buffer before freeing it. The
// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
// nor moves this buffer and we copy immediately.
src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
out := make([]float32, int(dim))
copy(out, src)
return out, nil
}
// Detect runs SCRFD over the image and returns one Detection per face. The
// C-API emits a box as [x1,y1,x2,y2] in pixels; the proto carries x/y plus
// width/height, so the corners are converted. The 5 facial landmarks the engine
// also returns are dropped: the Detection message has no field for them.
func (f *FaceDetect) Detect(req *pb.DetectOptions) (pb.DetectResponse, error) {
if f.ctxPtr == 0 {
return pb.DetectResponse{}, errors.New("face-detect: model not loaded")
}
if req.Src == "" {
return pb.DetectResponse{}, errors.New("face-detect: src image is required")
}
path, cleanup, err := materializeImage(req.Src)
if err != nil {
return pb.DetectResponse{}, err
}
defer cleanup()
faces, err := f.detectFaces(path)
if err != nil {
return pb.DetectResponse{}, err
}
dets := make([]*pb.Detection, 0, len(faces))
for _, fc := range faces {
if req.Threshold > 0 && fc.Score < req.Threshold {
continue
}
x, y, w, h := fc.xywh()
dets = append(dets, &pb.Detection{
X: x,
Y: y,
Width: w,
Height: h,
Confidence: fc.Score,
ClassName: "face",
})
}
return pb.DetectResponse{Detections: dets}, nil
}
// FaceVerify embeds the primary face in each image and reports whether they are
// the same identity by cosine distance against a threshold. A request threshold
// <= 0 falls back to the model-configured default (verify_threshold option,
// 0.35 if unset). When anti_spoofing is set, the C-API applies a MiniFASNet
// veto internally (verified forced false on a spoof); the per-image liveness
// scores are not exposed by the verify entry point, so img*_is_real /
// img*_antispoof_score stay at their zero values.
func (f *FaceDetect) FaceVerify(req *pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
if f.ctxPtr == 0 {
return pb.FaceVerifyResponse{}, errors.New("face-detect: model not loaded")
}
if req.Img1 == "" || req.Img2 == "" {
return pb.FaceVerifyResponse{}, errors.New("face-detect: img1 and img2 are required")
}
path1, cleanup1, err := materializeImage(req.Img1)
if err != nil {
return pb.FaceVerifyResponse{}, err
}
defer cleanup1()
path2, cleanup2, err := materializeImage(req.Img2)
if err != nil {
return pb.FaceVerifyResponse{}, err
}
defer cleanup2()
threshold := req.Threshold
if threshold <= 0 {
threshold = f.opts.verifyThreshold
}
antiSpoof := int32(0)
if req.AntiSpoofing {
antiSpoof = 1
}
started := time.Now()
var distance float32
var verified int32
rc := CppVerifyPaths(f.ctxPtr, path1, path2, threshold, antiSpoof,
unsafe.Pointer(&distance), unsafe.Pointer(&verified))
if rc != 0 {
return pb.FaceVerifyResponse{}, f.lastErr("verify", req.Img1[:min(8, len(req.Img1))]+"...")
}
elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
// matching the Python face backend's reporting.
confidence := float32(0)
if threshold > 0 {
confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
}
return pb.FaceVerifyResponse{
Verified: verified != 0,
Distance: distance,
Threshold: threshold,
Confidence: confidence,
Model: f.opts.modelName,
Img1Area: f.bestArea(path1),
Img2Area: f.bestArea(path2),
ProcessingTimeMs: elapsedMs,
}, nil
}
// FaceAnalyze runs the genderage head on every detected face. The C-API returns
// "M"/"F" gender labels and a rounded age; the labels are normalized to the
// "Man"/"Woman" values the proto documents.
func (f *FaceDetect) FaceAnalyze(req *pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) {
if f.ctxPtr == 0 {
return pb.FaceAnalyzeResponse{}, errors.New("face-detect: model not loaded")
}
if req.Img == "" {
return pb.FaceAnalyzeResponse{}, errors.New("face-detect: img is required")
}
path, cleanup, err := materializeImage(req.Img)
if err != nil {
return pb.FaceAnalyzeResponse{}, err
}
defer cleanup()
ptr := CppAnalyzeJSON(f.ctxPtr, path)
if ptr == 0 {
return pb.FaceAnalyzeResponse{}, f.lastErr("analyze", path)
}
defer CppFreeString(ptr)
faces, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
if err != nil {
return pb.FaceAnalyzeResponse{}, fmt.Errorf("face-detect: analyze JSON: %w", err)
}
return pb.FaceAnalyzeResponse{Faces: faces}, nil
}
// faceBox is one entry of the detect/analyze JSON documents the engine emits.
type faceBox struct {
Score float32 `json:"score"`
Box []float32 `json:"box"`
Age float32 `json:"age"`
Gender string `json:"gender"`
}
// xywh converts the engine's [x1,y1,x2,y2] box into the x/y/width/height the
// proto carries. A short or missing box yields zeros.
func (b faceBox) xywh() (x, y, w, h float32) {
if len(b.Box) < 4 {
return 0, 0, 0, 0
}
return b.Box[0], b.Box[1], b.Box[2] - b.Box[0], b.Box[3] - b.Box[1]
}
type facesJSON struct {
Faces []faceBox `json:"faces"`
}
func (f *FaceDetect) detectFaces(path string) ([]faceBox, error) {
ptr := CppDetectJSON(f.ctxPtr, path)
if ptr == 0 {
return nil, f.lastErr("detect", path)
}
defer CppFreeString(ptr)
var doc facesJSON
if err := json.Unmarshal([]byte(goStringFromCPtr(ptr)), &doc); err != nil {
return nil, fmt.Errorf("face-detect: detect JSON: %w", err)
}
return doc.Faces, nil
}
// bestArea returns the FacialArea of the highest-scoring face in an image, or an
// empty area when detection fails or finds nothing. Best-effort: verify already
// succeeded, so a missing region must not turn a valid match into an error.
func (f *FaceDetect) bestArea(path string) *pb.FacialArea {
faces, err := f.detectFaces(path)
if err != nil || len(faces) == 0 {
return &pb.FacialArea{}
}
best := faces[0]
for _, fc := range faces[1:] {
if fc.Score > best.Score {
best = fc
}
}
x, y, w, h := best.xywh()
return &pb.FacialArea{X: x, Y: y, W: w, H: h}
}
// parseAnalyzeJSON maps the engine's analyze document onto FaceAnalysis entries.
// The engine reports gender as "M"/"F"; both the dominant label and the score
// map are filled with the "Man"/"Woman" form the proto documents.
func parseAnalyzeJSON(doc string) ([]*pb.FaceAnalysis, error) {
var parsed facesJSON
if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
return nil, err
}
out := make([]*pb.FaceAnalysis, 0, len(parsed.Faces))
for _, fc := range parsed.Faces {
x, y, w, h := fc.xywh()
fa := &pb.FaceAnalysis{
Region: &pb.FacialArea{X: x, Y: y, W: w, H: h},
FaceConfidence: fc.Score,
Age: fc.Age,
}
if label := normalizeGender(fc.Gender); label != "" {
fa.DominantGender = label
fa.Gender = map[string]float32{label: 1.0}
}
out = append(out, fa)
}
return out, nil
}
// normalizeGender maps the engine's "M"/"F" code to the "Man"/"Woman" labels the
// proto documents. Unknown codes pass through unchanged.
func normalizeGender(g string) string {
switch strings.ToUpper(strings.TrimSpace(g)) {
case "M":
return "Man"
case "F":
return "Woman"
case "":
return ""
default:
return g
}
}
// materializeImage decodes a base64 image payload into a temp file and returns
// its path plus a cleanup func. As a convenience for callers that already pass a
// filesystem path (e.g. a test fixture), an existing path is used as-is with a
// no-op cleanup. data: URI prefixes are stripped before decoding.
func materializeImage(src string) (path string, cleanup func(), err error) {
noop := func() {}
if src == "" {
return "", noop, errors.New("face-detect: empty image input")
}
if _, statErr := os.Stat(src); statErr == nil {
return src, noop, nil
}
payload := src
if i := strings.Index(payload, ","); strings.HasPrefix(payload, "data:") && i >= 0 {
payload = payload[i+1:]
}
data, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(payload))
if decErr != nil || len(data) == 0 {
return "", noop, errors.New("face-detect: image is neither an existing path nor valid base64")
}
tmp, createErr := os.CreateTemp("", "face-detect-*.img")
if createErr != nil {
return "", noop, fmt.Errorf("face-detect: create temp image: %w", createErr)
}
cleanup = func() { _ = os.Remove(tmp.Name()) }
if _, wErr := tmp.Write(data); wErr != nil {
_ = tmp.Close()
cleanup()
return "", noop, fmt.Errorf("face-detect: write temp image: %w", wErr)
}
if cErr := tmp.Close(); cErr != nil {
cleanup()
return "", noop, fmt.Errorf("face-detect: close temp image: %w", cErr)
}
return tmp.Name(), cleanup, nil
}
// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
func (f *FaceDetect) lastErr(op, subject string) error {
msg := strings.TrimSpace(CppLastError(f.ctxPtr))
if msg == "" {
msg = "no error detail"
}
return fmt.Errorf("face-detect: %s failed for %q: %s", op, subject, msg)
}
// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
//
// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
// moves the buffer and we dereference it immediately to copy the bytes out.
func goStringFromCPtr(cptr uintptr) string {
if cptr == 0 {
return ""
}
p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
n := 0
for *(*byte)(unsafe.Add(p, n)) != 0 {
n++
}
return string(unsafe.Slice((*byte)(p), n))
}

View File

@@ -0,0 +1,230 @@
package main
import (
"encoding/base64"
"os"
"sync"
"testing"
"github.com/ebitengine/purego"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
func TestFaceDetect(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "face-detect Backend Suite")
}
var (
libLoadOnce sync.Once
libLoadErr error
)
// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
// bridge without spinning up the gRPC server. Records the error (the smoke
// specs skip themselves) when libfacedetect.so is not loadable from cwd
// (LD_LIBRARY_PATH or a symlink in ./).
func ensureLibLoaded() error {
libLoadOnce.Do(func() {
libName := os.Getenv("FACEDETECT_LIBRARY")
if libName == "" {
libName = "libfacedetect.so"
}
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
libLoadErr = err
return
}
purego.RegisterLibFunc(&CppAbiVersion, lib, "facedetect_capi_abi_version")
purego.RegisterLibFunc(&CppLoad, lib, "facedetect_capi_load")
purego.RegisterLibFunc(&CppFree, lib, "facedetect_capi_free")
purego.RegisterLibFunc(&CppLastError, lib, "facedetect_capi_last_error")
purego.RegisterLibFunc(&CppFreeString, lib, "facedetect_capi_free_string")
purego.RegisterLibFunc(&CppFreeVec, lib, "facedetect_capi_free_vec")
purego.RegisterLibFunc(&CppEmbedPath, lib, "facedetect_capi_embed_path")
purego.RegisterLibFunc(&CppEmbedRGB, lib, "facedetect_capi_embed_rgb")
purego.RegisterLibFunc(&CppDetectJSON, lib, "facedetect_capi_detect_path_json")
purego.RegisterLibFunc(&CppVerifyPaths, lib, "facedetect_capi_verify_paths")
purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "facedetect_capi_analyze_path_json")
})
return libLoadErr
}
var _ = Describe("parseOptions", func() {
It("defaults verify_threshold to 0.35", func() {
o := parseOptions(nil)
Expect(o.verifyThreshold).To(Equal(float32(0.35)))
Expect(o.modelName).To(Equal(""))
})
It("parses verify_threshold, threshold alias and model_name", func() {
o := parseOptions([]string{"verify_threshold:0.4", "model_name:buffalo_l", "unknown:x"})
Expect(o.verifyThreshold).To(Equal(float32(0.4)))
Expect(o.modelName).To(Equal("buffalo_l"))
o2 := parseOptions([]string{"threshold:0.3"})
Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
})
It("ignores non-positive thresholds and keeps the default", func() {
o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
Expect(o.verifyThreshold).To(Equal(float32(0.35)))
})
})
var _ = Describe("normalizeGender", func() {
It("maps M/F codes to Man/Woman", func() {
Expect(normalizeGender("M")).To(Equal("Man"))
Expect(normalizeGender("f")).To(Equal("Woman"))
Expect(normalizeGender(" m ")).To(Equal("Man"))
})
It("passes empty and unknown codes through", func() {
Expect(normalizeGender("")).To(Equal(""))
Expect(normalizeGender("nonbinary")).To(Equal("nonbinary"))
})
})
var _ = Describe("faceBox.xywh", func() {
It("converts an [x1,y1,x2,y2] box to x/y/width/height", func() {
b := faceBox{Box: []float32{10, 20, 50, 80}}
x, y, w, h := b.xywh()
Expect(x).To(Equal(float32(10)))
Expect(y).To(Equal(float32(20)))
Expect(w).To(Equal(float32(40)))
Expect(h).To(Equal(float32(60)))
})
It("returns zeros for a short box", func() {
x, y, w, h := faceBox{Box: []float32{1, 2}}.xywh()
Expect([]float32{x, y, w, h}).To(Equal([]float32{0, 0, 0, 0}))
})
})
var _ = Describe("parseAnalyzeJSON", func() {
It("maps region, age and gender for each face", func() {
doc := `{"faces":[
{"score":0.997,"box":[10,20,50,80],"age":31,"gender":"M"},
{"score":0.81,"box":[0,0,40,40],"age":24,"gender":"F"}]}`
faces, err := parseAnalyzeJSON(doc)
Expect(err).ToNot(HaveOccurred())
Expect(faces).To(HaveLen(2))
Expect(faces[0].FaceConfidence).To(BeNumerically("~", 0.997, 1e-4))
Expect(faces[0].Age).To(BeNumerically("~", 31, 1e-4))
Expect(faces[0].DominantGender).To(Equal("Man"))
Expect(faces[0].Gender).To(HaveKeyWithValue("Man", float32(1.0)))
Expect(faces[0].Region.W).To(Equal(float32(40)))
Expect(faces[0].Region.H).To(Equal(float32(60)))
Expect(faces[1].DominantGender).To(Equal("Woman"))
})
It("tolerates a missing gender field", func() {
faces, err := parseAnalyzeJSON(`{"faces":[{"score":0.5,"box":[0,0,10,10],"age":40}]}`)
Expect(err).ToNot(HaveOccurred())
Expect(faces).To(HaveLen(1))
Expect(faces[0].DominantGender).To(Equal(""))
Expect(faces[0].Gender).To(BeEmpty())
})
It("returns no faces for an empty document", func() {
faces, err := parseAnalyzeJSON(`{"faces":[]}`)
Expect(err).ToNot(HaveOccurred())
Expect(faces).To(BeEmpty())
})
It("returns an error on malformed JSON", func() {
_, err := parseAnalyzeJSON(`{not-json`)
Expect(err).To(HaveOccurred())
})
})
var _ = Describe("materializeImage", func() {
It("decodes a base64 payload to a temp file", func() {
payload := base64.StdEncoding.EncodeToString([]byte("\xff\xd8\xff\xe0fake-jpeg"))
path, cleanup, err := materializeImage(payload)
Expect(err).ToNot(HaveOccurred())
defer cleanup()
data, rerr := os.ReadFile(path)
Expect(rerr).ToNot(HaveOccurred())
Expect(data).To(Equal([]byte("\xff\xd8\xff\xe0fake-jpeg")))
})
It("strips a data: URI prefix before decoding", func() {
payload := "data:image/png;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
path, cleanup, err := materializeImage(payload)
Expect(err).ToNot(HaveOccurred())
defer cleanup()
data, rerr := os.ReadFile(path)
Expect(rerr).ToNot(HaveOccurred())
Expect(data).To(Equal([]byte("hello")))
})
It("uses an existing path as-is", func() {
tmp, err := os.CreateTemp("", "face-detect-fixture-*.bin")
Expect(err).ToNot(HaveOccurred())
defer func() { _ = os.Remove(tmp.Name()) }()
Expect(tmp.Close()).To(Succeed())
path, cleanup, err := materializeImage(tmp.Name())
Expect(err).ToNot(HaveOccurred())
defer cleanup()
Expect(path).To(Equal(tmp.Name()))
})
It("errors on input that is neither a path nor base64", func() {
_, _, err := materializeImage("not base64!!!")
Expect(err).To(HaveOccurred())
})
})
// The specs below exercise the real C-API end to end. They run only when both a
// model GGUF and a test image are provided, and skip cleanly otherwise so the
// suite stays green without large assets.
var _ = Describe("FaceDetect end-to-end", Ordered, func() {
var (
f *FaceDetect
modelPath = os.Getenv("FACEDETECT_BACKEND_TEST_MODEL")
imagePath = os.Getenv("FACEDETECT_BACKEND_TEST_IMAGE")
)
BeforeAll(func() {
if modelPath == "" || imagePath == "" {
Skip("set FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE to run the e2e specs")
}
if err := ensureLibLoaded(); err != nil {
Skip("libfacedetect.so not loadable: " + err.Error())
}
f = &FaceDetect{}
Expect(f.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
})
It("embeds the primary face in an image", func() {
emb, err := f.Embeddings(&pb.PredictOptions{Images: []string{imagePath}})
Expect(err).ToNot(HaveOccurred())
Expect(emb).ToNot(BeEmpty())
})
It("detects at least one face", func() {
resp, err := f.Detect(&pb.DetectOptions{Src: imagePath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Detections).ToNot(BeEmpty())
Expect(resp.Detections[0].ClassName).To(Equal("face"))
})
It("verifies an image against itself as the same identity", func() {
resp, err := f.FaceVerify(&pb.FaceVerifyRequest{Img1: imagePath, Img2: imagePath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Verified).To(BeTrue())
Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
})
It("analyzes age/gender for each face", func() {
resp, err := f.FaceAnalyze(&pb.FaceAnalyzeRequest{Img: imagePath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Faces).ToNot(BeEmpty())
})
})

View File

@@ -0,0 +1,65 @@
package main
// Started internally by LocalAI - one gRPC server per loaded model.
//
// Loads libfacedetect.so via purego and registers the flat C-API entry points
// declared in facedetect_capi.h. The library name can be overridden with
// FACEDETECT_LIBRARY (mirrors the VOICEDETECT_LIBRARY / PARAKEET_LIBRARY
// convention in the sibling backends); the default looks for the .so next to
// this binary (resolved via LD_LIBRARY_PATH by run.sh).
import (
"flag"
"fmt"
"os"
"github.com/ebitengine/purego"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
type LibFuncs struct {
FuncPtr any
Name string
}
func main() {
libName := os.Getenv("FACEDETECT_LIBRARY")
if libName == "" {
libName = "libfacedetect.so"
}
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
panic(fmt.Errorf("face-detect: dlopen %q: %w", libName, err))
}
// Bound 1:1 to facedetect_capi.h. char*/float* returns are registered as
// uintptr so the raw pointer can be freed via the matching capi free fn.
libFuncs := []LibFuncs{
{&CppAbiVersion, "facedetect_capi_abi_version"},
{&CppLoad, "facedetect_capi_load"},
{&CppFree, "facedetect_capi_free"},
{&CppLastError, "facedetect_capi_last_error"},
{&CppFreeString, "facedetect_capi_free_string"},
{&CppFreeVec, "facedetect_capi_free_vec"},
{&CppEmbedPath, "facedetect_capi_embed_path"},
{&CppEmbedRGB, "facedetect_capi_embed_rgb"},
{&CppDetectJSON, "facedetect_capi_detect_path_json"},
{&CppVerifyPaths, "facedetect_capi_verify_paths"},
{&CppAnalyzeJSON, "facedetect_capi_analyze_path_json"},
}
for _, lf := range libFuncs {
purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
}
fmt.Fprintf(os.Stderr, "[face-detect] ABI=%d\n", CppAbiVersion())
flag.Parse()
if err := grpc.StartServer(*addr, &FaceDetect{}); err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,47 @@
package main
import (
"strconv"
"strings"
)
// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
// not set one. Matches the insightface buffalo_l ArcFace R50 default the Python
// face backend ships with so the two implementations agree on verdicts out of
// the box.
const defaultVerifyThreshold float32 = 0.35
// loadOptions holds the parsed model-level options for face-detect.
type loadOptions struct {
verifyThreshold float32
modelName string
}
func splitOption(o string) (key, value string, ok bool) {
i := strings.Index(o, ":")
if i < 0 {
return "", "", false
}
return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
}
// parseOptions reads the backend "key:value" option slice. Unknown keys are
// ignored. Defaults: verify_threshold 0.35, model_name derived from the file.
func parseOptions(opts []string) loadOptions {
o := loadOptions{verifyThreshold: defaultVerifyThreshold}
for _, oo := range opts {
key, value, ok := splitOption(oo)
if !ok {
continue
}
switch key {
case "verify_threshold", "threshold":
if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
o.verifyThreshold = float32(f)
}
case "model_name":
o.modelName = value
}
}
return o
}

View File

@@ -0,0 +1,68 @@
#!/bin/bash
#
# Bundle the face-detect-grpc binary, libfacedetect.so, the core runtime libs
# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
# so the package is self-contained. Mirrors backend/go/voice-detect/package.sh;
# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
# is used instead of the host's.
set -e
CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."
mkdir -p "$CURDIR/package/lib"
cp -avf "$CURDIR/face-detect-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
# libfacedetect.so + any soname symlinks. purego.Dlopen resolves it via
# LD_LIBRARY_PATH, which run.sh points at lib/.
cp -avf "$CURDIR"/libfacedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
echo "ERROR: libfacedetect.so not found in $CURDIR, run 'make' first" >&2
exit 1
}
# Detect architecture and copy the core runtime libs libfacedetect.so links
# against, plus the matching dynamic loader as lib/ld.so.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
echo "Detected x86_64 architecture, copying x86_64 libraries..."
cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
echo "Detected ARM64 architecture, copying ARM64 libraries..."
cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
elif [ "$(uname -s)" = "Darwin" ]; then
echo "Detected Darwin"
else
echo "Error: Could not detect architecture"
exit 1
fi
# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
# BUILD_TYPE so the backend can reach the GPU without the runtime base image
# shipping those drivers.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
package_gpu_libs
fi
echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"

View File

@@ -0,0 +1,16 @@
#!/bin/bash
set -e
CURDIR=$(dirname "$(realpath "$0")")
export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
# If a self-contained ld.so was packaged, route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the voice-detect /
# whisper / parakeet backends' runtime layout).
if [ -f "$CURDIR/lib/ld.so" ]; then
echo "Using lib/ld.so"
exec "$CURDIR/lib/ld.so" "$CURDIR/face-detect-grpc" "$@"
fi
exec "$CURDIR/face-detect-grpc" "$@"

View File

@@ -0,0 +1,15 @@
#!/bin/bash
set -e
CURDIR=$(dirname "$(realpath "$0")")
cd "$CURDIR"
echo "Running face-detect backend tests..."
# The pure-Go parsing specs always run. The embed/detect/verify/analyze smoke
# specs run only when a model + image are provided via
# FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE; otherwise they
# auto-skip.
LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
echo "face-detect tests completed."

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=8caa3f908ae6d4a4bef531e73b9a969f266a3d1f
STABLEDIFFUSION_GGML_VERSION?=9956436c925a367daeab097598b1ea1f32d3503f
CMAKE_ARGS+=-DGGML_MAX_NAME=128

18
backend/go/voice-detect/.gitignore vendored Normal file
View File

@@ -0,0 +1,18 @@
# Fetched upstream sources
sources/
# CMake build directories
build*/
# build artifacts staged in-tree by the Makefile (cp from sources/) or
# symlinked for local dev; the real sources live in voice-detect.cpp upstream.
*.so
*.so.*
voicedetect_capi.h
compile_commands.json
# Compiled backend binary
voice-detect-grpc
# Packaging output
package/

View File

@@ -0,0 +1,107 @@
# voice-detect backend Makefile.
#
# Upstream pin lives below as VOICEDETECT_VERSION?=3d51077... (.github/bump_deps.sh
# can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
#
# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
# symlink the .so + header into this directory and skip the clone/cmake steps:
#
# ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
# ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
# go build -o voice-detect-grpc .
#
# The default target below does the proper clone-at-pin + cmake build so CI does
# not need a side-checkout.
VOICEDETECT_VERSION?=3d510772357538c5182808ac7de2278b84824e24
VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
GOCMD?=go
GO_TAGS?=
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
BUILD_TYPE?=
NATIVE?=false
# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
# Build ggml statically into libvoicedetect.so (PIC) so the shared lib is
# self-contained: dlopen needs no libggml*.so alongside it, only system libs
# (libstdc++/libgomp/libc) that the runtime image already provides.
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif
# voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and
# does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
# -DGGML_CUDA=ON is overwritten back to OFF. Forward the VOICEDETECT_GGML_*
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, reaches
# torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
ifeq ($(CUDA_MAJOR_VERSION),13)
ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
endif
endif
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
else ifeq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DVOICEDETECT_GGML_METAL=ON
endif
.PHONY: voice-detect-grpc package build clean purge test all
all: voice-detect-grpc
# Clone the upstream voice-detect.cpp source at the pinned commit. Directory acts
# as the target so make only re-clones when missing. After a VOICEDETECT_VERSION
# bump, run 'make purge && make' to refetch.
sources/voice-detect.cpp:
mkdir -p sources/voice-detect.cpp
cd sources/voice-detect.cpp && \
git init -q && \
git remote add origin $(VOICEDETECT_REPO) && \
git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
git checkout FETCH_HEAD && \
git submodule update --init --recursive --depth 1 --single-branch
# Build the shared lib + header out-of-tree, then stage them next to the Go
# sources so purego.Dlopen("libvoicedetect.so") and the cgo-less build both pick
# them up.
libvoicedetect.so: sources/voice-detect.cpp
cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./ 2>/dev/null || true
cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./
voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voice-detect-grpc .
package: voice-detect-grpc
bash package.sh
build: package
# Test target. The embed/verify/analyze smoke specs are gated on
# VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the
# heavy specs auto-skip and only the pure-Go parsing specs run.
test:
LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
clean: purge
rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc
purge:
rm -rf sources/voice-detect.cpp

View File

@@ -0,0 +1,273 @@
package main
import (
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"time"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/xlog"
)
// purego-bound entry points from libvoicedetect.so. Names match
// voicedetect_capi.h exactly so a `nm libvoicedetect.so | grep voicedetect_capi`
// is enough to spot drift.
//
// The opaque ctx and the malloc'd char*/float* return values are declared as
// uintptr so we get the raw pointer back and can release it via the matching
// capi free function. purego's native string/[]float32 returns would copy and
// forget the original pointer, leaking the C-owned buffer on every call.
var (
CppAbiVersion func() int32
CppLoad func(ggufPath string) uintptr
CppFree func(ctx uintptr)
CppLastError func(ctx uintptr) string
CppFreeString func(s uintptr)
CppFreeVec func(v uintptr)
CppEmbedPath func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
CppEmbedPCM func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr
)
// VoiceDetect implements the speaker-recognition voice subset of the Backend
// gRPC service over libvoicedetect.so. The C side keeps a single loaded model
// plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
// serializes every call.
type VoiceDetect struct {
base.SingleThread
opts loadOptions
ctxPtr uintptr
}
func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
model := opts.ModelFile
if model == "" {
model = opts.ModelPath
}
if !filepath.IsAbs(model) && opts.ModelPath != "" {
model = filepath.Join(opts.ModelPath, model)
}
if model == "" {
return errors.New("voice-detect: ModelFile is required")
}
v.opts = parseOptions(opts.Options)
if v.opts.modelName == "" {
v.opts.modelName = filepath.Base(model)
}
// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
// one backend process per model and serves requests concurrently, so the
// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
// VOICEDETECT_THREADS is read by the engine at backend construction, so it
// must be set before the capi load. A non-positive Threads means "unset":
// leave the env alone so the engine keeps its sane default.
threads := opts.Threads
if threads > 0 {
if err := os.Setenv("VOICEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
return fmt.Errorf("voice-detect: set VOICEDETECT_THREADS: %w", err)
}
xlog.Info("voice-detect: applying LocalAI thread budget", "threads", threads)
}
xlog.Info("voice-detect: loading model", "model", model,
"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())
ctx := CppLoad(model)
if ctx == 0 {
// The last-error buffer lives on the ctx that was never returned, so
// surface the path the operator tried to load instead.
return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
}
v.ctxPtr = ctx
return nil
}
// VoiceEmbed returns the L2-normalized speaker embedding for an audio clip.
// The request carries a filesystem PATH; the HTTP layer materializes
// base64/URL/data-URI inputs to a temp file before the gRPC call.
func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
if v.ctxPtr == 0 {
return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
}
if req.Audio == "" {
return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
}
emb, err := v.embedPath(req.Audio)
if err != nil {
return pb.VoiceEmbedResponse{}, err
}
return pb.VoiceEmbedResponse{Embedding: emb, Model: v.opts.modelName}, nil
}
func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
var vec uintptr
var dim int32
rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
if rc != 0 || vec == 0 || dim <= 0 {
return nil, v.lastErr("embed", path)
}
defer CppFreeVec(vec)
// Copy out of the C-owned malloc'd buffer before freeing it. The
// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
// nor moves this buffer and we copy immediately.
src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
out := make([]float32, int(dim))
copy(out, src)
return out, nil
}
// VoiceVerify embeds two clips and reports whether they are the same speaker by
// cosine distance against a threshold. A request threshold <= 0 falls back to
// the model-configured default (verify_threshold option, 0.25 if unset).
func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
if v.ctxPtr == 0 {
return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
}
if req.Audio1 == "" || req.Audio2 == "" {
return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
}
threshold := req.Threshold
if threshold <= 0 {
threshold = v.opts.verifyThreshold
}
started := time.Now()
var distance float32
var verified int32
rc := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, threshold,
unsafe.Pointer(&distance), unsafe.Pointer(&verified))
if rc != 0 {
return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
}
elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
// matching the Python speaker-recognition backend's reporting.
confidence := float32(0)
if threshold > 0 {
confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
}
return pb.VoiceVerifyResponse{
Verified: verified != 0,
Distance: distance,
Threshold: threshold,
Confidence: confidence,
Model: v.opts.modelName,
ProcessingTimeMs: elapsedMs,
}, nil
}
// VoiceAnalyze runs the age/gender/emotion heads on a single clip. The C-API
// always evaluates every supported head, so the request's actions filter is
// advisory and the full analysis is returned as a single segment (the engine
// does not produce time-bounded segments).
func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
if v.ctxPtr == 0 {
return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
}
if req.Audio == "" {
return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
}
ptr := CppAnalyzeJSON(v.ctxPtr, req.Audio)
if ptr == 0 {
return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
}
defer CppFreeString(ptr)
seg, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
if err != nil {
return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, err)
}
return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{seg}}, nil
}
// analyzeJSON mirrors the document returned by voicedetect_capi_analyze_path_json:
//
// {"age":42.0,
// "gender":{"label":"female","female":0.88,"male":0.12},
// "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
//
// gender is a mixed object (a "label" string plus per-class float scores), so
// it is decoded into raw messages and split in parseAnalyzeJSON.
type analyzeJSON struct {
Age float32 `json:"age"`
Gender map[string]json.RawMessage `json:"gender"`
Emotion struct {
Label string `json:"label"`
Scores map[string]float32 `json:"scores"`
} `json:"emotion"`
}
// parseAnalyzeJSON maps the engine's analyze document onto a VoiceAnalysis.
// start/end stay 0: the model emits a single whole-utterance result, not
// time-bounded segments.
func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
var a analyzeJSON
if err := json.Unmarshal([]byte(doc), &a); err != nil {
return nil, err
}
seg := &pb.VoiceAnalysis{
Age: a.Age,
DominantEmotion: a.Emotion.Label,
Emotion: a.Emotion.Scores,
}
if len(a.Gender) > 0 {
gender := make(map[string]float32, len(a.Gender))
for k, raw := range a.Gender {
if k == "label" {
_ = json.Unmarshal(raw, &seg.DominantGender)
continue
}
var score float32
if err := json.Unmarshal(raw, &score); err == nil {
gender[k] = score
}
}
seg.Gender = gender
}
return seg, nil
}
// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
func (v *VoiceDetect) lastErr(op, subject string) error {
msg := strings.TrimSpace(CppLastError(v.ctxPtr))
if msg == "" {
msg = "no error detail"
}
return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, msg)
}
// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
//
// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
// moves the buffer and we dereference it immediately to copy the bytes out.
func goStringFromCPtr(cptr uintptr) string {
if cptr == 0 {
return ""
}
p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
n := 0
for *(*byte)(unsafe.Add(p, n)) != 0 {
n++
}
return string(unsafe.Slice((*byte)(p), n))
}

View File

@@ -0,0 +1,144 @@
package main
import (
"os"
"sync"
"testing"
"github.com/ebitengine/purego"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
func TestVoiceDetect(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "voice-detect Backend Suite")
}
var (
libLoadOnce sync.Once
libLoadErr error
)
// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
// bridge without spinning up the gRPC server. Records the error (the smoke
// specs skip themselves) when libvoicedetect.so is not loadable from cwd
// (LD_LIBRARY_PATH or a symlink in ./).
func ensureLibLoaded() error {
libLoadOnce.Do(func() {
libName := os.Getenv("VOICEDETECT_LIBRARY")
if libName == "" {
libName = "libvoicedetect.so"
}
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
libLoadErr = err
return
}
purego.RegisterLibFunc(&CppAbiVersion, lib, "voicedetect_capi_abi_version")
purego.RegisterLibFunc(&CppLoad, lib, "voicedetect_capi_load")
purego.RegisterLibFunc(&CppFree, lib, "voicedetect_capi_free")
purego.RegisterLibFunc(&CppLastError, lib, "voicedetect_capi_last_error")
purego.RegisterLibFunc(&CppFreeString, lib, "voicedetect_capi_free_string")
purego.RegisterLibFunc(&CppFreeVec, lib, "voicedetect_capi_free_vec")
purego.RegisterLibFunc(&CppEmbedPath, lib, "voicedetect_capi_embed_path")
purego.RegisterLibFunc(&CppEmbedPCM, lib, "voicedetect_capi_embed_pcm")
purego.RegisterLibFunc(&CppVerifyPaths, lib, "voicedetect_capi_verify_paths")
purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "voicedetect_capi_analyze_path_json")
})
return libLoadErr
}
var _ = Describe("parseOptions", func() {
It("defaults verify_threshold to 0.25", func() {
o := parseOptions(nil)
Expect(o.verifyThreshold).To(Equal(float32(0.25)))
Expect(o.modelName).To(Equal(""))
})
It("parses verify_threshold, threshold alias and model_name", func() {
o := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
Expect(o.verifyThreshold).To(Equal(float32(0.4)))
Expect(o.modelName).To(Equal("ecapa"))
o2 := parseOptions([]string{"threshold:0.3"})
Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
})
It("ignores non-positive thresholds and keeps the default", func() {
o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
Expect(o.verifyThreshold).To(Equal(float32(0.25)))
})
})
var _ = Describe("parseAnalyzeJSON", func() {
It("maps age, gender label+scores and emotion label+scores", func() {
doc := `{"age":42.0,
"gender":{"label":"female","female":0.88,"male":0.12},
"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
seg, err := parseAnalyzeJSON(doc)
Expect(err).ToNot(HaveOccurred())
Expect(seg.Age).To(BeNumerically("~", 42.0, 1e-4))
Expect(seg.Start).To(Equal(float32(0)))
Expect(seg.End).To(Equal(float32(0)))
Expect(seg.DominantGender).To(Equal("female"))
Expect(seg.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
Expect(seg.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
// The "label" entry is consumed into DominantGender, not the score map.
Expect(seg.Gender).ToNot(HaveKey("label"))
Expect(seg.DominantEmotion).To(Equal("neutral"))
Expect(seg.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
Expect(seg.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
})
It("tolerates a missing gender block", func() {
seg, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
Expect(err).ToNot(HaveOccurred())
Expect(seg.DominantGender).To(Equal(""))
Expect(seg.DominantEmotion).To(Equal("happy"))
})
It("returns an error on malformed JSON", func() {
_, err := parseAnalyzeJSON(`{not-json`)
Expect(err).To(HaveOccurred())
})
})
// The specs below exercise the real C-API end to end. They run only when both a
// model GGUF and a test WAV are provided, and skip cleanly otherwise so the
// suite stays green without large assets.
var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
var (
v *VoiceDetect
modelPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
wavPath = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
)
BeforeAll(func() {
if modelPath == "" || wavPath == "" {
Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
}
if err := ensureLibLoaded(); err != nil {
Skip("libvoicedetect.so not loadable: " + err.Error())
}
v = &VoiceDetect{}
Expect(v.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
})
It("embeds an audio clip", func() {
resp, err := v.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: wavPath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Embedding).ToNot(BeEmpty())
Expect(resp.Model).ToNot(BeEmpty())
})
It("verifies a clip against itself as the same speaker", func() {
resp, err := v.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: wavPath, Audio2: wavPath})
Expect(err).ToNot(HaveOccurred())
Expect(resp.Verified).To(BeTrue())
Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
})
})

View File

@@ -0,0 +1,64 @@
package main
// Started internally by LocalAI - one gRPC server per loaded model.
//
// Loads libvoicedetect.so via purego and registers the flat C-API entry points
// declared in voicedetect_capi.h. The library name can be overridden with
// VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
// convention in the sibling backends); the default looks for the .so next to
// this binary (resolved via LD_LIBRARY_PATH by run.sh).
import (
"flag"
"fmt"
"os"
"github.com/ebitengine/purego"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
type LibFuncs struct {
FuncPtr any
Name string
}
func main() {
libName := os.Getenv("VOICEDETECT_LIBRARY")
if libName == "" {
libName = "libvoicedetect.so"
}
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
if err != nil {
panic(fmt.Errorf("voice-detect: dlopen %q: %w", libName, err))
}
// Bound 1:1 to voicedetect_capi.h. char*/float* returns are registered as
// uintptr so the raw pointer can be freed via the matching capi free fn.
libFuncs := []LibFuncs{
{&CppAbiVersion, "voicedetect_capi_abi_version"},
{&CppLoad, "voicedetect_capi_load"},
{&CppFree, "voicedetect_capi_free"},
{&CppLastError, "voicedetect_capi_last_error"},
{&CppFreeString, "voicedetect_capi_free_string"},
{&CppFreeVec, "voicedetect_capi_free_vec"},
{&CppEmbedPath, "voicedetect_capi_embed_path"},
{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
}
for _, lf := range libFuncs {
purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
}
fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
flag.Parse()
if err := grpc.StartServer(*addr, &VoiceDetect{}); err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,46 @@
package main
import (
"strconv"
"strings"
)
// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
// not set one. Matches the Python speaker-recognition backend's default so the
// two implementations agree on verdicts out of the box.
const defaultVerifyThreshold float32 = 0.25
// loadOptions holds the parsed model-level options for voice-detect.
type loadOptions struct {
verifyThreshold float32
modelName string
}
func splitOption(o string) (key, value string, ok bool) {
i := strings.Index(o, ":")
if i < 0 {
return "", "", false
}
return strings.TrimSpace(o[:i]), strings.TrimSpace(o[i+1:]), true
}
// parseOptions reads the backend "key:value" option slice. Unknown keys are
// ignored. Defaults: verify_threshold 0.25, model_name derived from the file.
func parseOptions(opts []string) loadOptions {
o := loadOptions{verifyThreshold: defaultVerifyThreshold}
for _, oo := range opts {
key, value, ok := splitOption(oo)
if !ok {
continue
}
switch key {
case "verify_threshold", "threshold":
if f, err := strconv.ParseFloat(value, 32); err == nil && f > 0 {
o.verifyThreshold = float32(f)
}
case "model_name":
o.modelName = value
}
}
return o
}

View File

@@ -0,0 +1,68 @@
#!/bin/bash
#
# Bundle the voice-detect-grpc binary, libvoicedetect.so, the core runtime libs
# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
# so the package is self-contained. Mirrors backend/go/parakeet-cpp/package.sh;
# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
# is used instead of the host's.
set -e
CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."
mkdir -p "$CURDIR/package/lib"
cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
# libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
# LD_LIBRARY_PATH, which run.sh points at lib/.
cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
exit 1
}
# Detect architecture and copy the core runtime libs libvoicedetect.so links
# against, plus the matching dynamic loader as lib/ld.so.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
echo "Detected x86_64 architecture, copying x86_64 libraries..."
cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
echo "Detected ARM64 architecture, copying ARM64 libraries..."
cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
elif [ "$(uname -s)" = "Darwin" ]; then
echo "Detected Darwin"
else
echo "Error: Could not detect architecture"
exit 1
fi
# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
# BUILD_TYPE so the backend can reach the GPU without the runtime base image
# shipping those drivers.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
package_gpu_libs
fi
echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"

16
backend/go/voice-detect/run.sh Executable file
View File

@@ -0,0 +1,16 @@
#!/bin/bash
set -e
CURDIR=$(dirname "$(realpath "$0")")
export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"
# If a self-contained ld.so was packaged, route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the whisper /
# parakeet backends' runtime layout).
if [ -f "$CURDIR/lib/ld.so" ]; then
echo "Using lib/ld.so"
exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@"
fi
exec "$CURDIR/voice-detect-grpc" "$@"

14
backend/go/voice-detect/test.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/bash
set -e
CURDIR=$(dirname "$(realpath "$0")")
cd "$CURDIR"
echo "Running voice-detect backend tests..."
# The pure-Go parsing specs always run. The embed/verify/analyze smoke specs run
# only when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and
# VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip.
LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
echo "voice-detect tests completed."

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
WHISPER_CPP_VERSION?=0ae02cdb2c7317b50991367c165736ce42ed96ac
SO_TARGET?=libgowhisper.so
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

View File

@@ -13,8 +13,14 @@ if [ "$(uname)" != "Darwin" ]; then
fi
if [ "$(uname)" = "Darwin" ]; then
# macOS: single dylib variant (Metal or Accelerate)
LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
# macOS: single fallback variant (Metal/Accelerate). The cmake build emits a
# Mach-O named .so, but tolerate .dylib too — pick whichever exists so the Go
# loader doesn't panic on a hardcoded name that isn't on disk.
if [ -e "$CURDIR/libgowhisper-fallback.dylib" ]; then
LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
else
LIBRARY="$CURDIR/libgowhisper-fallback.so"
fi
export DYLD_LIBRARY_PATH="$CURDIR"/lib:$DYLD_LIBRARY_PATH
else
LIBRARY="$CURDIR/libgowhisper-fallback.so"

View File

@@ -243,6 +243,78 @@
nvidia-cuda-12: "cuda12-ced"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
- &voicedetect
name: "voice-detect"
alias: "voice-detect"
license: mit
icon: https://avatars.githubusercontent.com/u/95302084
description: |
voice-detect speaker recognition and voice analysis.
voice-detect.cpp is a C++/ggml engine that produces L2-normalised
speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker
ERes2Net, CAM++) for voice verification and 1:N identification, plus
a wav2vec2 age / gender / emotion analysis head. It replaces the
Python speaker-recognition backend and is exposed through the Voice*
gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA
CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
urls:
- https://github.com/mudler/voice-detect.cpp
tags:
- voice-recognition
- speaker-verification
- speaker-embedding
- CPU
- GPU
- CUDA
- HIP
capabilities:
default: "cpu-voice-detect"
nvidia: "cuda12-voice-detect"
intel: "intel-sycl-f16-voice-detect"
metal: "metal-voice-detect"
amd: "rocm-voice-detect"
vulkan: "vulkan-voice-detect"
nvidia-l4t: "nvidia-l4t-arm64-voice-detect"
nvidia-cuda-13: "cuda13-voice-detect"
nvidia-cuda-12: "cuda12-voice-detect"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect"
- &facedetect
name: "face-detect"
alias: "face-detect"
license: mit
icon: https://avatars.githubusercontent.com/u/95302084
description: |
face-detect face detection, embedding, verification and analysis.
face-detect.cpp is a C++/ggml engine that runs SCRFD / YuNet face
detection and ArcFace / SFace 512-d (or 128-d) L2-normalised face
embeddings for verification and 1:N identification, plus a landmark /
age / gender analysis head. It replaces the Python insightface backend
and is exposed through the Embedding, Detect and Face* gRPC rpcs and
the /v1/face/* REST endpoints. It runs on CPU, NVIDIA CUDA, AMD
ROCm/HIP, Intel SYCL, Vulkan and NVIDIA Jetson (L4T) targets.
urls:
- https://github.com/mudler/face-detect.cpp
tags:
- face-recognition
- face-verification
- face-embedding
- CPU
- GPU
- CUDA
- HIP
capabilities:
default: "cpu-face-detect"
nvidia: "cuda12-face-detect"
intel: "intel-sycl-f16-face-detect"
metal: "metal-face-detect"
amd: "rocm-face-detect"
vulkan: "vulkan-face-detect"
nvidia-l4t: "nvidia-l4t-arm64-face-detect"
nvidia-cuda-13: "cuda13-face-detect"
nvidia-cuda-12: "cuda12-face-detect"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect"
- &voxtral
name: "voxtral"
alias: "voxtral"
@@ -1390,7 +1462,6 @@
intel: "intel-fish-speech"
amd: "rocm-fish-speech"
nvidia-l4t: "nvidia-l4t-fish-speech"
metal: "metal-fish-speech"
default: "cpu-fish-speech"
nvidia-cuda-13: "cuda13-fish-speech"
nvidia-cuda-12: "cuda12-fish-speech"
@@ -2890,6 +2961,236 @@
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-ced
## voice-detect
- !!merge <<: *voicedetect
name: "voice-detect-development"
capabilities:
default: "cpu-voice-detect-development"
nvidia: "cuda12-voice-detect-development"
intel: "intel-sycl-f16-voice-detect-development"
metal: "metal-voice-detect-development"
amd: "rocm-voice-detect-development"
vulkan: "vulkan-voice-detect-development"
nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development"
nvidia-cuda-13: "cuda13-voice-detect-development"
nvidia-cuda-12: "cuda12-voice-detect-development"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development"
- !!merge <<: *voicedetect
name: "nvidia-l4t-arm64-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect
- !!merge <<: *voicedetect
name: "nvidia-l4t-arm64-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect"
mirrors:
- localai/localai-backends:master-nvidia-l4t-arm64-voice-detect
- !!merge <<: *voicedetect
name: "cuda13-nvidia-l4t-arm64-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect
- !!merge <<: *voicedetect
name: "cuda13-nvidia-l4t-arm64-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect
- !!merge <<: *voicedetect
name: "cpu-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect"
mirrors:
- localai/localai-backends:latest-cpu-voice-detect
- !!merge <<: *voicedetect
name: "cpu-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect"
mirrors:
- localai/localai-backends:master-cpu-voice-detect
- !!merge <<: *voicedetect
name: "metal-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-voice-detect
- !!merge <<: *voicedetect
name: "metal-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-voice-detect
- !!merge <<: *voicedetect
name: "cuda12-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect
- !!merge <<: *voicedetect
name: "cuda12-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect
- !!merge <<: *voicedetect
name: "rocm-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect
- !!merge <<: *voicedetect
name: "rocm-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-rocm-hipblas-voice-detect
- !!merge <<: *voicedetect
name: "intel-sycl-f32-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect
- !!merge <<: *voicedetect
name: "intel-sycl-f32-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect
- !!merge <<: *voicedetect
name: "intel-sycl-f16-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect
- !!merge <<: *voicedetect
name: "intel-sycl-f16-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect
- !!merge <<: *voicedetect
name: "vulkan-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-voice-detect
- !!merge <<: *voicedetect
name: "vulkan-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-vulkan-voice-detect
- !!merge <<: *voicedetect
name: "cuda13-voice-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect
- !!merge <<: *voicedetect
name: "cuda13-voice-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect
## face-detect
- !!merge <<: *facedetect
name: "face-detect-development"
capabilities:
default: "cpu-face-detect-development"
nvidia: "cuda12-face-detect-development"
intel: "intel-sycl-f16-face-detect-development"
metal: "metal-face-detect-development"
amd: "rocm-face-detect-development"
vulkan: "vulkan-face-detect-development"
nvidia-l4t: "nvidia-l4t-arm64-face-detect-development"
nvidia-cuda-13: "cuda13-face-detect-development"
nvidia-cuda-12: "cuda12-face-detect-development"
nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect-development"
- !!merge <<: *facedetect
name: "nvidia-l4t-arm64-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-face-detect"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-arm64-face-detect
- !!merge <<: *facedetect
name: "nvidia-l4t-arm64-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-face-detect"
mirrors:
- localai/localai-backends:master-nvidia-l4t-arm64-face-detect
- !!merge <<: *facedetect
name: "cuda13-nvidia-l4t-arm64-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect
- !!merge <<: *facedetect
name: "cuda13-nvidia-l4t-arm64-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect
- !!merge <<: *facedetect
name: "cpu-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-face-detect"
mirrors:
- localai/localai-backends:latest-cpu-face-detect
- !!merge <<: *facedetect
name: "cpu-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-face-detect"
mirrors:
- localai/localai-backends:master-cpu-face-detect
- !!merge <<: *facedetect
name: "metal-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-face-detect"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-face-detect
- !!merge <<: *facedetect
name: "metal-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-face-detect"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-face-detect
- !!merge <<: *facedetect
name: "cuda12-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-12-face-detect
- !!merge <<: *facedetect
name: "cuda12-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-face-detect"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-12-face-detect
- !!merge <<: *facedetect
name: "rocm-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-rocm-hipblas-face-detect
- !!merge <<: *facedetect
name: "rocm-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-face-detect"
mirrors:
- localai/localai-backends:master-gpu-rocm-hipblas-face-detect
- !!merge <<: *facedetect
name: "intel-sycl-f32-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f32-face-detect
- !!merge <<: *facedetect
name: "intel-sycl-f32-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-face-detect"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f32-face-detect
- !!merge <<: *facedetect
name: "intel-sycl-f16-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-intel-sycl-f16-face-detect
- !!merge <<: *facedetect
name: "intel-sycl-f16-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-face-detect"
mirrors:
- localai/localai-backends:master-gpu-intel-sycl-f16-face-detect
- !!merge <<: *facedetect
name: "vulkan-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-vulkan-face-detect
- !!merge <<: *facedetect
name: "vulkan-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-face-detect"
mirrors:
- localai/localai-backends:master-gpu-vulkan-face-detect
- !!merge <<: *facedetect
name: "cuda13-face-detect"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-face-detect"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-face-detect
- !!merge <<: *facedetect
name: "cuda13-face-detect-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-face-detect"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-face-detect
## stablediffusion-ggml
- !!merge <<: *stablediffusionggml
name: "cpu-stablediffusion-ggml"
@@ -4932,7 +5233,6 @@
intel: "intel-fish-speech-development"
amd: "rocm-fish-speech-development"
nvidia-l4t: "nvidia-l4t-fish-speech-development"
metal: "metal-fish-speech-development"
default: "cpu-fish-speech-development"
nvidia-cuda-13: "cuda13-fish-speech-development"
nvidia-cuda-12: "cuda12-fish-speech-development"
@@ -5008,16 +5308,6 @@
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech
- !!merge <<: *fish-speech
name: "metal-fish-speech"
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-fish-speech"
mirrors:
- localai/localai-backends:latest-metal-darwin-arm64-fish-speech
- !!merge <<: *fish-speech
name: "metal-fish-speech-development"
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-fish-speech"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-fish-speech
## faster-qwen3-tts
- !!merge <<: *faster-qwen3-tts
name: "faster-qwen3-tts-development"

View File

@@ -1,2 +0,0 @@
torch
torchaudio