Merge origin/master + pin-sync paged backend to 0ed235ea

master auto-bumped the stock llama-cpp pin 9d5d882d -> 0ed235ea and updated the shared grpc-server.cpp. The paged backend's pin must track the stock pin (grpc-server.cpp is shared), so bump its LLAMA_VERSION to match. All 28 paged patches apply cleanly on 0ed235ea (verified against a fresh upstream clone). The bf16-tau state-serialization fix (patch 0026) is included. Verification of the bit-exact gate and the full grpc-server build on GPU/CI is to follow. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-29 19:06:43 -04:00 · 2026-06-28 07:56:47 +00:00
parent 1f3e5ba301 de2ec2f136
commit ea72a56e2c
95 changed files with 6339 additions and 487 deletions
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -137,7 +137,7 @@ RUN <<EOT bash
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
            apt-get install -y --no-install-recommends \
-            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
+            libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} libcudnn9-dev-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
        fi
        apt-get clean && \
        rm -rf /var/lib/apt/lists/*
--- a/backend/cpp/ik-llama-cpp/CMakeLists.txt
+++ b/backend/cpp/ik-llama-cpp/CMakeLists.txt
@@ -1,15 +1,6 @@
-## Clip/LLaVA library for multimodal support — built locally from copied sources
-set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
-install(TARGETS ${TARGET} LIBRARY)
-target_include_directories(myclip PUBLIC .)
-target_include_directories(myclip PUBLIC ../..)
-target_include_directories(myclip PUBLIC ../../common)
-target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if (NOT MSVC)
-    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual)
-endif()
+## Multimodal support is provided by the in-tree `mtmd` library target
+## (examples/mtmd/), which the grpc-server links and includes below. clip/llava
+## were pruned upstream; the high-level mtmd_* / mtmd_helper_* API is used instead.

 set(TARGET grpc-server)
 set(CMAKE_CXX_STANDARD 17)
@@ -67,12 +58,16 @@ add_library(hw_grpc_proto
  ${hw_proto_hdrs} )

 add_executable(${TARGET} grpc-server.cpp json.hpp)
-target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+# mtmd public headers (mtmd.h / mtmd-helper.h) live in examples/mtmd/.
+# Linking the mtmd target also propagates this include dir, but we add it
+# explicitly for clarity.
+target_include_directories(${TARGET} PRIVATE ../mtmd)
+target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
  absl::flags_parse
  gRPC::${_REFLECTION}
  gRPC::${_GRPC_GRPCPP}
  protobuf::${_PROTOBUF_LIBPROTOBUF})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
 endif()
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=b84902d2ad27c34f989f23947200c4b91b1568fd
+IK_LLAMA_VERSION?=f96eaddba8bed6a9a5e628bbf6a566775c70b49c
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/ik-llama-cpp/grpc-server.cpp
+++ b/backend/cpp/ik-llama-cpp/grpc-server.cpp
@@ -11,8 +11,8 @@
 #include <memory>
 #include <string>
 #include <getopt.h>
-#include "clip.h"
-#include "llava.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
 #include "log.h"
 #include "common.h"
 #include "json.hpp"
@@ -45,7 +45,9 @@ using backend::HealthMessage;

 ///// LLAMA.CPP server code below

-using json = nlohmann::json;
+// Match mtmd.h and ik_llama's server/common headers, which all use
+// nlohmann::ordered_json; a plain nlohmann::json alias collides at global scope.
+using json = nlohmann::ordered_json;

 struct server_params
 {
@@ -219,6 +221,11 @@ struct llama_client_slot

    // multimodal
    std::vector<slot_image> images;
+    // Full prompt with mtmd media markers (mtmd_default_marker()) substituted in
+    // place of the legacy [img-N] tags, covering the text up to and including the
+    // last image. The text after the last image is kept in params.input_suffix and
+    // decoded through the normal token path so the sampling loop is unchanged.
+    std::string mtmd_prompt;

    // stats
    size_t sent_count = 0;
@@ -252,14 +259,14 @@ struct llama_client_slot

        for (slot_image & img : images)
        {
-            free(img.image_embedding);
-            if (img.img_data) {
-                clip_image_u8_free(img.img_data);
+            if (img.bitmap) {
+                mtmd_bitmap_free(img.bitmap);
+                img.bitmap = nullptr;
            }
-            img.prefix_prompt = "";
        }

        images.clear();
+        mtmd_prompt = "";
    }

    bool has_budget(gpt_params &global_params) {
@@ -396,46 +403,13 @@ struct llama_metrics {
    }
 };

-struct llava_embd_batch {
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id>   seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t>         logits;
-    llama_batch batch;
-    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens       =*/ n_tokens,
-            /*tokens         =*/ nullptr,
-            /*embd           =*/ embd,
-            /*pos            =*/ pos.data(),
-            /*n_seq_id       =*/ n_seq_id.data(),
-            /*seq_id         =*/ seq_ids.data(),
-            /*logits         =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
 struct llama_server_context
 {
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    const llama_vocab * vocab = nullptr;

-    clip_ctx *clp_ctx = nullptr;
+    mtmd_context *mctx = nullptr;

    gpt_params params;

@@ -491,11 +465,6 @@ struct llama_server_context
        if (!params.mmproj.path.empty()) {
            multimodal = true;
            LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_model_load(params.mmproj.path.c_str(), /*verbosity=*/ 1);
-            if(clp_ctx == nullptr) {
-                LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
-                return false;
-            }

            if (params.n_ctx < 2048) { // request larger context for the image embedding
                params.n_ctx = 2048;
@@ -512,10 +481,24 @@ struct llama_server_context
        }

        if (multimodal) {
-            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm  = llama_model_n_embd(model);
-            if (n_embd_clip != n_embd_llm) {
-                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+            // mtmd_init_from_file requires the already-loaded text model, so it must
+            // run AFTER llama_init_from_gpt_params. It validates the projector
+            // against the model internally and returns nullptr on dim mismatch, so
+            // the explicit clip_n_mmproj_embd check is no longer needed.
+            mtmd_context_params mparams = mtmd_context_params_default();
+            mparams.use_gpu         = params.mmproj_use_gpu;
+            mparams.print_timings   = false;
+            mparams.n_threads       = params.n_threads_mtmd != -1 ? params.n_threads_mtmd
+                                      : params.n_threads_batch != -1 ? params.n_threads_batch
+                                                                     : params.n_threads;
+            mparams.verbosity       = GGML_LOG_LEVEL_INFO;
+            mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED
+                                                        : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+            mparams.image_min_tokens = params.image_min_tokens;
+            mparams.image_max_tokens = params.image_max_tokens;
+            mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
+            if (mctx == nullptr) {
+                LOG_ERR("unable to load multimodal projector: %s", params.mmproj.path.c_str());
                llama_free(ctx);
                llama_free_model(model);
                return false;
@@ -865,8 +848,8 @@ struct llama_server_context

                    slot_image img_sl;
                    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
-                    img_sl.img_data = clip_image_u8_init();
-                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
+                    img_sl.bitmap = mtmd_helper_bitmap_init_from_buf(mctx, image_buffer.data(), image_buffer.size());
+                    if (img_sl.bitmap == nullptr)
                    {
                        LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d",
                             __func__,
@@ -879,50 +862,74 @@ struct llama_server_context
                        {"slot_id",   slot->id},
                        {"img_sl_id", img_sl.id}
                    });
-                    img_sl.request_encode_image = true;
                    slot->images.push_back(img_sl);
                }
-                // process prompt
-                // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
+                // Translate the legacy [img-N] tags into mtmd media markers, in
+                // order, and collect the matching bitmaps in marker order so they
+                // line up with the markers passed to mtmd_tokenize(). The text after
+                // the last image stays in input_suffix and is decoded through the
+                // normal token path, so the sampling loop is unchanged.
+                // example: system prompt [img-102] user [img-103] describe [img-134]
                if (slot->images.size() > 0 && !slot->prompt.is_array())
                {
+                    const std::string marker = mtmd_default_marker();
                    std::string prompt = slot->prompt.get<std::string>();
-                    size_t pos = 0, begin_prefix = 0;
+                    std::string built_prompt;
+                    std::vector<slot_image> ordered;
+                    size_t pos = 0, copy_from = 0;
                    std::string pattern = "[img-";
-                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
-                        size_t end_prefix = pos;
-                        pos += pattern.length();
-                        size_t end_pos = prompt.find(']', pos);
-                        if (end_pos != std::string::npos)
-                        {
-                            std::string image_id = prompt.substr(pos, end_pos - pos);
-                            try
-                            {
-                                int img_id = std::stoi(image_id);
-                                bool found = false;
-                                for (slot_image &img : slot->images)
-                                {
-                                    if (img.id == img_id) {
-                                        found = true;
-                                        img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
-                                        begin_prefix = end_pos + 1;
-                                        break;
-                                    }
-                                }
-                                if (!found) {
-                                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
-                                    slot->images.clear();
-                                    return false;
-                                }
-                            } catch (const std::invalid_argument& e) {
-                                LOG("Invalid image number id in prompt\n");
-                                slot->images.clear();
-                                return false;
+
+                    auto free_images = [&]() {
+                        for (slot_image &img : slot->images) {
+                            if (img.bitmap) {
+                                mtmd_bitmap_free(img.bitmap);
+                                img.bitmap = nullptr;
                            }
                        }
+                        slot->images.clear();
+                    };
+
+                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
+                        size_t tag_begin = pos;
+                        pos += pattern.length();
+                        size_t end_pos = prompt.find(']', pos);
+                        if (end_pos == std::string::npos) {
+                            break;
+                        }
+                        std::string image_id = prompt.substr(pos, end_pos - pos);
+                        try
+                        {
+                            int img_id = std::stoi(image_id);
+                            bool found = false;
+                            for (slot_image &img : slot->images)
+                            {
+                                if (img.id == img_id) {
+                                    found = true;
+                                    // text before this tag, then the media marker
+                                    built_prompt += prompt.substr(copy_from, tag_begin - copy_from);
+                                    built_prompt += marker;
+                                    copy_from = end_pos + 1;
+                                    ordered.push_back(img);
+                                    break;
+                                }
+                            }
+                            if (!found) {
+                                LOG("ERROR: Image with id: %i, not found.\n", img_id);
+                                free_images();
+                                return false;
+                            }
+                        } catch (const std::invalid_argument& e) {
+                            LOG("Invalid image number id in prompt\n");
+                            free_images();
+                            return false;
+                        }
+                        pos = end_pos + 1;
                    }
+                    // bitmaps are consumed in marker order by mtmd_tokenize()
+                    slot->images = ordered;
+                    slot->mtmd_prompt = built_prompt;
                    slot->prompt = "";
-                    slot->params.input_suffix = prompt.substr(begin_prefix);
+                    slot->params.input_suffix = prompt.substr(copy_from);
                    slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
                }
            }
@@ -1176,21 +1183,10 @@ struct llama_server_context

    bool process_images(llama_client_slot &slot) const
    {
-        for (slot_image &img : slot.images)
-        {
-            if (!img.request_encode_image)
-            {
-                continue;
-            }
-
-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG("Error processing the given image");
-                return false;
-            }
-
-            img.request_encode_image = false;
-        }
-
+        // With the mtmd pipeline, image encoding is no longer eager: the bitmaps
+        // are tokenized and encoded together with the surrounding text inside
+        // ingest_images() via mtmd_tokenize() + mtmd_helper_eval_chunks(). This
+        // just reports whether the slot carries any images to process.
        return slot.images.size() > 0;
    }

@@ -1435,69 +1431,70 @@ struct llama_server_context
        }
    }

-    // for multiple images processing
+    // Tokenize the multimodal prompt (text interleaved with media markers) together
+    // with the slot's bitmaps, then decode the resulting chunks into the llama
+    // context via the high-level mtmd helper. The helper runs llama_decode() on the
+    // text chunks and mtmd_encode() + llama_decode() on the image chunks, handling
+    // batching and any pre/post decode setup (e.g. non-causal attention for gemma3).
+    // Advances slot.n_past by the number of positions consumed, then leaves the
+    // post-image suffix tokens in `batch` so the normal decode + sampling loop
+    // produces the first generated token.
    bool ingest_images(llama_client_slot &slot, int n_batch)
    {
-        int image_idx = 0;
-
-        while (image_idx < (int) slot.images.size())
+        if (mctx == nullptr)
        {
-            slot_image &img = slot.images[image_idx];
+            LOG("%s : multimodal context is not initialized\n", __func__);
+            return false;
+        }

-            // process prefix prompt
-            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
-            {
-                const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-                llama_batch batch_view = {
-                    n_tokens,
-                    batch.token    + i,
-                    nullptr,
-                    batch.pos      + i,
-                    batch.n_seq_id + i,
-                    batch.seq_id   + i,
-                    batch.logits   + i,
-                };
-                if (llama_decode(ctx, batch_view))
-                {
-                    LOG("%s : failed to eval\n", __func__);
-                    return false;
-                }
-            }
+        // bitmaps stay owned by slot.images (freed on reset()); pass non-owning ptrs
+        std::vector<const mtmd_bitmap *> bitmaps;
+        bitmaps.reserve(slot.images.size());
+        for (const slot_image &img : slot.images)
+        {
+            bitmaps.push_back(img.bitmap);
+        }

-            // process image with llm
-            for (int i = 0; i < img.image_tokens; i += n_batch)
-            {
-                int n_eval = img.image_tokens - i;
-                if (n_eval > n_batch)
-                {
-                    n_eval = n_batch;
-                }
+        mtmd_input_text inp_txt;
+        inp_txt.text          = slot.mtmd_prompt.c_str();
+        inp_txt.add_special   = add_bos_token;
+        inp_txt.parse_special = true;

-                const int n_embd = llama_model_n_embd(model);
-                float * embd = img.image_embedding + i * n_embd;
-                llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
-                if (llama_decode(ctx, llava_batch.batch))
-                {
-                    LOG("%s : failed to eval image\n", __func__);
-                    return false;
-                }
-                slot.n_past += n_eval;
-            }
-            image_idx++;
+        mtmd::input_chunks chunks(mtmd_input_chunks_init());
+        int32_t res = mtmd_tokenize(mctx,
+                                    chunks.ptr.get(),
+                                    &inp_txt,
+                                    bitmaps.data(),
+                                    bitmaps.size());
+        if (res != 0)
+        {
+            LOG("%s : failed to tokenize multimodal prompt, res = %d\n", __func__, res);
+            return false;
+        }

-            common_batch_clear(batch);
+        const llama_pos start_pos = (llama_pos) system_tokens.size() + slot.n_past;
+        llama_pos new_n_past = start_pos;
+        if (mtmd_helper_eval_chunks(mctx,
+                                    ctx,
+                                    chunks.ptr.get(),
+                                    start_pos,
+                                    slot.id,
+                                    n_batch,
+                                    /*logits_last=*/ false,
+                                    &new_n_past) != 0)
+        {
+            LOG("%s : failed to eval multimodal chunks\n", __func__);
+            return false;
+        }
+        slot.n_past += (int32_t) (new_n_past - start_pos);

-            // append prefix of next image
-            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
-                slot.params.input_suffix : // no more images, then process suffix prompt
-                (json)(slot.images[image_idx].prefix_prompt);
-
-            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
-            for (int i = 0; i < (int) append_tokens.size(); ++i)
-            {
-                common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
-                slot.n_past += 1;
-            }
+        // queue the post-image suffix text for the normal decode + sampling path
+        common_batch_clear(batch);
+        std::vector<llama_token> suffix_tokens = tokenize(slot.params.input_suffix, false);
+        for (llama_token tok : suffix_tokens)
+        {
+            common_batch_add(batch, tok, system_tokens.size() + slot.n_past, { slot.id }, false);
+            slot.n_past += 1;
        }

        return true;
@@ -1884,8 +1881,11 @@ struct llama_server_context

                    const bool has_images = process_images(slot);

-                    // process the prefix of first image
-                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
+                    // For the multimodal path the whole pre-image / inter-image text is
+                    // tokenized and decoded inside ingest_images() via mtmd, so no prefix
+                    // tokens are queued here; the post-image suffix is appended by
+                    // ingest_images() for the normal decode + sampling loop.
+                    std::vector<llama_token> prefix_tokens = has_images ? std::vector<llama_token>() : prompt_tokens;

                    int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;

--- a/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch
+++ b/backend/cpp/ik-llama-cpp/patches/0002-clip-ggml-quantize-chunk-user-data.patch
@@ -1,11 +0,0 @@
--- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -2494,7 +2494,7 @@
-             }
-             new_data = work.data();
-
-            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
-+            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr, nullptr);
-         } else {
-             new_type = cur->type;
-             new_data = cur->data;
--- a/backend/cpp/ik-llama-cpp/prepare.sh
+++ b/backend/cpp/ik-llama-cpp/prepare.sh
@@ -17,28 +17,9 @@ cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
 cp -r utils.hpp llama.cpp/examples/grpc-server/
 cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/examples/grpc-server/

-## Copy clip/llava files for multimodal support (built as myclip library)
-cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
-cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
-# Prepend llama.h include to llava.h
-echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
-cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
-# Copy clip-impl.h if it exists
-if [ -f llama.cpp/examples/llava/clip-impl.h ]; then
-    cp -rfv llama.cpp/examples/llava/clip-impl.h llama.cpp/examples/grpc-server/clip-impl.h
-fi
-# Copy stb_image.h
-if [ -f llama.cpp/vendor/stb/stb_image.h ]; then
-    cp -rfv llama.cpp/vendor/stb/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
-elif [ -f llama.cpp/common/stb_image.h ]; then
-    cp -rfv llama.cpp/common/stb_image.h llama.cpp/examples/grpc-server/stb_image.h
-fi
-
-## Fix API compatibility in llava.cpp (llama_n_embd -> llama_model_n_embd)
-if [ -f llama.cpp/examples/grpc-server/llava.cpp ]; then
-    sed -i 's/llama_n_embd(/llama_model_n_embd(/g' llama.cpp/examples/grpc-server/llava.cpp
-fi
+## Multimodal support is provided by the `mtmd` library target (examples/mtmd/),
+## which the grpc-server links and includes directly. No source copy is needed:
+## clip/llava were pruned upstream and the high-level mtmd_* API is used instead.

 set +e
 if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
--- a/backend/cpp/ik-llama-cpp/utils.hpp
+++ b/backend/cpp/ik-llama-cpp/utils.hpp
@@ -11,9 +11,12 @@

 #include "json.hpp"

-#include "clip.h"
+#include "mtmd.h"

-using json = nlohmann::json;
+// mtmd.h and ik_llama's entire server/common stack (chat.h, server-common.h,
+// server-task.h, ...) declare `using json = nlohmann::ordered_json`, so match it
+// here: a plain `nlohmann::json` alias collides with mtmd.h's at global scope.
+using json = nlohmann::ordered_json;

 extern bool server_verbose;

@@ -111,13 +114,12 @@ struct slot_image
 {
    int32_t id;

-    bool request_encode_image = false;
-    float * image_embedding = nullptr;
-    int32_t image_tokens = 0;
-
-    clip_image_u8 * img_data;
-
-    std::string prefix_prompt; // before of this image
+    // mtmd bitmap (image/audio) decoded from the request buffer. Owned by the
+    // slot; freed via mtmd_bitmap_free() on reset. The high-level mtmd pipeline
+    // (mtmd_tokenize + mtmd_helper_eval_chunks) consumes these directly, so the
+    // legacy eager-encode fields (embedding/tokens) and per-image prefix prompt
+    // are no longer needed.
+    mtmd_bitmap * bitmap = nullptr;
 };

 // completion token output with probabilities
--- a/backend/cpp/llama-cpp-localai-paged/Makefile
+++ b/backend/cpp/llama-cpp-localai-paged/Makefile
@@ -49,7 +49,7 @@
 # helpers that the refactor pulled into the headers grpc-server.cpp includes.
 # Therefore a PIN_SYNC must pass the FULL grpc-server build/link on CI, not only
 # the bit-exact gate. See README section 7 + .agents/llama-cpp-localai-paged-backend.md.
-LLAMA_VERSION?=9d5d882d8cd0f0a9283d87ed5e6fe3ee0d925fb1
+LLAMA_VERSION?=0ed235ea2c17a19fc8238668653946721ed136fd

 CMAKE_ARGS?=
 BUILD_TYPE?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -4,7 +4,7 @@
 # (backend/cpp/llama-cpp-localai-paged) does NOT inherit this pin: it owns its
 # own LLAMA_VERSION because its vendored patch series would break on a naive
 # bump and is advanced only by the manual PIN_SYNC process.
-LLAMA_VERSION?=9d5d882d8cd0f0a9283d87ed5e6fe3ee0d925fb1
+LLAMA_VERSION?=0ed235ea2c17a19fc8238668653946721ed136fd
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
@@ -161,11 +161,11 @@ llama-cpp-grpc: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
 	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target ggml-rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc

 llama-cpp-rpc-server: llama-cpp-grpc
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/ggml-rpc-server llama-cpp-rpc-server

 llama.cpp:
 	mkdir -p llama.cpp
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -30,6 +30,19 @@
 #define LOCALAI_HAS_SERVER_SCHEMA 1
 #include "server-schema.cpp"
 #endif
+// server-stream.cpp exists only in llama.cpp after the upstream refactor that
+// added the SSE stream-resumption layer (stream_session/stream_pipe_producer).
+// server-context.cpp calls into it (spipe->cleanup(), stream_aware_should_stop,
+// stream_session_attach_pipe), so its definitions must be part of this
+// translation unit or the link fails with "undefined reference to
+// stream_pipe_producer::cleanup()". The file is self-contained (its only
+// external symbols come from server-common, already pulled in above) and the
+// http route-handler factories it also defines are unused here but harmless.
+// __has_include keeps the source compatible with older pins/forks that predate
+// the split.
+#if __has_include("server-stream.cpp")
+#include "server-stream.cpp"
+#endif
 #include "server-context.cpp"

 // LocalAI
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=8f1218141b792b8868861c1af17ba1e361b05dc0
+CRISPASR_VERSION?=6514c9da00b03a2f0f1b49a43fae4f3a01a41844
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/face-detect/.gitignore
+++ b/backend/go/face-detect/.gitignore
@@ -0,0 +1,18 @@
+# Fetched upstream sources
+sources/
+
+# CMake build directories
+build*/
+
+# build artifacts staged in-tree by the Makefile (cp from sources/) or
+# symlinked for local dev; the real sources live in face-detect.cpp upstream.
+*.so
+*.so.*
+facedetect_capi.h
+compile_commands.json
+
+# Compiled backend binary
+face-detect-grpc
+
+# Packaging output
+package/
--- a/backend/go/face-detect/Makefile
+++ b/backend/go/face-detect/Makefile
@@ -0,0 +1,110 @@
+# face-detect backend Makefile.
+#
+# Upstream pin lives below as FACEDETECT_VERSION?=06914b0... (.github/bump_deps.sh
+# can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
+# convention).
+#
+# Local dev shortcut: if you already have an out-of-tree face-detect.cpp build,
+# symlink the .so + header into this directory and skip the clone/cmake steps:
+#
+#   ln -sf /path/to/face-detect.cpp/build-shared/libfacedetect.so .
+#   ln -sf /path/to/face-detect.cpp/include/facedetect_capi.h .
+#   go build -o face-detect-grpc .
+#
+# The default target below does the proper clone-at-pin + cmake build so CI does
+# not need a side-checkout.
+
+FACEDETECT_VERSION?=06914b077d52f90d5421299138e7be6bdd06b5e8
+FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp
+
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+
+BUILD_TYPE?=
+NATIVE?=false
+
+# Resolve the target arch. The backend matrix / Docker build pass TARGETARCH
+# (amd64|arm64); fall back to uname -m (aarch64|x86_64) for a local build.
+RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
+
+# Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
+# so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
+# only system libs (libstdc++/libgomp/libc) the runtime image already provides.
+# The vendored jpeg symbols are hidden via -Wl,--exclude-libs,ALL on the C++
+# side, so only the facedetect_capi_* surface is exported.
+CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DFACEDETECT_SHARED=ON -DFACEDETECT_BUILD_CLI=OFF -DFACEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+# face-detect.cpp gates its GGML backends behind FACEDETECT_GGML_* options and
+# does set(GGML_CUDA ${FACEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
+# -DGGML_CUDA=ON is overwritten back to OFF. Forward the FACEDETECT_GGML_*
+# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
+	# Opt-in cuDNN implicit-GEMM conv path (kills im2col on GPU, SCRFD 2.3x
+	# vs torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
+	# ships libcudnn9 + the -dev headers, so gate cuDNN to that variant.
+	# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
+	ifeq ($(CUDA_MAJOR_VERSION),13)
+	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
+		CMAKE_ARGS+=-DFACEDETECT_GGML_CUDNN=ON
+	endif
+	endif
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DFACEDETECT_GGML_HIP=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DFACEDETECT_GGML_VULKAN=ON
+else ifeq ($(BUILD_TYPE),metal)
+	CMAKE_ARGS+=-DFACEDETECT_GGML_METAL=ON
+endif
+
+.PHONY: face-detect-grpc package build clean purge test all
+
+all: face-detect-grpc
+
+# Clone the upstream face-detect.cpp source at the pinned commit. Directory acts
+# as the target so make only re-clones when missing. After a FACEDETECT_VERSION
+# bump, run 'make purge && make' to refetch.
+#
+# The whole fetch runs inside one subshell. If any step fails, the directory is
+# removed again: it is the make target, so a half-populated checkout left
+# behind would satisfy this rule on the next run and hand cmake a broken tree.
+sources/face-detect.cpp:
+	mkdir -p sources/face-detect.cpp
+	( cd sources/face-detect.cpp && \
+	git init -q && \
+	git remote add origin $(FACEDETECT_REPO) && \
+	git fetch --depth 1 origin $(FACEDETECT_VERSION) && \
+	git checkout FETCH_HEAD && \
+	git submodule update --init --recursive --depth 1 --single-branch ) || \
+	{ rm -rf sources/face-detect.cpp; exit 1; }
+
+# Build the shared lib + header out-of-tree, then stage them next to the Go
+# sources so purego.Dlopen("libfacedetect.so") and the cgo-less build both pick
+# them up.
+#
+# NOTE: the library copy is best-effort (stderr is discarded and a failure is
+# ignored via '|| true'), so a missing libfacedetect.so does not fail this
+# rule. The header copy on the following line is not guarded and does fail.
+libfacedetect.so: sources/face-detect.cpp
+	cmake -B sources/face-detect.cpp/build-shared -S sources/face-detect.cpp $(CMAKE_ARGS)
+	cmake --build sources/face-detect.cpp/build-shared --config Release -j$(JOBS) --target facedetect
+	cp -fv sources/face-detect.cpp/build-shared/libfacedetect.so* ./ 2>/dev/null || true
+	cp -fv sources/face-detect.cpp/include/facedetect_capi.h ./
+
+face-detect-grpc: libfacedetect.so main.go gofacedetect.go options.go
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o face-detect-grpc .
+
+package: face-detect-grpc
+	bash package.sh
+
+build: package
+
+# Test target. The embed/detect/verify/analyze smoke specs are gated on
+# FACEDETECT_BACKEND_TEST_MODEL + FACEDETECT_BACKEND_TEST_IMAGE; without them the
+# heavy specs auto-skip and only the pure-Go parsing specs run.
+test:
+	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
+
+clean: purge
+	rm -rf libfacedetect.so* facedetect_capi.h package face-detect-grpc
+
+purge:
+	rm -rf sources/face-detect.cpp
--- a/backend/go/face-detect/gofacedetect.go
+++ b/backend/go/face-detect/gofacedetect.go
@@ -0,0 +1,431 @@
+package main
+
+import (
+	"encoding/base64"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+)
+
+// purego-bound entry points from libfacedetect.so. Names match
+// facedetect_capi.h exactly so a `nm libfacedetect.so | grep facedetect_capi`
+// is enough to spot drift.
+//
+// The opaque ctx and the malloc'd char*/float* return values are declared as
+// uintptr so we get the raw pointer back and can release it via the matching
+// capi free function. purego's native string/[]float32 returns would copy and
+// forget the original pointer, leaking the C-owned buffer on every call.
+//
+// Every variable is nil until purego.RegisterLibFunc binds it (done in main
+// and in the test bootstrap); calling one before that panics on the nil func.
+// CppLastError is the only binding declared with a Go string return.
+// NOTE(review): presumably its buffer stays owned by the ctx and needs no
+// free - confirm against facedetect_capi.h.
+var (
+	CppAbiVersion  func() int32
+	CppLoad        func(ggufPath string) uintptr
+	CppFree        func(ctx uintptr)
+	CppLastError   func(ctx uintptr) string
+	CppFreeString  func(s uintptr)
+	CppFreeVec     func(v uintptr)
+	CppEmbedPath   func(ctx uintptr, imagePath string, outVec, outDim unsafe.Pointer) int32
+	CppEmbedRGB    func(ctx uintptr, rgb []byte, width, height int32, outVec, outDim unsafe.Pointer) int32
+	CppDetectJSON  func(ctx uintptr, imagePath string) uintptr
+	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, antiSpoof int32, outDistance, outVerified unsafe.Pointer) int32
+	CppAnalyzeJSON func(ctx uintptr, imagePath string) uintptr
+)
+
+// FaceDetect implements the face-recognition (biometric) subset of the Backend
+// gRPC service over libfacedetect.so. The C side keeps a single loaded model
+// pack plus a per-ctx last-error buffer and is not reentrant, so
+// base.SingleThread serializes every call.
+type FaceDetect struct {
+	base.SingleThread
+	opts   loadOptions // model-level options parsed by Load
+	ctxPtr uintptr     // opaque handle returned by CppLoad; 0 means "no model loaded"
+}
+
+// Load resolves the model location, parses the model-level options, exports
+// the thread budget and loads the model pack through the C-API.
+//
+// ModelFile is preferred and, when relative, resolved against ModelPath; with
+// no ModelFile, ModelPath itself is taken as the model location. An error is
+// returned when no location is given, when FACEDETECT_THREADS cannot be set, or
+// when facedetect_capi_load hands back a null context. The receiver's options
+// and context are only replaced once the new model has loaded, and a context
+// from an earlier Load is released at that point.
+func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
+	model := opts.ModelFile
+	switch {
+	case model == "":
+		// ModelPath already is the full location here. Joining it onto itself
+		// would turn a relative "models/x.gguf" into
+		// "models/x.gguf/models/x.gguf".
+		model = opts.ModelPath
+	case !filepath.IsAbs(model) && opts.ModelPath != "":
+		model = filepath.Join(opts.ModelPath, model)
+	}
+	if model == "" {
+		return errors.New("face-detect: ModelFile is required")
+	}
+
+	// Parse into a local so a failed load leaves the current options intact.
+	parsed := parseOptions(opts.Options)
+	if parsed.modelName == "" {
+		parsed.modelName = filepath.Base(model)
+	}
+
+	// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
+	// one backend process per model and serves requests concurrently, so the
+	// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
+	// FACEDETECT_THREADS is read by the engine at backend construction, so it
+	// must be set before the capi load. A non-positive Threads means "unset":
+	// leave the env alone so the engine keeps its sane default.
+	threads := opts.Threads
+	if threads > 0 {
+		if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
+			return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
+		}
+		xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
+	}
+
+	xlog.Info("face-detect: loading model", "model", model,
+		"verify_threshold", parsed.verifyThreshold, "abi", CppAbiVersion())
+
+	ctx := CppLoad(model)
+	if ctx == 0 {
+		// The last-error buffer lives on the ctx that was never returned, so
+		// surface the path the operator tried to load instead.
+		return fmt.Errorf("face-detect: facedetect_capi_load failed for %q", model)
+	}
+
+	// A repeated Load replaces the model: release the previous context so its
+	// model pack is not leaked once the pointer is overwritten.
+	if f.ctxPtr != 0 {
+		CppFree(f.ctxPtr)
+	}
+	f.opts = parsed
+	f.ctxPtr = ctx
+	return nil
+}
+
+// Embeddings produces the L2-normalized ArcFace embedding for the primary face
+// of the request image. As in the Python face backend, the image arrives as a
+// base64 payload in Images[0]; materializeImage writes it to a temp file so
+// the path-based C-API performs its own decode (cv2.imread parity). The gRPC
+// server wraps the returned slice in an EmbeddingResult.
+//
+// Fails when no model is loaded, when Images[0] is missing or empty, when the
+// payload cannot be materialized, or when the engine rejects the image.
+func (f *FaceDetect) Embeddings(req *pb.PredictOptions) ([]float32, error) {
+	if f.ctxPtr == 0 {
+		return nil, errors.New("face-detect: model not loaded")
+	}
+
+	hasImage := len(req.Images) > 0 && req.Images[0] != ""
+	if !hasImage {
+		return nil, errors.New("face-detect: Embedding requires Images[0] to be a base64 image")
+	}
+
+	imagePath, release, err := materializeImage(req.Images[0])
+	if err != nil {
+		return nil, err
+	}
+	// Drop the temp file (if one was created) once the engine is done with it.
+	defer release()
+
+	return f.embedPath(imagePath)
+}
+
+// embedPath runs the path-based embed entry point on an image file and returns
+// a Go-owned copy of the resulting vector. A non-zero return code, a null
+// vector or a non-positive dimension is reported as an error built from the
+// ctx's last-error buffer.
+func (f *FaceDetect) embedPath(path string) ([]float32, error) {
+	var vec uintptr
+	var dim int32
+	rc := CppEmbedPath(f.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
+	if rc == 0 && vec != 0 {
+		// A successful call hands over a malloc'd buffer. Release it on every
+		// return below, including the dim <= 0 rejection, which previously
+		// returned without freeing it. The rc != 0 case is left alone because
+		// the ownership of outVec on failure is not visible from here.
+		defer CppFreeVec(vec)
+	}
+	if rc != 0 || vec == 0 || dim <= 0 {
+		return nil, f.lastErr("embed", path)
+	}
+	// Copy out of the C-owned malloc'd buffer before freeing it. The
+	// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
+	// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
+	// nor moves this buffer and we copy immediately.
+	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
+	out := make([]float32, int(dim))
+	copy(out, src)
+	return out, nil
+}
+
+// Detect runs SCRFD on the source image and reports one Detection per face
+// that passes the optional score threshold. The C-API describes a box as
+// [x1,y1,x2,y2] in pixels, whereas the proto carries x/y plus width/height, so
+// each box is converted. The engine's 5 facial landmarks are discarded because
+// the Detection message has nowhere to put them.
+//
+// Fails when no model is loaded, when Src is empty, when the image cannot be
+// materialized, or when detection itself fails.
+func (f *FaceDetect) Detect(req *pb.DetectOptions) (pb.DetectResponse, error) {
+	if f.ctxPtr == 0 {
+		return pb.DetectResponse{}, errors.New("face-detect: model not loaded")
+	}
+	if req.Src == "" {
+		return pb.DetectResponse{}, errors.New("face-detect: src image is required")
+	}
+
+	imagePath, release, err := materializeImage(req.Src)
+	if err != nil {
+		return pb.DetectResponse{}, err
+	}
+	defer release()
+
+	found, err := f.detectFaces(imagePath)
+	if err != nil {
+		return pb.DetectResponse{}, err
+	}
+
+	// A non-positive threshold disables score filtering altogether.
+	minScore := req.Threshold
+	detections := make([]*pb.Detection, 0, len(found))
+	for i := range found {
+		face := found[i]
+		if minScore > 0 && face.Score < minScore {
+			continue
+		}
+		left, top, width, height := face.xywh()
+		detection := &pb.Detection{
+			X:          left,
+			Y:          top,
+			Width:      width,
+			Height:     height,
+			Confidence: face.Score,
+			ClassName:  "face",
+		}
+		detections = append(detections, detection)
+	}
+	return pb.DetectResponse{Detections: detections}, nil
+}
+
+// FaceVerify compares the primary face of two images and reports whether they
+// belong to the same identity, using cosine distance against a threshold. A
+// request threshold <= 0 selects the model-configured default
+// (verify_threshold option, 0.35 if unset). With anti_spoofing set, the C-API
+// applies a MiniFASNet veto internally (verified forced false on a spoof); the
+// verify entry point does not expose per-image liveness scores, so
+// img*_is_real / img*_antispoof_score keep their zero values.
+//
+// Fails when no model is loaded, when either image is missing or cannot be
+// materialized, or when the engine's verify call returns non-zero.
+func (f *FaceDetect) FaceVerify(req *pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
+	if f.ctxPtr == 0 {
+		return pb.FaceVerifyResponse{}, errors.New("face-detect: model not loaded")
+	}
+	if req.Img1 == "" || req.Img2 == "" {
+		return pb.FaceVerifyResponse{}, errors.New("face-detect: img1 and img2 are required")
+	}
+
+	// Request-level settings first: the threshold (with fallback) and the
+	// anti-spoofing switch translated to the int flag the C-API takes.
+	cutoff := req.Threshold
+	if cutoff <= 0 {
+		cutoff = f.opts.verifyThreshold
+	}
+	var spoofFlag int32
+	if req.AntiSpoofing {
+		spoofFlag = 1
+	}
+
+	firstPath, releaseFirst, err := materializeImage(req.Img1)
+	if err != nil {
+		return pb.FaceVerifyResponse{}, err
+	}
+	defer releaseFirst()
+	secondPath, releaseSecond, err := materializeImage(req.Img2)
+	if err != nil {
+		return pb.FaceVerifyResponse{}, err
+	}
+	defer releaseSecond()
+
+	var (
+		dist    float32
+		matched int32
+	)
+	begin := time.Now()
+	status := CppVerifyPaths(f.ctxPtr, firstPath, secondPath, cutoff, spoofFlag,
+		unsafe.Pointer(&dist), unsafe.Pointer(&matched))
+	if status != 0 {
+		// Only a short prefix of the first input is echoed back in the error.
+		prefixLen := min(8, len(req.Img1))
+		return pb.FaceVerifyResponse{}, f.lastErr("verify", req.Img1[:prefixLen]+"...")
+	}
+	tookMs := float32(time.Since(begin).Seconds() * 1000.0)
+
+	// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
+	// matching the Python face backend's reporting.
+	var score float32
+	if cutoff > 0 {
+		score = float32(math.Max(0, math.Min(100, (1.0-float64(dist)/float64(cutoff))*100.0)))
+	}
+
+	return pb.FaceVerifyResponse{
+		Verified:         matched != 0,
+		Distance:         dist,
+		Threshold:        cutoff,
+		Confidence:       score,
+		Model:            f.opts.modelName,
+		Img1Area:         f.bestArea(firstPath),
+		Img2Area:         f.bestArea(secondPath),
+		ProcessingTimeMs: tookMs,
+	}, nil
+}
+
+// FaceAnalyze runs the genderage head over every face the engine detects in
+// the image. The C-API reports gender as "M"/"F" together with a rounded age;
+// the labels are rewritten to the "Man"/"Woman" values the proto documents.
+//
+// Fails when no model is loaded, when Img is empty or cannot be materialized,
+// when the engine returns no document, or when that document is not valid
+// JSON.
+func (f *FaceDetect) FaceAnalyze(req *pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) {
+	if f.ctxPtr == 0 {
+		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: model not loaded")
+	}
+	if req.Img == "" {
+		return pb.FaceAnalyzeResponse{}, errors.New("face-detect: img is required")
+	}
+
+	imagePath, release, err := materializeImage(req.Img)
+	if err != nil {
+		return pb.FaceAnalyzeResponse{}, err
+	}
+	defer release()
+
+	raw := CppAnalyzeJSON(f.ctxPtr, imagePath)
+	if raw == 0 {
+		return pb.FaceAnalyzeResponse{}, f.lastErr("analyze", imagePath)
+	}
+	// The JSON buffer is C-owned; hand it back once it has been copied out.
+	defer CppFreeString(raw)
+
+	document := goStringFromCPtr(raw)
+	analyses, err := parseAnalyzeJSON(document)
+	if err != nil {
+		return pb.FaceAnalyzeResponse{}, fmt.Errorf("face-detect: analyze JSON: %w", err)
+	}
+	return pb.FaceAnalyzeResponse{Faces: analyses}, nil
+}
+
+// faceBox models a single face entry in the detect/analyze JSON documents
+// produced by the engine.
+type faceBox struct {
+	Score  float32   `json:"score"`
+	Box    []float32 `json:"box"`
+	Age    float32   `json:"age"`
+	Gender string    `json:"gender"`
+}
+
+// xywh turns the engine's [x1,y1,x2,y2] corners into the x/y/width/height form
+// the proto uses. With fewer than four box values all results are zero.
+func (b faceBox) xywh() (x, y, w, h float32) {
+	if len(b.Box) >= 4 {
+		x, y = b.Box[0], b.Box[1]
+		w, h = b.Box[2]-x, b.Box[3]-y
+	}
+	return x, y, w, h
+}
+
+// facesJSON is the top-level shape of the engine's JSON documents: a "faces"
+// array of faceBox entries.
+type facesJSON struct {
+	Faces []faceBox `json:"faces"`
+}
+
+// detectFaces calls the engine's detect entry point for an image path and
+// decodes the returned JSON into faceBox entries. A null result is reported
+// via the ctx's last-error buffer; malformed JSON is reported as a wrapped
+// decode error.
+func (f *FaceDetect) detectFaces(path string) ([]faceBox, error) {
+	raw := CppDetectJSON(f.ctxPtr, path)
+	if raw == 0 {
+		return nil, f.lastErr("detect", path)
+	}
+	// Copy the document into Go memory, then give the C buffer back right away.
+	payload := goStringFromCPtr(raw)
+	CppFreeString(raw)
+
+	var decoded facesJSON
+	if err := json.Unmarshal([]byte(payload), &decoded); err != nil {
+		return nil, fmt.Errorf("face-detect: detect JSON: %w", err)
+	}
+	return decoded.Faces, nil
+}
+
+// bestArea reports the FacialArea of the highest-scoring face in an image. It
+// is best-effort: when detection fails or finds nothing it returns an empty
+// area, because verify has already succeeded and a missing region must not
+// turn a valid match into an error.
+func (f *FaceDetect) bestArea(path string) *pb.FacialArea {
+	faces, err := f.detectFaces(path)
+	if err != nil || len(faces) == 0 {
+		return &pb.FacialArea{}
+	}
+
+	// Index of the top score; on ties the earliest face is kept.
+	top := 0
+	for i := 1; i < len(faces); i++ {
+		if faces[i].Score > faces[top].Score {
+			top = i
+		}
+	}
+
+	left, upper, width, height := faces[top].xywh()
+	return &pb.FacialArea{X: left, Y: upper, W: width, H: height}
+}
+
+// parseAnalyzeJSON converts the engine's analyze document into FaceAnalysis
+// entries, one per face. Gender arrives as "M"/"F"; the dominant label and the
+// score map both use the "Man"/"Woman" form the proto documents. A face with
+// no gender keeps both fields unset. Returns the JSON decode error unchanged
+// when the document is malformed.
+func parseAnalyzeJSON(doc string) ([]*pb.FaceAnalysis, error) {
+	var decoded facesJSON
+	err := json.Unmarshal([]byte(doc), &decoded)
+	if err != nil {
+		return nil, err
+	}
+
+	analyses := make([]*pb.FaceAnalysis, 0, len(decoded.Faces))
+	for i := range decoded.Faces {
+		face := decoded.Faces[i]
+		left, top, width, height := face.xywh()
+
+		entry := &pb.FaceAnalysis{
+			Region:         &pb.FacialArea{X: left, Y: top, W: width, H: height},
+			FaceConfidence: face.Score,
+			Age:            face.Age,
+		}
+		gender := normalizeGender(face.Gender)
+		if gender != "" {
+			// The engine gives a hard label only, so it gets full weight.
+			entry.DominantGender = gender
+			entry.Gender = map[string]float32{gender: 1.0}
+		}
+		analyses = append(analyses, entry)
+	}
+	return analyses, nil
+}
+
+// normalizeGender translates the engine's "M"/"F" code (case-insensitive,
+// surrounding whitespace ignored) into the "Man"/"Woman" labels the proto
+// documents. Blank input yields ""; any other value is returned exactly as
+// given.
+func normalizeGender(g string) string {
+	code := strings.ToUpper(strings.TrimSpace(g))
+	if code == "M" {
+		return "Man"
+	}
+	if code == "F" {
+		return "Woman"
+	}
+	if code == "" {
+		return ""
+	}
+	return g
+}
+
+// materializeImage makes an image available as a file on disk and returns its
+// path together with a cleanup func. A base64 payload (optionally wrapped in a
+// data: URI, whose prefix is stripped) is decoded into a temp file that the
+// cleanup removes. For callers that already hold a filesystem path (e.g. a
+// test fixture), an existing path is returned untouched with a no-op cleanup.
+//
+// Every error return carries an empty path and a no-op cleanup.
+func materializeImage(src string) (path string, cleanup func(), err error) {
+	nothing := func() {}
+	if src == "" {
+		return "", nothing, errors.New("face-detect: empty image input")
+	}
+
+	// Existing file: use it directly and leave it alone afterwards.
+	_, statErr := os.Stat(src)
+	if statErr == nil {
+		return src, nothing, nil
+	}
+
+	encoded := src
+	if strings.HasPrefix(encoded, "data:") {
+		if comma := strings.IndexByte(encoded, ','); comma >= 0 {
+			encoded = encoded[comma+1:]
+		}
+	}
+	raw, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(encoded))
+	if decErr != nil || len(raw) == 0 {
+		return "", nothing, errors.New("face-detect: image is neither an existing path nor valid base64")
+	}
+
+	file, createErr := os.CreateTemp("", "face-detect-*.img")
+	if createErr != nil {
+		return "", nothing, fmt.Errorf("face-detect: create temp image: %w", createErr)
+	}
+	name := file.Name()
+	remove := func() { _ = os.Remove(name) }
+
+	// Always close the handle; a write failure takes precedence over whatever
+	// the close reports.
+	_, writeErr := file.Write(raw)
+	closeErr := file.Close()
+	if writeErr != nil {
+		remove()
+		return "", nothing, fmt.Errorf("face-detect: write temp image: %w", writeErr)
+	}
+	if closeErr != nil {
+		remove()
+		return "", nothing, fmt.Errorf("face-detect: close temp image: %w", closeErr)
+	}
+	return name, remove, nil
+}
+
+// lastErr builds a Go error for a failed operation from the C-API's per-ctx
+// last-error buffer. op names the operation and subject what it acted on; an
+// empty buffer is reported as "no error detail".
+func (f *FaceDetect) lastErr(op, subject string) error {
+	detail := strings.TrimSpace(CppLastError(f.ctxPtr))
+	if len(detail) == 0 {
+		detail = "no error detail"
+	}
+	return fmt.Errorf("face-detect: %s failed for %q: %s", op, subject, detail)
+}
+
+// goStringFromCPtr returns a Go copy of the NUL-terminated C string at cptr,
+// or "" for a null pointer. The buffer stays owned by the caller, who releases
+// it via CppFreeString once the copy has been made.
+//
+// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
+// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
+// moves the buffer and we dereference it immediately to copy the bytes out.
+func goStringFromCPtr(cptr uintptr) string {
+	if cptr == 0 {
+		return ""
+	}
+	base := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
+
+	// Walk forward to the terminating NUL to learn the length.
+	length := 0
+	for {
+		if *(*byte)(unsafe.Add(base, length)) == 0 {
+			break
+		}
+		length++
+	}
+	return string(unsafe.Slice((*byte)(base), length))
+}
--- a/backend/go/face-detect/gofacedetect_test.go
+++ b/backend/go/face-detect/gofacedetect_test.go
@@ -0,0 +1,230 @@
+package main
+
+import (
+	"encoding/base64"
+	"os"
+	"sync"
+	"testing"
+
+	"github.com/ebitengine/purego"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestFaceDetect is the go test entry point: it routes Gomega failures to
+// Ginkgo's Fail handler and runs every spec registered in this package.
+func TestFaceDetect(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "face-detect Backend Suite")
+}
+
+var (
+	libLoadOnce sync.Once
+	libLoadErr  error
+)
+
+// ensureLibLoaded performs the same bootstrap as main.go so a Go test can
+// drive the C-API bridge without starting the gRPC server. The library is
+// opened and bound at most once; if libfacedetect.so cannot be loaded from cwd
+// (LD_LIBRARY_PATH or a symlink in ./), the dlopen error is recorded and
+// returned on every call so the smoke specs can skip themselves.
+func ensureLibLoaded() error {
+	libLoadOnce.Do(func() {
+		libName := "libfacedetect.so"
+		if override := os.Getenv("FACEDETECT_LIBRARY"); override != "" {
+			libName = override
+		}
+
+		handle, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+		if err != nil {
+			libLoadErr = err
+			return
+		}
+
+		// Go-side function variable paired with the exported C symbol it binds.
+		bindings := []struct {
+			target any
+			symbol string
+		}{
+			{&CppAbiVersion, "facedetect_capi_abi_version"},
+			{&CppLoad, "facedetect_capi_load"},
+			{&CppFree, "facedetect_capi_free"},
+			{&CppLastError, "facedetect_capi_last_error"},
+			{&CppFreeString, "facedetect_capi_free_string"},
+			{&CppFreeVec, "facedetect_capi_free_vec"},
+			{&CppEmbedPath, "facedetect_capi_embed_path"},
+			{&CppEmbedRGB, "facedetect_capi_embed_rgb"},
+			{&CppDetectJSON, "facedetect_capi_detect_path_json"},
+			{&CppVerifyPaths, "facedetect_capi_verify_paths"},
+			{&CppAnalyzeJSON, "facedetect_capi_analyze_path_json"},
+		}
+		for _, b := range bindings {
+			purego.RegisterLibFunc(b.target, handle, b.symbol)
+		}
+	})
+	return libLoadErr
+}
+
+// Pure-Go specs for parseOptions: the default threshold, the recognized keys
+// (including the "threshold" alias) and rejection of non-positive values.
+var _ = Describe("parseOptions", func() {
+	It("defaults verify_threshold to 0.35", func() {
+		o := parseOptions(nil)
+		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
+		Expect(o.modelName).To(Equal(""))
+	})
+
+	It("parses verify_threshold, threshold alias and model_name", func() {
+		// "unknown:x" checks that unrecognized keys are skipped without error.
+		o := parseOptions([]string{"verify_threshold:0.4", "model_name:buffalo_l", "unknown:x"})
+		Expect(o.verifyThreshold).To(Equal(float32(0.4)))
+		Expect(o.modelName).To(Equal("buffalo_l"))
+
+		o2 := parseOptions([]string{"threshold:0.3"})
+		Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
+	})
+
+	It("ignores non-positive thresholds and keeps the default", func() {
+		o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
+		Expect(o.verifyThreshold).To(Equal(float32(0.35)))
+	})
+})
+
+// Pure-Go specs for normalizeGender: case/whitespace-insensitive M/F mapping
+// and pass-through of empty or unrecognized values.
+var _ = Describe("normalizeGender", func() {
+	It("maps M/F codes to Man/Woman", func() {
+		Expect(normalizeGender("M")).To(Equal("Man"))
+		Expect(normalizeGender("f")).To(Equal("Woman"))
+		Expect(normalizeGender(" m ")).To(Equal("Man"))
+	})
+
+	It("passes empty and unknown codes through", func() {
+		Expect(normalizeGender("")).To(Equal(""))
+		Expect(normalizeGender("nonbinary")).To(Equal("nonbinary"))
+	})
+})
+
+// Pure-Go specs for faceBox.xywh: corner-to-size conversion and the all-zero
+// result for a box with fewer than four values.
+var _ = Describe("faceBox.xywh", func() {
+	It("converts an [x1,y1,x2,y2] box to x/y/width/height", func() {
+		b := faceBox{Box: []float32{10, 20, 50, 80}}
+		x, y, w, h := b.xywh()
+		Expect(x).To(Equal(float32(10)))
+		Expect(y).To(Equal(float32(20)))
+		Expect(w).To(Equal(float32(40)))
+		Expect(h).To(Equal(float32(60)))
+	})
+
+	It("returns zeros for a short box", func() {
+		x, y, w, h := faceBox{Box: []float32{1, 2}}.xywh()
+		Expect([]float32{x, y, w, h}).To(Equal([]float32{0, 0, 0, 0}))
+	})
+})
+
+// Pure-Go specs for parseAnalyzeJSON, fed with hand-written documents in the
+// {"faces":[...]} shape: field mapping, a missing gender, an empty face list
+// and malformed input.
+var _ = Describe("parseAnalyzeJSON", func() {
+	It("maps region, age and gender for each face", func() {
+		doc := `{"faces":[
+			{"score":0.997,"box":[10,20,50,80],"age":31,"gender":"M"},
+			{"score":0.81,"box":[0,0,40,40],"age":24,"gender":"F"}]}`
+		faces, err := parseAnalyzeJSON(doc)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(faces).To(HaveLen(2))
+
+		Expect(faces[0].FaceConfidence).To(BeNumerically("~", 0.997, 1e-4))
+		Expect(faces[0].Age).To(BeNumerically("~", 31, 1e-4))
+		Expect(faces[0].DominantGender).To(Equal("Man"))
+		Expect(faces[0].Gender).To(HaveKeyWithValue("Man", float32(1.0)))
+		// Box [10,20,50,80] -> width 50-10, height 80-20.
+		Expect(faces[0].Region.W).To(Equal(float32(40)))
+		Expect(faces[0].Region.H).To(Equal(float32(60)))
+
+		Expect(faces[1].DominantGender).To(Equal("Woman"))
+	})
+
+	It("tolerates a missing gender field", func() {
+		faces, err := parseAnalyzeJSON(`{"faces":[{"score":0.5,"box":[0,0,10,10],"age":40}]}`)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(faces).To(HaveLen(1))
+		Expect(faces[0].DominantGender).To(Equal(""))
+		Expect(faces[0].Gender).To(BeEmpty())
+	})
+
+	It("returns no faces for an empty document", func() {
+		faces, err := parseAnalyzeJSON(`{"faces":[]}`)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(faces).To(BeEmpty())
+	})
+
+	It("returns an error on malformed JSON", func() {
+		_, err := parseAnalyzeJSON(`{not-json`)
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+// Pure-Go specs for materializeImage: plain base64, data: URI input, an
+// existing path used as-is, and rejection of undecodable input.
+var _ = Describe("materializeImage", func() {
+	It("decodes a base64 payload to a temp file", func() {
+		// The bytes only need to round-trip; no image decoding happens here.
+		payload := base64.StdEncoding.EncodeToString([]byte("\xff\xd8\xff\xe0fake-jpeg"))
+		path, cleanup, err := materializeImage(payload)
+		Expect(err).ToNot(HaveOccurred())
+		defer cleanup()
+		data, rerr := os.ReadFile(path)
+		Expect(rerr).ToNot(HaveOccurred())
+		Expect(data).To(Equal([]byte("\xff\xd8\xff\xe0fake-jpeg")))
+	})
+
+	It("strips a data: URI prefix before decoding", func() {
+		payload := "data:image/png;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
+		path, cleanup, err := materializeImage(payload)
+		Expect(err).ToNot(HaveOccurred())
+		defer cleanup()
+		data, rerr := os.ReadFile(path)
+		Expect(rerr).ToNot(HaveOccurred())
+		Expect(data).To(Equal([]byte("hello")))
+	})
+
+	It("uses an existing path as-is", func() {
+		tmp, err := os.CreateTemp("", "face-detect-fixture-*.bin")
+		Expect(err).ToNot(HaveOccurred())
+		defer func() { _ = os.Remove(tmp.Name()) }()
+		Expect(tmp.Close()).To(Succeed())
+
+		path, cleanup, err := materializeImage(tmp.Name())
+		Expect(err).ToNot(HaveOccurred())
+		defer cleanup()
+		Expect(path).To(Equal(tmp.Name()))
+	})
+
+	It("errors on input that is neither a path nor base64", func() {
+		_, _, err := materializeImage("not base64!!!")
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+// The specs below exercise the real C-API end to end. They run only when both a
+// model GGUF and a test image are provided, and skip cleanly otherwise so the
+// suite stays green without large assets.
+//
+// The container is Ordered, so BeforeAll loads the model once and the specs
+// share that single FaceDetect instance.
+var _ = Describe("FaceDetect end-to-end", Ordered, func() {
+	var (
+		f         *FaceDetect
+		modelPath = os.Getenv("FACEDETECT_BACKEND_TEST_MODEL")
+		imagePath = os.Getenv("FACEDETECT_BACKEND_TEST_IMAGE")
+	)
+
+	BeforeAll(func() {
+		if modelPath == "" || imagePath == "" {
+			Skip("set FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE to run the e2e specs")
+		}
+		if err := ensureLibLoaded(); err != nil {
+			Skip("libfacedetect.so not loadable: " + err.Error())
+		}
+		f = &FaceDetect{}
+		Expect(f.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
+	})
+
+	// The image is passed as a filesystem path; materializeImage accepts an
+	// existing path as-is, so no base64 encoding is needed in these specs.
+	It("embeds the primary face in an image", func() {
+		emb, err := f.Embeddings(&pb.PredictOptions{Images: []string{imagePath}})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(emb).ToNot(BeEmpty())
+	})
+
+	It("detects at least one face", func() {
+		resp, err := f.Detect(&pb.DetectOptions{Src: imagePath})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Detections).ToNot(BeEmpty())
+		Expect(resp.Detections[0].ClassName).To(Equal("face"))
+	})
+
+	It("verifies an image against itself as the same identity", func() {
+		resp, err := f.FaceVerify(&pb.FaceVerifyRequest{Img1: imagePath, Img2: imagePath})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Verified).To(BeTrue())
+		Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
+	})
+
+	It("analyzes age/gender for each face", func() {
+		resp, err := f.FaceAnalyze(&pb.FaceAnalyzeRequest{Img: imagePath})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(resp.Faces).ToNot(BeEmpty())
+	})
+})
--- a/backend/go/face-detect/main.go
+++ b/backend/go/face-detect/main.go
@@ -0,0 +1,65 @@
+package main
+
+// Started internally by LocalAI - one gRPC server per loaded model.
+//
+// Loads libfacedetect.so via purego and registers the flat C-API entry points
+// declared in facedetect_capi.h. The library name can be overridden with
+// FACEDETECT_LIBRARY (mirrors the VOICEDETECT_LIBRARY / PARAKEET_LIBRARY
+// convention in the sibling backends); the default looks for the .so next to
+// this binary (resolved via LD_LIBRARY_PATH by run.sh).
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+// LibFuncs pairs a Go function variable with the exported C symbol that
+// purego.RegisterLibFunc should bind it to.
+type LibFuncs struct {
+	FuncPtr any    // pointer to the Go func variable to fill in (e.g. &CppLoad)
+	Name    string // symbol name exported by libfacedetect.so
+}
+
+// main opens libfacedetect.so (or the library named by FACEDETECT_LIBRARY),
+// binds every C-API entry point, prints the library's ABI version to stderr
+// and then serves the FaceDetect backend over gRPC on the -addr address. A
+// dlopen failure or a server start failure panics.
+func main() {
+	libName := "libfacedetect.so"
+	if override := os.Getenv("FACEDETECT_LIBRARY"); override != "" {
+		libName = override
+	}
+
+	handle, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(fmt.Errorf("face-detect: dlopen %q: %w", libName, err))
+	}
+
+	// Bound 1:1 to facedetect_capi.h. char*/float* returns are registered as
+	// uintptr so the raw pointer can be freed via the matching capi free fn.
+	bindings := [...]LibFuncs{
+		{FuncPtr: &CppAbiVersion, Name: "facedetect_capi_abi_version"},
+		{FuncPtr: &CppLoad, Name: "facedetect_capi_load"},
+		{FuncPtr: &CppFree, Name: "facedetect_capi_free"},
+		{FuncPtr: &CppLastError, Name: "facedetect_capi_last_error"},
+		{FuncPtr: &CppFreeString, Name: "facedetect_capi_free_string"},
+		{FuncPtr: &CppFreeVec, Name: "facedetect_capi_free_vec"},
+		{FuncPtr: &CppEmbedPath, Name: "facedetect_capi_embed_path"},
+		{FuncPtr: &CppEmbedRGB, Name: "facedetect_capi_embed_rgb"},
+		{FuncPtr: &CppDetectJSON, Name: "facedetect_capi_detect_path_json"},
+		{FuncPtr: &CppVerifyPaths, Name: "facedetect_capi_verify_paths"},
+		{FuncPtr: &CppAnalyzeJSON, Name: "facedetect_capi_analyze_path_json"},
+	}
+	for i := range bindings {
+		purego.RegisterLibFunc(bindings[i].FuncPtr, handle, bindings[i].Name)
+	}
+
+	fmt.Fprintf(os.Stderr, "[face-detect] ABI=%d\n", CppAbiVersion())
+
+	flag.Parse()
+
+	err = grpc.StartServer(*addr, &FaceDetect{})
+	if err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/face-detect/options.go
+++ b/backend/go/face-detect/options.go
@@ -0,0 +1,47 @@
+package main
+
+import (
+	"strconv"
+	"strings"
+)
+
+// defaultVerifyThreshold is the cosine-distance cutoff used when a request does
+// not set one. Matches the insightface buffalo_l ArcFace R50 default the Python
+// face backend ships with so the two implementations agree on verdicts out of
+// the box.
+const defaultVerifyThreshold float32 = 0.35
+
+// loadOptions holds the parsed model-level options for face-detect.
+type loadOptions struct {
+	verifyThreshold float32 // set from the "verify_threshold" / "threshold" option; defaults to defaultVerifyThreshold
+	modelName       string  // set from the "model_name" option; empty when not given
+}
+
+// splitOption breaks a "key:value" option at its first colon and trims
+// whitespace from both halves. ok is false (with empty key and value) when the
+// option contains no colon.
+func splitOption(o string) (key, value string, ok bool) {
+	left, right, found := strings.Cut(o, ":")
+	if !found {
+		return "", "", false
+	}
+	key = strings.TrimSpace(left)
+	value = strings.TrimSpace(right)
+	return key, value, true
+}
+
+// parseOptions interprets the backend's "key:value" option slice. Entries
+// without a colon and unknown keys are skipped. Recognized keys:
+//
+//   - verify_threshold / threshold: applied only when the value parses as a
+//     float and is greater than zero; otherwise the current value is kept.
+//   - model_name: taken verbatim.
+//
+// Defaults: verify_threshold 0.35, model_name derived from the file.
+func parseOptions(opts []string) loadOptions {
+	parsed := loadOptions{verifyThreshold: defaultVerifyThreshold}
+	for _, raw := range opts {
+		key, value, ok := splitOption(raw)
+		if !ok {
+			continue
+		}
+
+		if key == "model_name" {
+			parsed.modelName = value
+			continue
+		}
+		if key != "verify_threshold" && key != "threshold" {
+			continue
+		}
+		threshold, err := strconv.ParseFloat(value, 32)
+		if err == nil && threshold > 0 {
+			parsed.verifyThreshold = float32(threshold)
+		}
+	}
+	return parsed
+}
--- a/backend/go/face-detect/package.sh
+++ b/backend/go/face-detect/package.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# Bundle the face-detect-grpc binary, libfacedetect.so, the core runtime libs
+# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
+# so the package is self-contained. Mirrors backend/go/voice-detect/package.sh;
+# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
+# is used instead of the host's.
+
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
+
+mkdir -p "$CURDIR/package/lib"
+
+cp -avf "$CURDIR/face-detect-grpc" "$CURDIR/package/"
+cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
+
+# libfacedetect.so + any soname symlinks. purego.Dlopen resolves it via
+# LD_LIBRARY_PATH, which run.sh points at lib/.
+cp -avf "$CURDIR"/libfacedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
+	echo "ERROR: libfacedetect.so not found in $CURDIR, run 'make' first" >&2
+	exit 1
+}
+
+# Detect architecture and copy the core runtime libs libfacedetect.so links
+# against, plus the matching dynamic loader as lib/ld.so.
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
+# BUILD_TYPE so the backend can reach the GPU without the runtime base image
+# shipping those drivers.
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
+ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/face-detect/run.sh
+++ b/backend/go/face-detect/run.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+# Only append the inherited value when it is non-empty: a trailing ':' would add
+# an empty entry, which the dynamic loader treats as the current directory.
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+
+# Prefer the packaged loader so the bundled libc / libstdc++ are used instead of
+# the host's (matches the voice-detect / whisper / parakeet backends' runtime
+# layout). Without lib/ld.so the binary is started directly.
+if [ -f "$CURDIR/lib/ld.so" ]; then
+	echo "Using lib/ld.so"
+	exec "$CURDIR/lib/ld.so" "$CURDIR/face-detect-grpc" "$@"
+fi
+exec "$CURDIR/face-detect-grpc" "$@"
--- a/backend/go/face-detect/test.sh
+++ b/backend/go/face-detect/test.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+cd "$CURDIR"
+
+echo "Running face-detect backend tests..."
+
+# Pure-Go parsing specs always run. The embed/detect/verify/analyze smoke specs
+# need FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE and
+# auto-skip without them. The library is looked up next to this script first.
+export LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}"
+go test -v -timeout 1200s .
+
+echo "face-detect tests completed."
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=8caa3f908ae6d4a4bef531e73b9a969f266a3d1f
+STABLEDIFFUSION_GGML_VERSION?=9956436c925a367daeab097598b1ea1f32d3503f

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/voice-detect/.gitignore
+++ b/backend/go/voice-detect/.gitignore
@@ -0,0 +1,18 @@
+# Fetched upstream sources
+sources/
+
+# CMake build directories
+build*/
+
+# build artifacts staged in-tree by the Makefile (cp from sources/) or
+# symlinked for local dev; the real sources live in voice-detect.cpp upstream.
+*.so
+*.so.*
+voicedetect_capi.h
+compile_commands.json
+
+# Compiled backend binary
+voice-detect-grpc
+
+# Packaging output
+package/
--- a/backend/go/voice-detect/Makefile
+++ b/backend/go/voice-detect/Makefile
@@ -0,0 +1,107 @@
+# voice-detect backend Makefile.
+#
+# Upstream pin lives below as VOICEDETECT_VERSION?=3d51077... (.github/bump_deps.sh
+# can find and update it - matches the parakeet.cpp / whisper.cpp / ds4 convention).
+#
+# Local dev shortcut: if you already have an out-of-tree voice-detect.cpp build,
+# symlink the .so + header into this directory and skip the clone/cmake steps:
+#
+#   ln -sf /path/to/voice-detect.cpp/build-shared/libvoicedetect.so .
+#   ln -sf /path/to/voice-detect.cpp/include/voicedetect_capi.h .
+#   go build -o voice-detect-grpc .
+#
+# The default target below does the proper clone-at-pin + cmake build so CI does
+# not need a side-checkout.
+
+VOICEDETECT_VERSION?=3d510772357538c5182808ac7de2278b84824e24
+VOICEDETECT_REPO?=https://github.com/mudler/voice-detect.cpp
+
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
+
+BUILD_TYPE?=
+NATIVE?=false
+
+# Target arch for the cuDNN gate below. The backend matrix / Docker build pass
+# TARGETARCH (amd64|arm64); a local build falls back to uname -m (aarch64|x86_64).
+RECON_ARCH?=$(or $(TARGETARCH),$(shell uname -m))
+
+# Link ggml statically (PIC) into libvoicedetect.so so the shared lib is
+# self-contained: dlopen needs no libggml*.so alongside it, only system libs
+# (libstdc++/libgomp/libc) that the runtime image already provides.
+CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DVOICEDETECT_SHARED=ON -DVOICEDETECT_BUILD_CLI=OFF -DVOICEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+# voice-detect.cpp gates its GGML backends behind VOICEDETECT_GGML_* options and
+# does set(GGML_CUDA ${VOICEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
+# -DGGML_CUDA=ON is overwritten back to OFF. Each BUILD_TYPE therefore forwards
+# the VOICEDETECT_GGML_* option. (openblas is not gated: -DGGML_BLAS passes through.)
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDA=ON
+	# Opt-in cuDNN implicit-GEMM conv path (avoids im2col on GPU, reaches
+	# torch-cuDNN parity). Only the arm64 + CUDA 13 image (GB10/Jetson/L4T)
+	# ships libcudnn9 + the -dev headers, so cuDNN is gated to that variant;
+	# x86 CUDA images carry no cuDNN -> enabling it there is a link failure.
+	ifeq ($(CUDA_MAJOR_VERSION),13)
+	ifneq (,$(filter arm64 aarch64,$(RECON_ARCH)))
+		CMAKE_ARGS+=-DVOICEDETECT_GGML_CUDNN=ON
+	endif
+	endif
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DVOICEDETECT_GGML_HIP=ON
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DVOICEDETECT_GGML_VULKAN=ON
+else ifeq ($(BUILD_TYPE),metal)
+	CMAKE_ARGS+=-DVOICEDETECT_GGML_METAL=ON
+endif
+
+.PHONY: voice-detect-grpc package build clean purge test all
+
+all: voice-detect-grpc # default goal: first regular target in this Makefile
+
+# Clone the upstream voice-detect.cpp source at the pinned commit. The directory
+# is the target, so make only re-clones when it is missing; a failed clone is
+# removed again. After a VOICEDETECT_VERSION bump, run 'make purge && make'.
+sources/voice-detect.cpp:
+	mkdir -p sources/voice-detect.cpp
+	( cd sources/voice-detect.cpp && \
+	git init -q && git remote add origin $(VOICEDETECT_REPO) && \
+	git fetch --depth 1 origin $(VOICEDETECT_VERSION) && \
+	git checkout FETCH_HEAD && \
+	git submodule update --init --recursive --depth 1 --single-branch ) || \
+	{ rm -rf sources/voice-detect.cpp; exit 1; }
+
+# Configure and build the voicedetect target out-of-tree, then stage the shared
+# lib + header next to the Go sources so purego.Dlopen("libvoicedetect.so") and
+# the cgo-less build both pick them up.
+libvoicedetect.so: sources/voice-detect.cpp
+	cmake -B sources/voice-detect.cpp/build-shared -S sources/voice-detect.cpp $(CMAKE_ARGS)
+	cmake --build sources/voice-detect.cpp/build-shared --config Release -j$(JOBS) --target voicedetect
+	cp -fv sources/voice-detect.cpp/build-shared/libvoicedetect.so* ./ 2>/dev/null || true
+	cp -fv sources/voice-detect.cpp/include/voicedetect_capi.h ./
+
+voice-detect-grpc: libvoicedetect.so main.go govoicedetect.go options.go
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o voice-detect-grpc .
+
+package: voice-detect-grpc # bundle the binary and libs via package.sh
+	bash package.sh
+
+build: package # alias for package
+
+# Test target. The embed/verify/analyze smoke specs are gated on
+# VOICEDETECT_BACKEND_TEST_MODEL + VOICEDETECT_BACKEND_TEST_WAV; without them the
+# heavy specs auto-skip and only the pure-Go parsing specs run.
+test:
+	LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
+
+clean: purge # also drops the fetched sources via purge
+	rm -rf libvoicedetect.so* voicedetect_capi.h package voice-detect-grpc
+
+purge: # remove only the fetched upstream checkout
+	rm -rf sources/voice-detect.cpp
--- a/backend/go/voice-detect/govoicedetect.go
+++ b/backend/go/voice-detect/govoicedetect.go
@@ -0,0 +1,273 @@
+package main
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/xlog"
+)
+
+// purego-bound entry points from libvoicedetect.so, registered in main() under
+// the voicedetect_capi_* symbol names from voicedetect_capi.h, so a
+// `nm libvoicedetect.so | grep voicedetect_capi` is enough to spot drift.
+//
+// The opaque ctx and the malloc'd char*/float* return values are declared as
+// uintptr so we get the raw pointer back and can release it via the matching
+// capi free function. purego's native string/[]float32 returns would copy and
+// forget the original pointer, leaking the C-owned buffer on every call.
+var (
+	CppAbiVersion  func() int32
+	CppLoad        func(ggufPath string) uintptr // 0 is treated as load failure by Load
+	CppFree        func(ctx uintptr)
+	CppLastError   func(ctx uintptr) string // returned as a Go string; never freed by this file
+	CppFreeString  func(s uintptr) // releases the char* returned by CppAnalyzeJSON
+	CppFreeVec     func(v uintptr) // releases the float* written by CppEmbedPath
+	CppEmbedPath   func(ctx uintptr, wavPath string, outVec, outDim unsafe.Pointer) int32
+	CppEmbedPCM    func(ctx uintptr, pcm []float32, nSamples, sampleRate int32, outVec, outDim unsafe.Pointer) int32
+	CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, outDistance, outVerified unsafe.Pointer) int32
+	CppAnalyzeJSON func(ctx uintptr, wavPath string) uintptr // 0 is treated as failure by VoiceAnalyze
+)
+
+// VoiceDetect implements the speaker-recognition voice subset of the Backend
+// gRPC service over libvoicedetect.so. The C side keeps a single loaded model
+// plus a per-ctx last-error buffer and is not reentrant, so base.SingleThread
+// serializes every call.
+type VoiceDetect struct {
+	base.SingleThread
+	opts   loadOptions // model-level options parsed in Load
+	ctxPtr uintptr     // C-API context returned by CppLoad; 0 until Load succeeds
+}
+
+// Load resolves the model path, parses the model options and loads the model
+// through the C-API. A relative ModelFile is joined onto ModelPath; an empty
+// ModelFile falls back to ModelPath as-is.
+func (v *VoiceDetect) Load(opts *pb.ModelOptions) error {
+	model := opts.ModelFile
+	if model == "" {
+		// Use ModelPath as-is; joining it onto itself would turn "dir" into "dir/dir".
+		model = opts.ModelPath
+	} else if !filepath.IsAbs(model) && opts.ModelPath != "" {
+		model = filepath.Join(opts.ModelPath, model)
+	}
+	if model == "" {
+		return errors.New("voice-detect: ModelFile is required")
+	}
+
+	v.opts = parseOptions(opts.Options)
+	if v.opts.modelName == "" {
+		v.opts.modelName = filepath.Base(model)
+	}
+
+	// Hand LocalAI's per-model thread budget to the engine, whose own
+	// min(hardware_concurrency, 8) default can oversubscribe cores. The engine
+	// reads VOICEDETECT_THREADS at backend construction, so set it before the
+	// capi load; a non-positive Threads leaves the engine default in place.
+	if threads := opts.Threads; threads > 0 {
+		if err := os.Setenv("VOICEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
+			return fmt.Errorf("voice-detect: set VOICEDETECT_THREADS: %w", err)
+		}
+		xlog.Info("voice-detect: applying LocalAI thread budget", "threads", threads)
+	}
+
+	xlog.Info("voice-detect: loading model", "model", model,
+		"verify_threshold", v.opts.verifyThreshold, "abi", CppAbiVersion())
+
+	ctx := CppLoad(model)
+	if ctx == 0 {
+		// No ctx came back, so there is no last-error buffer to read; report
+		// the path the operator tried to load instead.
+		return fmt.Errorf("voice-detect: voicedetect_capi_load failed for %q", model)
+	}
+	v.ctxPtr = ctx
+	return nil
+}
+
+// VoiceEmbed returns the L2-normalized speaker embedding for one audio clip.
+// req.Audio is a filesystem path: the HTTP layer materializes
+// base64/URL/data-URI inputs to a temp file before the gRPC call.
+func (v *VoiceDetect) VoiceEmbed(req *pb.VoiceEmbedRequest) (pb.VoiceEmbedResponse, error) {
+	switch {
+	case v.ctxPtr == 0:
+		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: model not loaded")
+	case req.Audio == "":
+		return pb.VoiceEmbedResponse{}, errors.New("voice-detect: audio path is required")
+	}
+	embedding, err := v.embedPath(req.Audio)
+	if err != nil {
+		return pb.VoiceEmbedResponse{}, err
+	}
+	return pb.VoiceEmbedResponse{Embedding: embedding, Model: v.opts.modelName}, nil
+}
+
+// embedPath embeds the clip at path and returns a Go-owned copy of the vector.
+// A vector returned by a successful call is freed even when dim is unusable.
+func (v *VoiceDetect) embedPath(path string) ([]float32, error) {
+	var vec uintptr
+	var dim int32
+	rc := CppEmbedPath(v.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
+	if rc == 0 && vec != 0 {
+		defer CppFreeVec(vec)
+	}
+	if rc != 0 || vec == 0 || dim <= 0 {
+		return nil, v.lastErr("embed", path)
+	}
+	src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
+	out := make([]float32, int(dim))
+	copy(out, src)
+	return out, nil
+}
+
+// VoiceVerify hands two clip paths to the C-API verify call and reports whether
+// they are the same speaker by cosine distance against a threshold. A request
+// threshold <= 0 falls back to the verify_threshold option (0.25 if unset).
+func (v *VoiceDetect) VoiceVerify(req *pb.VoiceVerifyRequest) (pb.VoiceVerifyResponse, error) {
+	switch {
+	case v.ctxPtr == 0:
+		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: model not loaded")
+	case req.Audio1 == "" || req.Audio2 == "":
+		return pb.VoiceVerifyResponse{}, errors.New("voice-detect: audio1 and audio2 are required")
+	}
+
+	cutoff := req.Threshold
+	if cutoff <= 0 {
+		cutoff = v.opts.verifyThreshold
+	}
+
+	var dist float32
+	var matched int32
+	begin := time.Now()
+	rc := CppVerifyPaths(v.ctxPtr, req.Audio1, req.Audio2, cutoff,
+		unsafe.Pointer(&dist), unsafe.Pointer(&matched))
+	if rc != 0 {
+		return pb.VoiceVerifyResponse{}, v.lastErr("verify", req.Audio1+","+req.Audio2)
+	}
+	tookMs := float32(time.Since(begin).Seconds() * 1000.0)
+
+	// Confidence falls linearly from 100 at distance 0 to 0 at the threshold
+	// (clamped to [0, 100]), matching the Python speaker-recognition backend.
+	var confidence float32
+	if cutoff > 0 {
+		confidence = float32(math.Max(0, math.Min(100, (1.0-float64(dist)/float64(cutoff))*100.0)))
+	}
+
+	return pb.VoiceVerifyResponse{
+		Verified:         matched != 0,
+		Distance:         dist,
+		Threshold:        cutoff,
+		Confidence:       confidence,
+		Model:            v.opts.modelName,
+		ProcessingTimeMs: tookMs,
+	}, nil
+}
+
+// VoiceAnalyze runs the age/gender/emotion heads on a single clip. The C-API
+// always evaluates every supported head, so the request's actions filter is
+// not consulted here, and the whole analysis comes back as one segment (the
+// engine does not produce time-bounded segments).
+func (v *VoiceDetect) VoiceAnalyze(req *pb.VoiceAnalyzeRequest) (pb.VoiceAnalyzeResponse, error) {
+	switch {
+	case v.ctxPtr == 0:
+		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: model not loaded")
+	case req.Audio == "":
+		return pb.VoiceAnalyzeResponse{}, errors.New("voice-detect: audio path is required")
+	}
+
+	raw := CppAnalyzeJSON(v.ctxPtr, req.Audio)
+	if raw == 0 {
+		return pb.VoiceAnalyzeResponse{}, v.lastErr("analyze", req.Audio)
+	}
+	defer CppFreeString(raw)
+
+	analysis, err := parseAnalyzeJSON(goStringFromCPtr(raw))
+	if err != nil {
+		return pb.VoiceAnalyzeResponse{}, fmt.Errorf("voice-detect: analyze JSON for %q: %w", req.Audio, err)
+	}
+	return pb.VoiceAnalyzeResponse{Segments: []*pb.VoiceAnalysis{analysis}}, nil
+}
+
+// analyzeJSON mirrors the document returned by voicedetect_capi_analyze_path_json:
+//
+//	{"age":42.0,
+//	 "gender":{"label":"female","female":0.88,"male":0.12},
+//	 "emotion":{"label":"neutral","scores":{"neutral":0.7, ...}}}
+//
+// gender is a mixed object (a "label" string plus per-class float scores), so
+// it is decoded into raw messages and split in parseAnalyzeJSON.
+type analyzeJSON struct {
+	Age     float32                    `json:"age"` // copied to VoiceAnalysis.Age
+	Gender  map[string]json.RawMessage `json:"gender"` // "label" plus per-class scores, decoded later
+	Emotion struct {
+		Label  string             `json:"label"` // becomes DominantEmotion
+		Scores map[string]float32 `json:"scores"` // becomes the Emotion score map
+	} `json:"emotion"`
+}
+
+// parseAnalyzeJSON decodes the engine's analyze document into a VoiceAnalysis.
+// Start/End are left at 0 because the model emits one whole-utterance result.
+// Gender entries that are not numbers are dropped; "label" feeds DominantGender.
+func parseAnalyzeJSON(doc string) (*pb.VoiceAnalysis, error) {
+	var parsed analyzeJSON
+	if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
+		return nil, err
+	}
+
+	result := &pb.VoiceAnalysis{
+		Age:             parsed.Age,
+		DominantEmotion: parsed.Emotion.Label,
+		Emotion:         parsed.Emotion.Scores,
+	}
+	if len(parsed.Gender) == 0 {
+		return result, nil
+	}
+
+	scores := make(map[string]float32, len(parsed.Gender))
+	for name, raw := range parsed.Gender {
+		if name == "label" {
+			_ = json.Unmarshal(raw, &result.DominantGender)
+			continue
+		}
+		var p float32
+		if json.Unmarshal(raw, &p) == nil {
+			scores[name] = p
+		}
+	}
+	result.Gender = scores
+	return result, nil
+}
+
+// lastErr builds a Go error for a failed op from the ctx's last-error text.
+func (v *VoiceDetect) lastErr(op, subject string) error {
+	detail := strings.TrimSpace(CppLastError(v.ctxPtr))
+	if detail != "" {
+		return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, detail)
+	}
+	return fmt.Errorf("voice-detect: %s failed for %q: %s", op, subject, "no error detail")
+}
+
+// goStringFromCPtr returns a Go copy of the NUL-terminated C string at cptr,
+// or "" for a null pointer. The buffer stays owned by the caller, who releases
+// it via CppFreeString once the copy has been made.
+//
+// vet's unsafeptr check flags the uintptr->Pointer conversion because it cannot
+// tell C heap memory from Go-managed memory; the GC neither tracks nor moves it.
+func goStringFromCPtr(cptr uintptr) string {
+	if cptr == 0 {
+		return ""
+	}
+	first := (*byte)(unsafe.Pointer(cptr)) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
+	size := 0
+	for cur := first; *cur != 0; cur = (*byte)(unsafe.Add(unsafe.Pointer(cur), 1)) {
+		size++
+	}
+	return string(unsafe.Slice(first, size))
+}
--- a/backend/go/voice-detect/govoicedetect_test.go
+++ b/backend/go/voice-detect/govoicedetect_test.go
@@ -0,0 +1,144 @@
+package main
+
+import (
+	"os"
+	"sync"
+	"testing"
+
+	"github.com/ebitengine/purego"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestVoiceDetect(t *testing.T) { // go test entry point for the Ginkgo suite
+	RegisterFailHandler(Fail) // route Gomega assertion failures to Ginkgo's Fail
+	RunSpecs(t, "voice-detect Backend Suite")
+}
+
+var (
+	libLoadErr  error
+	libLoadOnce sync.Once
+)
+
+// ensureLibLoaded performs main.go's dlopen + symbol binding once per test
+// process so specs can drive the C-API bridge without the gRPC server. The
+// Dlopen error, if any, is remembered and returned to every caller.
+func ensureLibLoaded() error {
+	libLoadOnce.Do(func() {
+		name := "libvoicedetect.so"
+		if override := os.Getenv("VOICEDETECT_LIBRARY"); override != "" {
+			name = override
+		}
+		handle, err := purego.Dlopen(name, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+		if err != nil {
+			libLoadErr = err
+			return
+		}
+		bind := func(fptr any, symbol string) { purego.RegisterLibFunc(fptr, handle, symbol) }
+		bind(&CppAbiVersion, "voicedetect_capi_abi_version")
+		bind(&CppLoad, "voicedetect_capi_load")
+		bind(&CppFree, "voicedetect_capi_free")
+		bind(&CppLastError, "voicedetect_capi_last_error")
+		bind(&CppFreeString, "voicedetect_capi_free_string")
+		bind(&CppFreeVec, "voicedetect_capi_free_vec")
+		bind(&CppEmbedPath, "voicedetect_capi_embed_path")
+		bind(&CppEmbedPCM, "voicedetect_capi_embed_pcm")
+		bind(&CppVerifyPaths, "voicedetect_capi_verify_paths")
+		bind(&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json")
+	})
+	return libLoadErr
+}
+
+var _ = Describe("parseOptions", func() {
+	It("defaults verify_threshold to 0.25", func() {
+		got := parseOptions(nil)
+		Expect(got.verifyThreshold).To(Equal(float32(0.25)))
+		Expect(got.modelName).To(Equal(""))
+	})
+	// Known keys are applied, the unknown one is ignored.
+	It("parses verify_threshold, threshold alias and model_name", func() {
+		full := parseOptions([]string{"verify_threshold:0.4", "model_name:ecapa", "unknown:x"})
+		Expect(full.verifyThreshold).To(Equal(float32(0.4)))
+		Expect(full.modelName).To(Equal("ecapa"))
+		// "threshold" is accepted as an alias of "verify_threshold".
+		viaAlias := parseOptions([]string{"threshold:0.3"})
+		Expect(viaAlias.verifyThreshold).To(Equal(float32(0.3)))
+	})
+	// Zero and negative values must not replace the default.
+	It("ignores non-positive thresholds and keeps the default", func() {
+		got := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
+		Expect(got.verifyThreshold).To(Equal(float32(0.25)))
+	})
+})
+
+var _ = Describe("parseAnalyzeJSON", func() {
+	It("maps age, gender label+scores and emotion label+scores", func() {
+		payload := `{"age":42.0,
+			"gender":{"label":"female","female":0.88,"male":0.12},
+			"emotion":{"label":"neutral","scores":{"neutral":0.7,"happy":0.2,"sad":0.1}}}`
+		got, err := parseAnalyzeJSON(payload)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(got.Age).To(BeNumerically("~", 42.0, 1e-4))
+		Expect(got.Start).To(Equal(float32(0)))
+		Expect(got.End).To(Equal(float32(0)))
+		// Gender: the label becomes DominantGender, the rest is the score map.
+		Expect(got.DominantGender).To(Equal("female"))
+		Expect(got.Gender).To(HaveKeyWithValue("female", BeNumerically("~", 0.88, 1e-4)))
+		Expect(got.Gender).To(HaveKeyWithValue("male", BeNumerically("~", 0.12, 1e-4)))
+		// "label" itself must not leak into the score map.
+		Expect(got.Gender).ToNot(HaveKey("label"))
+		// Emotion: label and scores are carried over as-is.
+		Expect(got.DominantEmotion).To(Equal("neutral"))
+		Expect(got.Emotion).To(HaveKeyWithValue("neutral", BeNumerically("~", 0.7, 1e-4)))
+		Expect(got.Emotion).To(HaveKeyWithValue("happy", BeNumerically("~", 0.2, 1e-4)))
+	})
+	// A document without "gender" still parses; DominantGender stays empty.
+	It("tolerates a missing gender block", func() {
+		got, err := parseAnalyzeJSON(`{"age":30.0,"emotion":{"label":"happy","scores":{"happy":1.0}}}`)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(got.DominantGender).To(Equal(""))
+		Expect(got.DominantEmotion).To(Equal("happy"))
+	})
+	// Input that is not JSON surfaces the decoder error.
+	It("returns an error on malformed JSON", func() {
+		_, err := parseAnalyzeJSON(`{not-json`)
+		Expect(err).To(HaveOccurred())
+	})
+})
+
+// End-to-end specs against the real C-API. They need both a model GGUF and a
+// test WAV (see the env vars below) and skip cleanly otherwise, so the suite
+// stays green without large assets.
+var _ = Describe("VoiceDetect end-to-end", Ordered, func() {
+	var (
+		backend  *VoiceDetect
+		ggufPath = os.Getenv("VOICEDETECT_BACKEND_TEST_MODEL")
+		clipPath = os.Getenv("VOICEDETECT_BACKEND_TEST_WAV")
+	)
+	// Skip the whole container when the assets or the library are missing.
+	BeforeAll(func() {
+		if ggufPath == "" || clipPath == "" {
+			Skip("set VOICEDETECT_BACKEND_TEST_MODEL and VOICEDETECT_BACKEND_TEST_WAV to run the e2e specs")
+		}
+		if loadErr := ensureLibLoaded(); loadErr != nil {
+			Skip("libvoicedetect.so not loadable: " + loadErr.Error())
+		}
+		backend = &VoiceDetect{}
+		Expect(backend.Load(&pb.ModelOptions{ModelFile: ggufPath})).To(Succeed())
+	})
+	// Embedding a clip yields a non-empty vector and a model name.
+	It("embeds an audio clip", func() {
+		out, err := backend.VoiceEmbed(&pb.VoiceEmbedRequest{Audio: clipPath})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(out.Embedding).ToNot(BeEmpty())
+		Expect(out.Model).ToNot(BeEmpty())
+	})
+	// Comparing a clip with itself must verify within the threshold.
+	It("verifies a clip against itself as the same speaker", func() {
+		out, err := backend.VoiceVerify(&pb.VoiceVerifyRequest{Audio1: clipPath, Audio2: clipPath})
+		Expect(err).ToNot(HaveOccurred())
+		Expect(out.Verified).To(BeTrue())
+		Expect(out.Distance).To(BeNumerically("<=", out.Threshold))
+	})
+})
--- a/backend/go/voice-detect/main.go
+++ b/backend/go/voice-detect/main.go
@@ -0,0 +1,64 @@
+package main
+
+// Started internally by LocalAI - one gRPC server per loaded model.
+//
+// Loads libvoicedetect.so via purego and registers the flat C-API entry points
+// declared in voicedetect_capi.h. The library name can be overridden with
+// VOICEDETECT_LIBRARY (mirrors the PARAKEET_LIBRARY / OMNIVOICE_LIBRARY
+// convention in the sibling backends); the default looks for the .so next to
+// this binary (resolved via LD_LIBRARY_PATH by run.sh).
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+type LibFuncs struct {
+	FuncPtr any    // pointer to the Go func variable handed to purego.RegisterLibFunc
+	Name    string // C symbol name looked up in the loaded library
+}
+
+// main dlopens the voice-detect shared library, binds every C-API entry point
+// and then serves the gRPC backend on -addr. Any failure panics.
+func main() {
+	libName := "libvoicedetect.so"
+	if env := os.Getenv("VOICEDETECT_LIBRARY"); env != "" {
+		libName = env
+	}
+
+	handle, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(fmt.Errorf("voice-detect: dlopen %q: %w", libName, err))
+	}
+
+	// One entry per voicedetect_capi.h symbol; pointer returns are bound as uintptr.
+	for _, binding := range []LibFuncs{
+		{&CppAbiVersion, "voicedetect_capi_abi_version"},
+		{&CppLoad, "voicedetect_capi_load"},
+		{&CppFree, "voicedetect_capi_free"},
+		{&CppLastError, "voicedetect_capi_last_error"},
+		{&CppFreeString, "voicedetect_capi_free_string"},
+		{&CppFreeVec, "voicedetect_capi_free_vec"},
+		{&CppEmbedPath, "voicedetect_capi_embed_path"},
+		{&CppEmbedPCM, "voicedetect_capi_embed_pcm"},
+		{&CppVerifyPaths, "voicedetect_capi_verify_paths"},
+		{&CppAnalyzeJSON, "voicedetect_capi_analyze_path_json"},
+	} {
+		purego.RegisterLibFunc(binding.FuncPtr, handle, binding.Name)
+	}
+
+	fmt.Fprintf(os.Stderr, "[voice-detect] ABI=%d\n", CppAbiVersion())
+
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &VoiceDetect{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/voice-detect/options.go
+++ b/backend/go/voice-detect/options.go
@@ -0,0 +1,46 @@
+package main
+
+import (
+	"strconv"
+	"strings"
+)
+
+// defaultVerifyThreshold is the cosine-distance cutoff applied when a request
+// leaves the threshold unset. It matches the Python speaker-recognition
+// backend's default, so both implementations agree on verdicts out of the box.
+const defaultVerifyThreshold = float32(0.25)
+
+// loadOptions is the model-level configuration parsed from the option slice.
+type loadOptions struct {
+	verifyThreshold float32 // fallback cutoff for requests with threshold <= 0
+	modelName       string  // model_name option; Load defaults it to the file's base name
+}
+
+// splitOption cuts o at its first ":" and trims both halves; ok is false when o has no colon.
+func splitOption(o string) (key, value string, ok bool) {
+	if k, v, found := strings.Cut(o, ":"); found {
+		return strings.TrimSpace(k), strings.TrimSpace(v), true
+	}
+	return "", "", false
+}
+
+// parseOptions folds the backend "key:value" option strings into a
+// loadOptions. Entries without a colon and unrecognised keys are skipped.
+// verifyThreshold starts at defaultVerifyThreshold (0.25); modelName stays
+// empty unless a model_name option is present.
+func parseOptions(opts []string) loadOptions {
+	parsed := loadOptions{verifyThreshold: defaultVerifyThreshold}
+	for _, entry := range opts {
+		name, raw, found := splitOption(entry)
+		isThreshold := name == "verify_threshold" || name == "threshold"
+		if found && isThreshold {
+			// Keep the current value unless this one parses to a positive number.
+			if t, err := strconv.ParseFloat(raw, 32); err == nil && t > 0 {
+				parsed.verifyThreshold = float32(t)
+			}
+		} else if found && name == "model_name" {
+			parsed.modelName = raw
+		}
+	}
+	return parsed
+}
--- a/backend/go/voice-detect/package.sh
+++ b/backend/go/voice-detect/package.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#
+# Bundle the voice-detect-grpc binary, libvoicedetect.so, the core runtime libs
+# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
+# so the package is self-contained. Mirrors backend/go/parakeet-cpp/package.sh;
+# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
+# is used instead of the host's.
+
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
+
+mkdir -p "$CURDIR/package/lib"
+
+cp -avf "$CURDIR/voice-detect-grpc" "$CURDIR/package/"
+cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
+
+# libvoicedetect.so + any soname symlinks. purego.Dlopen resolves it via
+# LD_LIBRARY_PATH, which run.sh points at lib/.
+cp -avf "$CURDIR"/libvoicedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
+	echo "ERROR: libvoicedetect.so not found in $CURDIR, run 'make' first" >&2
+	exit 1
+}
+
+# Detect architecture and copy the core runtime libs libvoicedetect.so links
+# against, plus the matching dynamic loader as lib/ld.so.
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
+# BUILD_TYPE so the backend can reach the GPU without the runtime base image
+# shipping those drivers.
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
+ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
--- a/backend/go/voice-detect/run.sh
+++ b/backend/go/voice-detect/run.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+# Put the packaged dirs first; append the caller's path only when it is non-empty,
+# because an empty entry (trailing ':') makes the loader search the current directory.
+export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+# If a self-contained ld.so was packaged, route through it so the packaged
+# libc / libstdc++ are used instead of the host's (matches the whisper /
+# parakeet backends' runtime layout).
+if [ -f "$CURDIR/lib/ld.so" ]; then
+	echo "Using lib/ld.so"
+	exec "$CURDIR/lib/ld.so" "$CURDIR/voice-detect-grpc" "$@"
+fi
+
+exec "$CURDIR/voice-detect-grpc" "$@"
--- a/backend/go/voice-detect/test.sh
+++ b/backend/go/voice-detect/test.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+# Runs the voice-detect Go test suite from the directory this script lives in.
+CURDIR=$(dirname "$(realpath "$0")")
+cd "$CURDIR"
+
+echo "Running voice-detect backend tests..."
+
+# The pure-Go parsing specs always run. The embed/verify/analyze smoke specs run
+# only when a model + WAV are provided via VOICEDETECT_BACKEND_TEST_MODEL and
+# VOICEDETECT_BACKEND_TEST_WAV; otherwise they auto-skip.
+LD_LIBRARY_PATH="$CURDIR:${LD_LIBRARY_PATH:-}" go test -v -timeout 1200s .
+
+echo "voice-detect tests completed."
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=43d78af5be58f41d6ffbc227d608f104577741ea
+WHISPER_CPP_VERSION?=0ae02cdb2c7317b50991367c165736ce42ed96ac
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/whisper/run.sh
+++ b/backend/go/whisper/run.sh
@@ -13,8 +13,14 @@ if [ "$(uname)" != "Darwin" ]; then
 fi

 if [ "$(uname)" = "Darwin" ]; then
-	# macOS: single dylib variant (Metal or Accelerate)
-	LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
+	# macOS: single fallback variant (Metal/Accelerate). The cmake build emits a
+	# Mach-O named .so, but tolerate .dylib too — pick whichever exists so the Go
+	# loader doesn't panic on a hardcoded name that isn't on disk.
+	if [ -e "$CURDIR/libgowhisper-fallback.dylib" ]; then
+		LIBRARY="$CURDIR/libgowhisper-fallback.dylib"
+	else
+		LIBRARY="$CURDIR/libgowhisper-fallback.so"
+	fi
 	export DYLD_LIBRARY_PATH="$CURDIR"/lib:$DYLD_LIBRARY_PATH
 else
 	LIBRARY="$CURDIR/libgowhisper-fallback.so"
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -243,6 +243,78 @@
    nvidia-cuda-12: "cuda12-ced"
    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-ced"
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ced"
+- &voicedetect
+  name: "voice-detect"
+  alias: "voice-detect"
+  license: mit
+  icon: https://avatars.githubusercontent.com/u/95302084
+  description: |
+    voice-detect speaker recognition and voice analysis.
+    voice-detect.cpp is a C++/ggml engine that produces L2-normalised
+    speaker embeddings (ECAPA-TDNN, WeSpeaker ResNet34, 3D-Speaker
+    ERes2Net, CAM++) for voice verification and 1:N identification, plus
+    a wav2vec2 age / gender / emotion analysis head. It replaces the
+    Python speaker-recognition backend and is exposed through the Voice*
+    gRPC rpcs and the /v1/voice/* REST endpoints. It runs on CPU, NVIDIA
+    CUDA, AMD ROCm/HIP, Intel SYCL, Vulkan, Apple Metal and NVIDIA Jetson (L4T) targets.
+  urls:
+    - https://github.com/mudler/voice-detect.cpp
+  tags:
+    - voice-recognition
+    - speaker-verification
+    - speaker-embedding
+    - CPU
+    - GPU
+    - CUDA
+    - HIP
+  capabilities:
+    default: "cpu-voice-detect"
+    nvidia: "cuda12-voice-detect"
+    intel: "intel-sycl-f16-voice-detect"
+    metal: "metal-voice-detect"
+    amd: "rocm-voice-detect"
+    vulkan: "vulkan-voice-detect"
+    nvidia-l4t: "nvidia-l4t-arm64-voice-detect"
+    nvidia-cuda-13: "cuda13-voice-detect"
+    nvidia-cuda-12: "cuda12-voice-detect"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect"
+- &facedetect
+  name: "face-detect"
+  alias: "face-detect"
+  license: mit
+  icon: https://avatars.githubusercontent.com/u/95302084
+  description: |
+    face-detect face detection, embedding, verification and analysis.
+    face-detect.cpp is a C++/ggml engine that runs SCRFD / YuNet face
+    detection and ArcFace / SFace 512-d (or 128-d) L2-normalised face
+    embeddings for verification and 1:N identification, plus a landmark /
+    age / gender analysis head. It replaces the Python insightface backend
+    and is exposed through the Embedding, Detect and Face* gRPC rpcs and
+    the /v1/face/* REST endpoints. It runs on CPU, NVIDIA CUDA, AMD
+    ROCm/HIP, Intel SYCL, Vulkan, Apple Metal and NVIDIA Jetson (L4T) targets.
+  urls:
+    - https://github.com/mudler/face-detect.cpp
+  tags:
+    - face-recognition
+    - face-verification
+    - face-embedding
+    - CPU
+    - GPU
+    - CUDA
+    - HIP
+  capabilities:
+    default: "cpu-face-detect"
+    nvidia: "cuda12-face-detect"
+    intel: "intel-sycl-f16-face-detect"
+    metal: "metal-face-detect"
+    amd: "rocm-face-detect"
+    vulkan: "vulkan-face-detect"
+    nvidia-l4t: "nvidia-l4t-arm64-face-detect"
+    nvidia-cuda-13: "cuda13-face-detect"
+    nvidia-cuda-12: "cuda12-face-detect"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect"
 - &voxtral
  name: "voxtral"
  alias: "voxtral"
@@ -1390,7 +1462,6 @@
    intel: "intel-fish-speech"
    amd: "rocm-fish-speech"
    nvidia-l4t: "nvidia-l4t-fish-speech"
-    metal: "metal-fish-speech"
    default: "cpu-fish-speech"
    nvidia-cuda-13: "cuda13-fish-speech"
    nvidia-cuda-12: "cuda12-fish-speech"
@@ -2890,6 +2961,236 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ced"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-13-ced
+## voice-detect
+- !!merge <<: *voicedetect
+  name: "voice-detect-development"
+  capabilities:
+    default: "cpu-voice-detect-development"
+    nvidia: "cuda12-voice-detect-development"
+    intel: "intel-sycl-f16-voice-detect-development"
+    metal: "metal-voice-detect-development"
+    amd: "rocm-voice-detect-development"
+    vulkan: "vulkan-voice-detect-development"
+    nvidia-l4t: "nvidia-l4t-arm64-voice-detect-development"
+    nvidia-cuda-13: "cuda13-voice-detect-development"
+    nvidia-cuda-12: "cuda12-voice-detect-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-voice-detect-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-voice-detect-development"
+- !!merge <<: *voicedetect
+  name: "nvidia-l4t-arm64-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-arm64-voice-detect
+- !!merge <<: *voicedetect
+  name: "nvidia-l4t-arm64-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-arm64-voice-detect
+- !!merge <<: *voicedetect
+  name: "cuda13-nvidia-l4t-arm64-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-voice-detect
+- !!merge <<: *voicedetect
+  name: "cuda13-nvidia-l4t-arm64-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-voice-detect
+- !!merge <<: *voicedetect
+  name: "cpu-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-cpu-voice-detect
+- !!merge <<: *voicedetect
+  name: "cpu-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-cpu-voice-detect
+- !!merge <<: *voicedetect
+  name: "metal-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-voice-detect
+- !!merge <<: *voicedetect
+  name: "metal-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-voice-detect
+- !!merge <<: *voicedetect
+  name: "cuda12-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-voice-detect
+- !!merge <<: *voicedetect
+  name: "cuda12-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-voice-detect
+- !!merge <<: *voicedetect
+  name: "rocm-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-voice-detect
+- !!merge <<: *voicedetect
+  name: "rocm-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-rocm-hipblas-voice-detect
+- !!merge <<: *voicedetect
+  name: "intel-sycl-f32-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f32-voice-detect
+- !!merge <<: *voicedetect
+  name: "intel-sycl-f32-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f32-voice-detect
+- !!merge <<: *voicedetect
+  name: "intel-sycl-f16-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f16-voice-detect
+- !!merge <<: *voicedetect
+  name: "intel-sycl-f16-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f16-voice-detect
+- !!merge <<: *voicedetect
+  name: "vulkan-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-vulkan-voice-detect
+- !!merge <<: *voicedetect
+  name: "vulkan-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-vulkan-voice-detect
+- !!merge <<: *voicedetect
+  name: "cuda13-voice-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-voice-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-voice-detect
+- !!merge <<: *voicedetect
+  name: "cuda13-voice-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-voice-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-voice-detect
+## face-detect
+- !!merge <<: *facedetect
+  name: "face-detect-development"
+  capabilities:
+    default: "cpu-face-detect-development"
+    nvidia: "cuda12-face-detect-development"
+    intel: "intel-sycl-f16-face-detect-development"
+    metal: "metal-face-detect-development"
+    amd: "rocm-face-detect-development"
+    vulkan: "vulkan-face-detect-development"
+    nvidia-l4t: "nvidia-l4t-arm64-face-detect-development"
+    nvidia-cuda-13: "cuda13-face-detect-development"
+    nvidia-cuda-12: "cuda12-face-detect-development"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-face-detect-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-face-detect-development"
+- !!merge <<: *facedetect
+  name: "nvidia-l4t-arm64-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-arm64-face-detect
+- !!merge <<: *facedetect
+  name: "nvidia-l4t-arm64-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-face-detect"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-arm64-face-detect
+- !!merge <<: *facedetect
+  name: "cuda13-nvidia-l4t-arm64-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-face-detect
+- !!merge <<: *facedetect
+  name: "cuda13-nvidia-l4t-arm64-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-face-detect
+- !!merge <<: *facedetect
+  name: "cpu-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-cpu-face-detect
+- !!merge <<: *facedetect
+  name: "cpu-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-face-detect"
+  mirrors:
+    - localai/localai-backends:master-cpu-face-detect
+- !!merge <<: *facedetect
+  name: "metal-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-metal-darwin-arm64-face-detect
+- !!merge <<: *facedetect
+  name: "metal-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-face-detect"
+  mirrors:
+    - localai/localai-backends:master-metal-darwin-arm64-face-detect
+- !!merge <<: *facedetect
+  name: "cuda12-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-12-face-detect
+- !!merge <<: *facedetect
+  name: "cuda12-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-face-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-12-face-detect
+- !!merge <<: *facedetect
+  name: "rocm-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-rocm-hipblas-face-detect
+- !!merge <<: *facedetect
+  name: "rocm-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-face-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-rocm-hipblas-face-detect
+- !!merge <<: *facedetect
+  name: "intel-sycl-f32-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f32-face-detect
+- !!merge <<: *facedetect
+  name: "intel-sycl-f32-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-face-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f32-face-detect
+- !!merge <<: *facedetect
+  name: "intel-sycl-f16-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-intel-sycl-f16-face-detect
+- !!merge <<: *facedetect
+  name: "intel-sycl-f16-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-face-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-intel-sycl-f16-face-detect
+- !!merge <<: *facedetect
+  name: "vulkan-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-vulkan-face-detect
+- !!merge <<: *facedetect
+  name: "vulkan-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-face-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-vulkan-face-detect
+- !!merge <<: *facedetect
+  name: "cuda13-face-detect"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-face-detect"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-face-detect
+- !!merge <<: *facedetect
+  name: "cuda13-face-detect-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-face-detect"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-face-detect
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
@@ -4932,7 +5233,6 @@
    intel: "intel-fish-speech-development"
    amd: "rocm-fish-speech-development"
    nvidia-l4t: "nvidia-l4t-fish-speech-development"
-    metal: "metal-fish-speech-development"
    default: "cpu-fish-speech-development"
    nvidia-cuda-13: "cuda13-fish-speech-development"
    nvidia-cuda-12: "cuda12-fish-speech-development"
@@ -5008,16 +5308,6 @@
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-fish-speech
- !!merge <<: *fish-speech
-  name: "metal-fish-speech"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-fish-speech"
-  mirrors:
-    - localai/localai-backends:latest-metal-darwin-arm64-fish-speech
- !!merge <<: *fish-speech
-  name: "metal-fish-speech-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-fish-speech"
-  mirrors:
-    - localai/localai-backends:master-metal-darwin-arm64-fish-speech
 ## faster-qwen3-tts
 - !!merge <<: *faster-qwen3-tts
  name: "faster-qwen3-tts-development"
--- a/backend/python/fish-speech/requirements-mps.txt
+++ b/backend/python/fish-speech/requirements-mps.txt
@@ -1,2 +0,0 @@
-torch
-torchaudio