From 0258f8af555e806f11b30ea0984cdd4afbbc3403 Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 27 Jun 2026 09:42:22 +0200
Subject: [PATCH] fix(backends): repair release CI build/test breaks (kokoros,
 fish-speech, llama-cpp-quantization, sglang) (#10547)

* fix(kokoros): implement new Backend RPCs to fix the build

The backend.proto grew five RPCs (SoundDetection, Depth, TokenClassify,
Score and the bidi-streaming Forward) that the kokoros gRPC service never
implemented, so the trait impl no longer satisfies `Backend` (the sixth
missing item below is the `ForwardStream` associated type that Forward
requires):

    error[E0046]: not all trait items implemented, missing:
      `sound_detection`, `depth`, `token_classify`, `score`,
      `ForwardStream`, `forward`

kokoros is a TTS backend with no use for these, so add `unimplemented`
stubs (plus the `ForwardStream` associated type) matching the existing
pattern for every other unsupported RPC in this file.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(fish-speech): add setuptools-rust for the editable source install

install.sh installs the fish-speech source tree editable with
`--no-build-isolation`, which means the build backends of its transitive
dependencies must already be present in the venv. One of them builds a
Rust extension and its metadata step fails with:

    ModuleNotFoundError: No module named 'setuptools_rust'

Add setuptools-rust to requirements.txt so installRequirements provisions
it before the editable install runs.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(llama-cpp-quantization): vendor convert_hf_to_gguf.py with conversion/

Upstream llama.cpp split the model-specific logic out of the single
convert_hf_to_gguf.py file into a sibling `conversion/` package, so the
script now starts with `from conversion import ...`. Downloading just the
one file therefore fails at runtime with:

    ModuleNotFoundError: No module named 'conversion'

Clone the repo (reusing the clone already needed to build llama-quantize)
and copy both the script and the `conversion/` package into the backend
dir. Python sets sys.path[0] to the script's own directory, so the package
resolves when it sits beside the script.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

* fix(sglang): pin the CPU source build to sglang v0.5.11

The CPU profile builds sgl-kernel from a `git clone` of sglang with no
ref, so it always tracks the default branch (`main`). Recent `main` added
CPU kernels (e.g. mamba/fla.cpp) that fail to compile in our builder:

    constexpr variable 'scale' must be initialized by a constant
    static library kineto_LIBRARY-NOTFOUND not found

Pin the clone to v0.5.11, the same release the GPU path already floors on
(requirements-cublas12-after.txt). Overridable via SGLANG_VERSION so the
pin can be bumped deliberately.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/python/fish-speech/requirements.txt   |  4 ++
 .../python/llama-cpp-quantization/install.sh  | 36 ++++++++++++------
 backend/python/sglang/install.sh              |  8 +++-
 backend/rust/kokoros/src/service.rs           | 37 +++++++++++++++++++
 4 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/backend/python/fish-speech/requirements.txt b/backend/python/fish-speech/requirements.txt
index 1be3c8250..528abf737 100644
--- a/backend/python/fish-speech/requirements.txt
+++ b/backend/python/fish-speech/requirements.txt
@@ -7,3 +7,7 @@ setuptools
 six
 scipy
 numpy
+# fish-speech is installed editable with --no-build-isolation, so the build
+# backends of its transitive deps must already be in the venv. One of them
+# builds a Rust extension and needs setuptools-rust present at metadata time.
+setuptools-rust
diff --git a/backend/python/llama-cpp-quantization/install.sh b/backend/python/llama-cpp-quantization/install.sh
index 05ac24f70..a9001ffaa 100755
--- a/backend/python/llama-cpp-quantization/install.sh
+++ b/backend/python/llama-cpp-quantization/install.sh
@@ -11,14 +11,31 @@ fi
 EXTRA_PIP_INSTALL_FLAGS+=" --upgrade "
 installRequirements
 
-# Fetch convert_hf_to_gguf.py from llama.cpp
+# Fetch convert_hf_to_gguf.py from llama.cpp.
+# Upstream split the model-specific logic out of the single file into a
+# sibling `conversion/` package (convert_hf_to_gguf.py now does
+# `from conversion import ...`), so a single-file download no longer runs —
+# it fails with `ModuleNotFoundError: No module named 'conversion'`. We clone
+# the repo and copy both the script and the package; Python puts the script's
+# own directory on sys.path[0], so the package resolves when placed beside it.
 LLAMA_CPP_CONVERT_VERSION="${LLAMA_CPP_CONVERT_VERSION:-master}"
+LLAMA_CPP_SRC="${EDIR}/llama.cpp"
 CONVERT_SCRIPT="${EDIR}/convert_hf_to_gguf.py"
-if [ ! -f "${CONVERT_SCRIPT}" ]; then
-    echo "Downloading convert_hf_to_gguf.py from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..."
-    curl -L --fail --retry 3 \
-        "https://raw.githubusercontent.com/ggml-org/llama.cpp/${LLAMA_CPP_CONVERT_VERSION}/convert_hf_to_gguf.py" \
-        -o "${CONVERT_SCRIPT}" || echo "Warning: Failed to download convert_hf_to_gguf.py."
+
+cloneLlamaCpp() {
+    if [ ! -d "${LLAMA_CPP_SRC}/.git" ]; then
+        git clone --depth 1 --branch "${LLAMA_CPP_CONVERT_VERSION}" \
+            https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" 2>/dev/null || \
+        git clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}"
+    fi
+}
+
+if [ ! -f "${CONVERT_SCRIPT}" ] || [ ! -d "${EDIR}/conversion" ]; then
+    echo "Fetching convert_hf_to_gguf.py + conversion/ from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..."
+    cloneLlamaCpp
+    cp "${LLAMA_CPP_SRC}/convert_hf_to_gguf.py" "${CONVERT_SCRIPT}"
+    rm -rf "${EDIR}/conversion"
+    cp -r "${LLAMA_CPP_SRC}/conversion" "${EDIR}/conversion"
 fi
 
 # Install gguf package from the same llama.cpp commit to keep them in sync
@@ -41,12 +58,7 @@ QUANTIZE_BIN="${EDIR}/llama-quantize"
 if [ ! -x "${QUANTIZE_BIN}" ] && ! command -v llama-quantize &>/dev/null; then
     if command -v cmake &>/dev/null; then
         echo "Building llama-quantize from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..."
-        LLAMA_CPP_SRC="${EDIR}/llama.cpp"
-        if [ ! -d "${LLAMA_CPP_SRC}" ]; then
-            git clone --depth 1 --branch "${LLAMA_CPP_CONVERT_VERSION}" \
-                https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" 2>/dev/null || \
-            git clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}"
-        fi
+        cloneLlamaCpp  # reuses the clone fetched for convert_hf_to_gguf.py
         cmake -B "${LLAMA_CPP_SRC}/build" -S "${LLAMA_CPP_SRC}" -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF
         cmake --build "${LLAMA_CPP_SRC}/build" --target llama-quantize -j"$(nproc 2>/dev/null || echo 2)"
         cp "${LLAMA_CPP_SRC}/build/bin/llama-quantize" "${QUANTIZE_BIN}"
diff --git a/backend/python/sglang/install.sh b/backend/python/sglang/install.sh
index 928f7bd11..68812f8a7 100755
--- a/backend/python/sglang/install.sh
+++ b/backend/python/sglang/install.sh
@@ -85,9 +85,15 @@ if [ "x${BUILD_TYPE}" == "x" ] || [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
     # The resulting binary still requires an AVX-512 capable CPU at runtime,
     # same constraint sglang upstream documents in docker/xeon.Dockerfile.
 
+    # Pin the source build to the same release the GPU path floors on
+    # (0.5.11, see requirements-cublas12-after.txt). An unpinned clone tracks
+    # upstream main, whose newer CPU kernels (e.g. mamba/fla.cpp) fail to
+    # compile (non-constant constexpr, kineto_LIBRARY-NOTFOUND). Bump via SGLANG_VERSION.
+    SGLANG_VERSION="${SGLANG_VERSION:-v0.5.11}"
     _sgl_src=$(mktemp -d)
     trap 'rm -rf "${_sgl_src}"' EXIT
-    git clone --depth 1 https://github.com/sgl-project/sglang "${_sgl_src}/sglang"
+    git clone --depth 1 --branch "${SGLANG_VERSION}" \
+        https://github.com/sgl-project/sglang "${_sgl_src}/sglang"
 
     # Patch -march=native → -march=sapphirerapids in the CPU kernel CMakeLists
     sed -i 's/-march=native/-march=sapphirerapids/g' \
diff --git a/backend/rust/kokoros/src/service.rs b/backend/rust/kokoros/src/service.rs
index b980feb52..ef361b9dc 100644
--- a/backend/rust/kokoros/src/service.rs
+++ b/backend/rust/kokoros/src/service.rs
@@ -570,6 +570,43 @@ impl Backend for KokorosService {
     ) -> Result<Response<backend::Result>, Status> {
         Err(Status::unimplemented("Not supported"))
     }
+
+    async fn sound_detection(
+        &self,
+        _: Request<backend::SoundDetectionRequest>,
+    ) -> Result<Response<backend::SoundDetectionResponse>, Status> {
+        Err(Status::unimplemented("Not supported"))
+    }
+
+    async fn depth(
+        &self,
+        _: Request<backend::DepthRequest>,
+    ) -> Result<Response<backend::DepthResponse>, Status> {
+        Err(Status::unimplemented("Not supported"))
+    }
+
+    async fn token_classify(
+        &self,
+        _: Request<backend::TokenClassifyRequest>,
+    ) -> Result<Response<backend::TokenClassifyResponse>, Status> {
+        Err(Status::unimplemented("Not supported"))
+    }
+
+    async fn score(
+        &self,
+        _: Request<backend::ScoreRequest>,
+    ) -> Result<Response<backend::ScoreResponse>, Status> {
+        Err(Status::unimplemented("Not supported"))
+    }
+
+    type ForwardStream = ReceiverStream<Result<backend::ForwardReply, Status>>;
+
+    async fn forward(
+        &self,
+        _: Request<tonic::Streaming<backend::ForwardRequest>>,
+    ) -> Result<Response<Self::ForwardStream>, Status> {
+        Err(Status::unimplemented("Not supported"))
+    }
 }
 
 #[cfg(test)]