From 0258f8af555e806f11b30ea0984cdd4afbbc3403 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 27 Jun 2026 09:42:22 +0200 Subject: [PATCH] fix(backends): repair release CI build/test breaks (kokoros, fish-speech, llama-cpp-quantization, sglang) (#10547) * fix(kokoros): implement new Backend RPCs to fix the build The backend.proto grew five RPCs (SoundDetection, Depth, TokenClassify, Score and the bidi-streaming Forward) that the kokoros gRPC service never implemented, so the trait impl no longer satisfies `Backend`: error[E0046]: not all trait items implemented, missing: `sound_detection`, `depth`, `token_classify`, `score`, `ForwardStream`, `forward` kokoros is a TTS backend with no use for these, so add `unimplemented` stubs (plus the `ForwardStream` associated type) matching the existing pattern for every other unsupported RPC in this file. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] * fix(fish-speech): add setuptools-rust for the editable source install install.sh installs the fish-speech source tree editable with `--no-build-isolation`, which means the build backends of its transitive dependencies must already be present in the venv. One of them builds a Rust extension and its metadata step fails with: ModuleNotFoundError: No module named 'setuptools_rust' Add setuptools-rust to requirements.txt so installRequirements provisions it before the editable install runs. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] * fix(llama-cpp-quantization): vendor convert_hf_to_gguf.py with conversion/ Upstream llama.cpp split the model-specific logic out of the single convert_hf_to_gguf.py file into a sibling `conversion/` package, so the script now starts with `from conversion import ...`. 
Downloading just the one file therefore fails at runtime with: ModuleNotFoundError: No module named 'conversion' Clone the repo (reusing the clone already needed to build llama-quantize) and copy both the script and the `conversion/` package into the backend dir. Python puts the script's own directory on sys.path[0], so the package resolves when it sits beside the script. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] * fix(sglang): pin the CPU source build to sglang v0.5.11 The CPU profile builds sgl-kernel from a `git clone` of sglang with no ref, so it always tracks master. Recent master added CPU kernels (e.g. mamba/fla.cpp) that fail to compile in our builder: constexpr variable 'scale' must be initialized by a constant expression; static library kineto_LIBRARY-NOTFOUND not found Pin the clone to v0.5.11, the same release the GPU path already floors on (requirements-cublas12-after.txt). Overridable via SGLANG_VERSION so the pin can be bumped deliberately. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --------- Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- backend/python/fish-speech/requirements.txt | 4 ++ .../python/llama-cpp-quantization/install.sh | 36 ++++++++++++------ backend/python/sglang/install.sh | 8 +++- backend/rust/kokoros/src/service.rs | 37 +++++++++++++++++++ 4 files changed, 72 insertions(+), 13 deletions(-) diff --git a/backend/python/fish-speech/requirements.txt b/backend/python/fish-speech/requirements.txt index 1be3c8250..528abf737 100644 --- a/backend/python/fish-speech/requirements.txt +++ b/backend/python/fish-speech/requirements.txt @@ -7,3 +7,7 @@ setuptools six scipy numpy +# fish-speech is installed editable with --no-build-isolation, so the build +# backends of its transitive deps must already be in the venv. One of them +# builds a Rust extension and needs setuptools-rust present at metadata time. 
+setuptools-rust diff --git a/backend/python/llama-cpp-quantization/install.sh b/backend/python/llama-cpp-quantization/install.sh index 05ac24f70..a9001ffaa 100755 --- a/backend/python/llama-cpp-quantization/install.sh +++ b/backend/python/llama-cpp-quantization/install.sh @@ -11,14 +11,31 @@ fi EXTRA_PIP_INSTALL_FLAGS+=" --upgrade " installRequirements -# Fetch convert_hf_to_gguf.py from llama.cpp +# Fetch convert_hf_to_gguf.py from llama.cpp. +# Upstream split the model-specific logic out of the single file into a +# sibling `conversion/` package (convert_hf_to_gguf.py now does +# `from conversion import ...`), so a single-file download no longer runs — +# it fails with `ModuleNotFoundError: No module named 'conversion'`. We clone +# the repo and copy both the script and the package; Python puts the script's +# own directory on sys.path[0], so the package resolves when placed beside it. LLAMA_CPP_CONVERT_VERSION="${LLAMA_CPP_CONVERT_VERSION:-master}" +LLAMA_CPP_SRC="${EDIR}/llama.cpp" CONVERT_SCRIPT="${EDIR}/convert_hf_to_gguf.py" -if [ ! -f "${CONVERT_SCRIPT}" ]; then - echo "Downloading convert_hf_to_gguf.py from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..." - curl -L --fail --retry 3 \ - "https://raw.githubusercontent.com/ggml-org/llama.cpp/${LLAMA_CPP_CONVERT_VERSION}/convert_hf_to_gguf.py" \ - -o "${CONVERT_SCRIPT}" || echo "Warning: Failed to download convert_hf_to_gguf.py." + +cloneLlamaCpp() { + if [ ! -d "${LLAMA_CPP_SRC}/.git" ]; then + git clone --depth 1 --branch "${LLAMA_CPP_CONVERT_VERSION}" \ + https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" 2>/dev/null || \ + git clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" + fi +} + +if [ ! -f "${CONVERT_SCRIPT}" ] || [ ! -d "${EDIR}/conversion" ]; then + echo "Fetching convert_hf_to_gguf.py + conversion/ from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..." 
+ cloneLlamaCpp + cp "${LLAMA_CPP_SRC}/convert_hf_to_gguf.py" "${CONVERT_SCRIPT}" + rm -rf "${EDIR}/conversion" + cp -r "${LLAMA_CPP_SRC}/conversion" "${EDIR}/conversion" fi # Install gguf package from the same llama.cpp commit to keep them in sync @@ -41,12 +58,7 @@ QUANTIZE_BIN="${EDIR}/llama-quantize" if [ ! -x "${QUANTIZE_BIN}" ] && ! command -v llama-quantize &>/dev/null; then if command -v cmake &>/dev/null; then echo "Building llama-quantize from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..." - LLAMA_CPP_SRC="${EDIR}/llama.cpp" - if [ ! -d "${LLAMA_CPP_SRC}" ]; then - git clone --depth 1 --branch "${LLAMA_CPP_CONVERT_VERSION}" \ - https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" 2>/dev/null || \ - git clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" - fi + cloneLlamaCpp # reuses the clone fetched for convert_hf_to_gguf.py cmake -B "${LLAMA_CPP_SRC}/build" -S "${LLAMA_CPP_SRC}" -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF cmake --build "${LLAMA_CPP_SRC}/build" --target llama-quantize -j"$(nproc 2>/dev/null || echo 2)" cp "${LLAMA_CPP_SRC}/build/bin/llama-quantize" "${QUANTIZE_BIN}" diff --git a/backend/python/sglang/install.sh b/backend/python/sglang/install.sh index 928f7bd11..68812f8a7 100755 --- a/backend/python/sglang/install.sh +++ b/backend/python/sglang/install.sh @@ -85,9 +85,15 @@ if [ "x${BUILD_TYPE}" == "x" ] || [ "x${FROM_SOURCE:-}" == "xtrue" ]; then # The resulting binary still requires an AVX-512 capable CPU at runtime, # same constraint sglang upstream documents in docker/xeon.Dockerfile. + # Pin the source build to the same release the GPU path floors on + # (0.5.11, see requirements-cublas12-after.txt). An unpinned master clone + # pulls in newer CPU kernels (e.g. mamba/fla.cpp) that fail to compile + # (constexpr non-constant + kineto_LIBRARY-NOTFOUND). Bump deliberately. 
+ SGLANG_VERSION="${SGLANG_VERSION:-v0.5.11}" _sgl_src=$(mktemp -d) trap 'rm -rf "${_sgl_src}"' EXIT - git clone --depth 1 https://github.com/sgl-project/sglang "${_sgl_src}/sglang" + git clone --depth 1 --branch "${SGLANG_VERSION}" \ + https://github.com/sgl-project/sglang "${_sgl_src}/sglang" # Patch -march=native → -march=sapphirerapids in the CPU kernel CMakeLists sed -i 's/-march=native/-march=sapphirerapids/g' \ diff --git a/backend/rust/kokoros/src/service.rs b/backend/rust/kokoros/src/service.rs index b980feb52..ef361b9dc 100644 --- a/backend/rust/kokoros/src/service.rs +++ b/backend/rust/kokoros/src/service.rs @@ -570,6 +570,43 @@ impl Backend for KokorosService { ) -> Result, Status> { Err(Status::unimplemented("Not supported")) } + + async fn sound_detection( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + async fn depth( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + async fn token_classify( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + async fn score( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + type ForwardStream = ReceiverStream>; + + async fn forward( + &self, + _: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } } #[cfg(test)]