From 78d682224a1b74874d4ae1bee0cbdff82cc47b30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=95=AA=E8=8C=84=E6=91=94=E6=88=90=E7=95=AA=E8=8C=84?= =?UTF-8?q?=E9=85=B1?= <68098251+fqscfqj@users.noreply.github.com> Date: Fri, 19 Jun 2026 20:59:50 +0800 Subject: [PATCH 01/99] fix(grpc): forward word-level timestamps in AudioTranscription wrapper (#10402) The gRPC server wrapper in pkg/grpc/server.go reconstructs TranscriptSegment messages when relaying AudioTranscription results from backends. The Words field was not being copied, causing all word-level timestamps to be silently dropped regardless of backend support. This was introduced when PR #9621 added the TranscriptWord proto message and transcriptResultFromProto (client-side), but did not update the server-side gRPC relay to forward the new field. Fixes #9306 Signed-off-by: fqscfqj --- pkg/grpc/server.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/grpc/server.go b/pkg/grpc/server.go index 6ddb521ba..35afb502c 100644 --- a/pkg/grpc/server.go +++ b/pkg/grpc/server.go @@ -243,6 +243,14 @@ func (s *server) AudioTranscription(ctx context.Context, in *pb.TranscriptReques for _, t := range s.Tokens { tks = append(tks, int32(t)) } + words := make([]*pb.TranscriptWord, 0, len(s.Words)) + for _, w := range s.Words { + words = append(words, &pb.TranscriptWord{ + Start: int64(w.Start), + End: int64(w.End), + Text: w.Text, + }) + } tresult.Segments = append(tresult.Segments, &pb.TranscriptSegment{ Text: s.Text, @@ -251,6 +259,7 @@ func (s *server) AudioTranscription(ctx context.Context, in *pb.TranscriptReques End: int64(s.End), Tokens: tks, Speaker: s.Speaker, + Words: words, }) } From 59c7ad51537f37a46ed1bc55a5cc50d55846d5c5 Mon Sep 17 00:00:00 2001 From: Souheab <85948717+Souheab@users.noreply.github.com> Date: Fri, 19 Jun 2026 11:15:18 -0400 Subject: [PATCH 02/99] fix(nix flake): ensure nix flake builds successfully (#10399) * Use inference defaults in repo src rather than fetching there are 
inference_defaults.json already in the repo so we can use those, they are regularly updated with github actions, and we avoid hash mismatch errors in the flake this way Signed-off-by: Souheab * Update vendor hash Signed-off-by: Souheab * Create react-ui derivation as it is required for go build Signed-off-by: Souheab * Add FHS env wrapper to make #!/bin/bash scripts work Signed-off-by: Souheab * use pkgs.importNpmLock to deal with npm dependencies instead of using npmDepsHash Signed-off-by: Souheab --------- Signed-off-by: Souheab --- flake.lock | 13 ------------- flake.nix | 48 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/flake.lock b/flake.lock index d67f05416..25b0fc536 100644 --- a/flake.lock +++ b/flake.lock @@ -1,17 +1,5 @@ { "nodes": { - "inference-defaults": { - "flake": false, - "locked": { - "narHash": "sha256-ygWIkY2xiUEWqAZQM4/0vBz8vWd/RKX5VBj7EHovU14=", - "type": "file", - "url": "https://raw.githubusercontent.com/unslothai/unsloth/main/studio/backend/assets/configs/inference_defaults.json" - }, - "original": { - "type": "file", - "url": "https://raw.githubusercontent.com/unslothai/unsloth/main/studio/backend/assets/configs/inference_defaults.json" - } - }, "nixpkgs": { "locked": { "lastModified": 1777578337, @@ -30,7 +18,6 @@ }, "root": { "inputs": { - "inference-defaults": "inference-defaults", "nixpkgs": "nixpkgs" } } diff --git a/flake.nix b/flake.nix index 2bbfd5c83..89691c716 100644 --- a/flake.nix +++ b/flake.nix @@ -4,24 +4,36 @@ inputs = { nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; - inference-defaults = { - url = "https://raw.githubusercontent.com/unslothai/unsloth/main/studio/backend/assets/configs/inference_defaults.json"; - flake = false; - }; }; - outputs = { self, nixpkgs, inference-defaults }: + outputs = { self, nixpkgs }: let system = "x86_64-linux"; pkgs = nixpkgs.legacyPackages.${system}; - in { - packages.${system}.default = pkgs.buildGoModule { + reactUi 
= pkgs.buildNpmPackage { + pname = "localai-react-ui"; + version = "custom"; + src = ./core/http/react-ui; + npmDeps = pkgs.importNpmLock { + npmRoot = ./core/http/react-ui; + }; + npmConfigHook = pkgs.importNpmLock.npmConfigHook; + npmBuildScript = "build"; + + installPhase = '' + runHook preInstall + mkdir -p $out + cp -r dist $out/ + runHook postInstall + ''; + }; + localai-unwrapped = pkgs.buildGoModule { pname = "localai"; version = "custom"; src = ./.; proxyVendor = true; - vendorHash = "sha256-6f3adjGsoFXlUtXjBDHP4Mv9jKCOK3aeUXprm0EAVO8="; + vendorHash = "sha256-z3lxQS8mXFuJzvYamejwapwVEmLpeAoiO3ksUKb4I3Q="; nativeBuildInputs = with pkgs; [ pkg-config cmake gcc protobuf go-protobuf protoc-gen-go protoc-gen-go-grpc @@ -44,8 +56,9 @@ go mod edit -replace github.com/mudler/LocalAI/pkg/grpc/proto=./pkg/grpc/proto - mkdir -p core/config/gen_inference_defaults - cp ${inference-defaults} core/config/gen_inference_defaults/inference_defaults.json + mkdir -p core/http/react-ui + cp -r ${reactUi}/dist core/http/react-ui/dist + sed -i '/go:generate/d' core/config/inference_defaults.go || true ''; @@ -57,6 +70,21 @@ [ -f $out/bin/local-ai ] && mv $out/bin/local-ai $out/bin/localai ''; }; + in { + packages.${system} = { + localai-unwrapped = localai-unwrapped; + + default = pkgs.buildFHSEnv { + name = "localai"; + targetPkgs = pkgs: with pkgs; [ + localai-unwrapped + bash + coreutils + gnugrep + ]; + runScript = "${localai-unwrapped}/bin/localai"; + }; + }; devShells.${system}.default = pkgs.mkShell { packages = with pkgs; [ From 606128e4e9aeabacb4a029799d4ee6e0d970250a Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 19 Jun 2026 16:16:33 +0100 Subject: [PATCH 03/99] feat(vulkan): make Vulkan backends self-contained on the GPU (#10404) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vulkan backends bundled their own loader and ICD manifests but neither the Mesa driver the manifests point at nor a way to make the 
loader find them, so on a runtime base image without Mesa the loader enumerated zero devices and the GPU silently fell back to CPU (only NVIDIA worked, since its ICD is injected by the container toolkit). - scripts/build/package-gpu-libs.sh: for each installed ICD manifest, bundle the driver .so its library_path names — no hard-coded, platform-dependent soname list — plus that driver's ldd dependencies, skipping manifests whose driver isn't installed. Rewrite each library_path to a bare soname so the bundled driver resolves via the LD_LIBRARY_PATH run.sh already sets. - .docker/install-base-deps.sh, backend/Dockerfile.golang, backend/Dockerfile.python: install mesa-vulkan-drivers in every Vulkan builder so the driver + manifests exist to be packaged (the LunarG SDK ships only the loader and shader tooling). - pkg/model/process.go: when a backend ships vulkan/icd.d/, point the loader at it via VK_DRIVER_FILES/VK_ICD_FILENAMES at launch (no-op otherwise). Covered by pkg/model/process_vulkan_test.go. - backend/go/parakeet-cpp/package.sh: complete the L0 stub (was missing the libc-family ldd walk + GPU-lib packaging) by mirroring whisper, so the vulkan-parakeet image actually bundles its GPU runtime. 
Assisted-by: Claude Code:claude-opus-4-8 Signed-off-by: Richard Palethorpe --- .docker/install-base-deps.sh | 6 ++ backend/Dockerfile.golang | 7 ++- backend/Dockerfile.python | 7 ++- backend/go/parakeet-cpp/package.sh | 55 +++++++++++++++-- pkg/model/process.go | 46 ++++++++++++++- pkg/model/process_vulkan_test.go | 58 ++++++++++++++++++ scripts/build/package-gpu-libs.sh | 95 ++++++++++++++++++++++++++++-- 7 files changed, 262 insertions(+), 12 deletions(-) create mode 100644 pkg/model/process_vulkan_test.go diff --git a/.docker/install-base-deps.sh b/.docker/install-base-deps.sh index 5b0908fa8..2b0e7e0c6 100755 --- a/.docker/install-base-deps.sh +++ b/.docker/install-base-deps.sh @@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \ ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \ clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils + # Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD + # manifests. The LunarG SDK below only provides the loader and shader + # tooling, not hardware drivers — without Mesa the packaged Vulkan backend + # would ship a loader that finds no GPU. package-gpu-libs.sh bundles these + # .so files plus their deps into the backend so it stays self-contained. 
+ apt-get install -y mesa-vulkan-drivers libdrm2 if [ "amd64" = "${TARGETARCH:-}" ]; then wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz diff --git a/backend/Dockerfile.golang b/backend/Dockerfile.golang index 75fc3a0d9..d188cdf70 100644 --- a/backend/Dockerfile.golang +++ b/backend/Dockerfile.golang @@ -65,7 +65,12 @@ RUN </dev/null || { echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2 exit 1 } -echo "L0 package layout (full ldd walk lands in L3):" +# Detect architecture and copy the core runtime libs libparakeet.so links +# against, plus the matching dynamic loader as lib/ld.so. +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + echo "Detected x86_64 architecture, copying x86_64 libraries..." + cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so" + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6" + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2" + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1" + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0" +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + echo "Detected ARM64 architecture, copying ARM64 libraries..." 
+ cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so" + cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6" + cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1" + cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6" + cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6" + cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1" + cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2" + cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1" + cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0" +elif [ "$(uname -s)" = "Darwin" ]; then + echo "Detected Darwin" +else + echo "Error: Could not detect architecture" + exit 1 +fi + +# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) +# based on BUILD_TYPE so the backend can reach the GPU without the runtime +# base image shipping those drivers. +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..." + source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "Packaging completed successfully" ls -liah "$CURDIR/package/" "$CURDIR/package/lib/" diff --git a/pkg/model/process.go b/pkg/model/process.go index 60f8d318e..95e3e0758 100644 --- a/pkg/model/process.go +++ b/pkg/model/process.go @@ -154,11 +154,20 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string return nil, err } + env := os.Environ() + // Vulkan backends are self-contained: they bundle their own loader and + // Mesa driver .so files in lib/ plus the matching ICD manifests in + // vulkan/icd.d/. 
Point the loader at those manifests so it doesn't rely on + // the runtime base image shipping a Vulkan driver (it carries the + // SYCL/Level-Zero stack instead, so the default ICD search path is empty + // and the GPU would silently fall back to CPU). No-op for other backends. + env = append(env, vulkanICDEnv(workDir)...) + grpcControlProcess := process.New( process.WithTemporaryStateDir(), process.WithName(filepath.Base(grpcProcess)), process.WithArgs(append(args, []string{"--addr", serverAddress}...)...), - process.WithEnvironment(os.Environ()...), + process.WithEnvironment(env...), process.WithWorkDir(workDir), ) @@ -249,3 +258,38 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string return grpcControlProcess, nil } + +// vulkanICDEnv returns environment overrides that point the Vulkan loader at +// the ICD manifests a backend bundles in <workDir>/vulkan/icd.d. Vulkan +// backends ship a self-contained stack — their own loader and Mesa driver .so +// files in lib/ (resolved via the LD_LIBRARY_PATH that run.sh sets) plus the +// matching ICD manifests — so the loader must be told where those manifests +// live; its default search path (/usr/share/vulkan/icd.d, /etc/vulkan/icd.d) +// is empty on the runtime base image. Returns nil when the directory holds no +// manifests (CPU/CUDA/SYCL builds), leaving the host's Vulkan setup untouched. 
+func vulkanICDEnv(workDir string) []string { + icdDir := filepath.Join(workDir, "vulkan", "icd.d") + entries, err := os.ReadDir(icdDir) + if err != nil { + return nil + } + + manifests := make([]string, 0, len(entries)) + for _, e := range entries { + if e.IsDir() || !strings.HasSuffix(e.Name(), ".json") { + continue + } + manifests = append(manifests, filepath.Join(icdDir, e.Name())) + } + if len(manifests) == 0 { + return nil + } + + list := strings.Join(manifests, string(os.PathListSeparator)) + // VK_DRIVER_FILES is the current loader variable; VK_ICD_FILENAMES is its + // deprecated alias, set too so older bundled loaders still pick it up. + return []string{ + "VK_DRIVER_FILES=" + list, + "VK_ICD_FILENAMES=" + list, + } +} diff --git a/pkg/model/process_vulkan_test.go b/pkg/model/process_vulkan_test.go new file mode 100644 index 000000000..c3bb108a7 --- /dev/null +++ b/pkg/model/process_vulkan_test.go @@ -0,0 +1,58 @@ +package model + +import ( + "os" + "path/filepath" + "strings" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("vulkanICDEnv", func() { + It("returns nil when the backend ships no vulkan/icd.d (CPU/CUDA/SYCL builds)", func() { + Expect(vulkanICDEnv(GinkgoT().TempDir())).To(BeNil()) + }) + + It("returns nil when icd.d exists but holds no .json manifests", func() { + work := GinkgoT().TempDir() + icdDir := filepath.Join(work, "vulkan", "icd.d") + Expect(os.MkdirAll(icdDir, 0o755)).To(Succeed()) + Expect(os.WriteFile(filepath.Join(icdDir, "README.txt"), []byte("not a manifest"), 0o644)).To(Succeed()) + // A directory whose name ends in .json must be ignored. 
+ Expect(os.MkdirAll(filepath.Join(icdDir, "nested.json"), 0o755)).To(Succeed()) + + Expect(vulkanICDEnv(work)).To(BeNil()) + }) + + It("points VK_DRIVER_FILES/VK_ICD_FILENAMES at the bundled manifests", func() { + work := GinkgoT().TempDir() + icdDir := filepath.Join(work, "vulkan", "icd.d") + Expect(os.MkdirAll(icdDir, 0o755)).To(Succeed()) + for _, name := range []string{"intel_icd.json", "lvp_icd.json"} { + Expect(os.WriteFile(filepath.Join(icdDir, name), []byte("{}"), 0o644)).To(Succeed()) + } + + env := vulkanICDEnv(work) + Expect(env).To(HaveLen(2)) + + got := map[string]string{} + for _, kv := range env { + k, v, ok := strings.Cut(kv, "=") + Expect(ok).To(BeTrue(), "malformed env entry %q", kv) + got[k] = v + } + + for _, key := range []string{"VK_DRIVER_FILES", "VK_ICD_FILENAMES"} { + Expect(got).To(HaveKey(key)) + // Both manifests must be listed as absolute paths, joined by the + // OS path-list separator the Vulkan loader expects. + parts := strings.Split(got[key], string(os.PathListSeparator)) + Expect(parts).To(HaveLen(2)) + for _, p := range parts { + Expect(filepath.IsAbs(p)).To(BeTrue(), "%s entry %q must be absolute", key, p) + Expect(p).To(HaveSuffix(".json")) + } + } + }) +}) diff --git a/scripts/build/package-gpu-libs.sh b/scripts/build/package-gpu-libs.sh index 2b5b02aab..40f410173 100755 --- a/scripts/build/package-gpu-libs.sh +++ b/scripts/build/package-gpu-libs.sh @@ -109,6 +109,38 @@ copy_libs_glob() { done } +# Returns success for the core runtime libs the base image and package.sh +# already provide. We must NOT bundle our own copies of these — a second libc +# or libstdc++ on LD_LIBRARY_PATH clashes with the loader and the rest of the +# process — so they're skipped when pulling in a driver's transitive deps. 
+is_core_lib() { + case "$1" in + ld-linux*|ld.so|libc.so.*|libm.so.*|libdl.so.*|libpthread.so.*|librt.so.*|\ + libgcc_s.so.*|libstdc++.so.*|libresolv.so.*|libutil.so.*|linux-vdso.so.*) + return 0 ;; + esac + return 1 +} + +# Copy the shared-library dependencies of an ELF file into TARGET_LIB_DIR. +# Used to make a bundled GPU driver self-contained: e.g. the Mesa Vulkan ICDs +# pull in libdrm, libexpat and (for RADV/lavapipe) libLLVM, none of which the +# runtime base image is guaranteed to have. Core libc-family deps are skipped. +copy_elf_deps() { + local elf="$1" + [ -e "$elf" ] || return 0 + command -v ldd >/dev/null 2>&1 || return 0 + + # ldd lines look like: "libfoo.so.1 => /path/to/libfoo.so.1 (0x..)". + # Take the resolved absolute path (field 3) and skip vdso/static entries. + while read -r dep; do + if is_core_lib "$(basename "$dep")"; then + continue + fi + copy_lib "$dep" + done < <(ldd "$elf" 2>/dev/null | awk '/=>/ && $3 ~ /^\// {print $3}') +} + # Package NVIDIA CUDA libraries package_cuda_libs() { echo "Packaging CUDA libraries for BUILD_TYPE=${BUILD_TYPE}..." @@ -284,7 +316,7 @@ package_vulkan_libs() { "/usr/local/lib" ) - # Core Vulkan runtime libraries + # Core Vulkan runtime: the loader plus the shader tooling shipped by the SDK. local vulkan_libs=( "libvulkan.so*" "libshaderc_shared.so*" @@ -301,10 +333,63 @@ package_vulkan_libs() { fi done - # Copy Vulkan ICD files + # Bundle the ICD drivers. Rather than hard-code Mesa's (platform- and + # version-dependent) driver sonames, treat each installed ICD manifest as + # the source of truth: every /usr/share/vulkan/icd.d/*.json names the exact + # driver .so it needs in its "library_path". So we copy whatever drivers + # the manifests reference (libvulkan_intel/radeon/lvp/... on amd64, the SoC + # drivers on arm64, ...) plus each driver's transitive deps, and skip any + # manifest whose driver isn't actually installed. The loader picks the + # right driver for the GPU at runtime. 
if [ -d "/usr/share/vulkan/icd.d" ]; then - mkdir -p "$TARGET_LIB_DIR/../vulkan/icd.d" - cp -arfL /usr/share/vulkan/icd.d/* "$TARGET_LIB_DIR/../vulkan/icd.d/" 2>/dev/null || true + local icd_dest="$TARGET_LIB_DIR/../vulkan/icd.d" + mkdir -p "$icd_dest" + + local manifest driver driver_base resolved lib_path + for manifest in /usr/share/vulkan/icd.d/*.json; do + [ -e "$manifest" ] || continue + + # Pull the driver path out of "library_path": "<path>". + driver=$(sed -nE 's/.*"library_path"[[:space:]]*:[[:space:]]*"([^"]+)".*/\1/p' "$manifest" | head -n1) + [ -n "$driver" ] || continue + driver_base=$(basename "$driver") + + # Resolve to an absolute path: honour an absolute library_path, + # else look in the standard lib dirs, else fall back to ldconfig. + resolved="" + case "$driver" in + /*) [ -e "$driver" ] && resolved="$driver" ;; + esac + if [ -z "$resolved" ]; then + for lib_path in "${vulkan_lib_paths[@]}"; do + if [ -e "${lib_path}/${driver_base}" ]; then + resolved="${lib_path}/${driver_base}" + break + fi + done + fi + if [ -z "$resolved" ] && command -v ldconfig >/dev/null 2>&1; then + resolved=$(ldconfig -p | awk -v n="$driver_base" '$1 == n { print $NF; exit }') + fi + + if [ -z "$resolved" ] || [ ! -e "$resolved" ]; then + echo "Vulkan ICD: driver '$driver_base' for $(basename "$manifest") not installed; skipping its manifest" >&2 + continue + fi + + # Bundle the driver + its transitive deps (libdrm, libexpat, and + # libLLVM for RADV/lavapipe, ...) so the backend is self-contained + # on a runtime base image without Mesa. + copy_lib "$resolved" + copy_elf_deps "$resolved" + + # Copy the manifest and rewrite its library_path to a bare soname + # so the loader resolves our bundled driver via LD_LIBRARY_PATH + # (run.sh adds lib/ to it) instead of a host path that won't exist + # on the runtime image. 
+ cp -arfL "$manifest" "$icd_dest/" 2>/dev/null || true + sed -i -E 's#("library_path"[[:space:]]*:[[:space:]]*")[^"]*/#\1#' "$icd_dest/$(basename "$manifest")" + done fi echo "Vulkan libraries packaged successfully" @@ -345,6 +430,8 @@ package_gpu_libs() { export -f package_gpu_libs export -f copy_lib export -f copy_libs_glob +export -f is_core_lib +export -f copy_elf_deps export -f package_cuda_libs export -f package_rocm_libs export -f package_intel_libs From 72d46c1115546a65c3240b832eaace55aae3ce73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=95=AA=E8=8C=84=E6=91=94=E6=88=90=E7=95=AA=E8=8C=84?= =?UTF-8?q?=E9=85=B1?= <68098251+fqscfqj@users.noreply.github.com> Date: Sat, 20 Jun 2026 03:34:30 +0800 Subject: [PATCH 04/99] feat(crispasr): add word-level timestamp support (#10403) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(crispasr): add word-level timestamp support Add word-level timestamp extraction to the crispasr backend by calling the CrispASR C library's word accessor functions that are already exported by libgocrispasr but were not previously bound by the Go wrapper. Two families of word functions are supported: 1. Session-based (get_word_count/text/t0/t1) — works per-segment for whisper-like backends. 2. Parakeet-specific (get_parakeet_word_count/text/t0/t1) — returns a global word list for TDT/CTC/RNNT parakeet models where the session API does not expose per-segment word data. The Go code tries session-based first and falls back to parakeet-specific when the session word count is zero. Depends on #10402 (grpc server Words forwarding) for the words to reach the HTTP response. Signed-off-by: fqscfqj * fix(crispasr): use portable sed -i.bak for macOS compatibility BSD sed requires -i '' for in-place editing while GNU sed uses -i. Replace with -i.bak which works on both platforms, then remove the backup file. 
Signed-off-by: fqscfqj --------- Signed-off-by: fqscfqj --- backend/go/crispasr/Makefile | 2 +- backend/go/crispasr/cpp/crispasr_shim.cpp | 68 +++++++++++++++++++++++ backend/go/crispasr/cpp/crispasr_shim.h | 14 +++++ backend/go/crispasr/gocrispasr.go | 38 +++++++++++++ backend/go/crispasr/main.go | 8 +++ 5 files changed, 129 insertions(+), 1 deletion(-) diff --git a/backend/go/crispasr/Makefile b/backend/go/crispasr/Makefile index 42a7a7555..bbc84f1de 100644 --- a/backend/go/crispasr/Makefile +++ b/backend/go/crispasr/Makefile @@ -67,7 +67,7 @@ sources/CrispASR: # it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources # aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root), # which is correct both standalone and as a subproject. Idempotent. - sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt + sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak # Detect OS UNAME_S := $(shell uname -s) diff --git a/backend/go/crispasr/cpp/crispasr_shim.cpp b/backend/go/crispasr/cpp/crispasr_shim.cpp index bf6151ae1..60dbfd86b 100644 --- a/backend/go/crispasr/cpp/crispasr_shim.cpp +++ b/backend/go/crispasr/cpp/crispasr_shim.cpp @@ -47,6 +47,74 @@ extern "C" void set_abort(int v) { g_abort.store(v, std::memory_order_relaxed); } +// --- word-level timestamp accessors --- +extern "C" { +int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i); +const char *crispasr_session_result_word_text(crispasr_session_result *r, + int seg_i, int word_i); +int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i, + int word_i); +int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i, + int word_i); + +// Parakeet-specific word accessors +int crispasr_parakeet_result_n_words(void *r); 
+const char *crispasr_parakeet_result_word_text(void *r, int word_i); +int64_t crispasr_parakeet_result_word_t0(void *r, int word_i); +int64_t crispasr_parakeet_result_word_t1(void *r, int word_i); +} + +void *get_result(void) { return g_result; } + +int get_word_count(int seg_i) { + if (!g_result) + return 0; + return crispasr_session_result_n_words(g_result, seg_i); +} + +const char *get_word_text(int seg_i, int word_i) { + if (!g_result) + return ""; + return crispasr_session_result_word_text(g_result, seg_i, word_i); +} + +int64_t get_word_t0(int seg_i, int word_i) { + if (!g_result) + return 0; + return crispasr_session_result_word_t0(g_result, seg_i, word_i); +} + +int64_t get_word_t1(int seg_i, int word_i) { + if (!g_result) + return 0; + return crispasr_session_result_word_t1(g_result, seg_i, word_i); +} + +// Parakeet-specific word accessors +int get_parakeet_word_count(void) { + if (!g_result) + return 0; + return crispasr_parakeet_result_n_words(g_result); +} + +const char *get_parakeet_word_text(int word_i) { + if (!g_result) + return ""; + return crispasr_parakeet_result_word_text(g_result, word_i); +} + +int64_t get_parakeet_word_t0(int word_i) { + if (!g_result) + return 0; + return crispasr_parakeet_result_word_t0(g_result, word_i); +} + +int64_t get_parakeet_word_t1(int word_i) { + if (!g_result) + return 0; + return crispasr_parakeet_result_word_t1(g_result, word_i); +} + static void ggml_log_cb(enum ggml_log_level level, const char *log, void *data) { const char *level_str; diff --git a/backend/go/crispasr/cpp/crispasr_shim.h b/backend/go/crispasr/cpp/crispasr_shim.h index 7c593951a..c7baa41f4 100644 --- a/backend/go/crispasr/cpp/crispasr_shim.h +++ b/backend/go/crispasr/cpp/crispasr_shim.h @@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float void tts_free(float *pcm); int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok int tts_set_voice_file(const char *path, const char 
*ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text) + +// --- word-level timestamp accessors --- +// Session-based (works for whisper-like backends) +void *get_result(void); +int get_word_count(int seg_i); +const char *get_word_text(int seg_i, int word_i); +int64_t get_word_t0(int seg_i, int word_i); +int64_t get_word_t1(int seg_i, int word_i); + +// Parakeet-specific (global word list, no segment index) +int get_parakeet_word_count(void); +const char *get_parakeet_word_text(int word_i); +int64_t get_parakeet_word_t0(int word_i); +int64_t get_parakeet_word_t1(int word_i); } diff --git a/backend/go/crispasr/gocrispasr.go b/backend/go/crispasr/gocrispasr.go index 5c3528d38..af1f1a95c 100644 --- a/backend/go/crispasr/gocrispasr.go +++ b/backend/go/crispasr/gocrispasr.go @@ -34,6 +34,18 @@ var ( CppTTSFree func(ptr uintptr) CppTTSSetVoice func(name string) int CppTTSSetVoiceFile func(path string, refText string) int + + // Word-level timestamp accessors (session-based, per-segment) + CppGetWordCount func(segI int) int + CppGetWordText func(segI int, wordI int) string + CppGetWordT0 func(segI int, wordI int) int64 + CppGetWordT1 func(segI int, wordI int) int64 + + // Parakeet-specific word accessors (global, no segment index) + CppGetParakeetWordCount func() int + CppGetParakeetWordText func(wordI int) string + CppGetParakeetWordT0 func(wordI int) int64 + CppGetParakeetWordT1 func(wordI int) int64 ) type CrispASR struct { @@ -290,10 +302,36 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe // IDs, so Tokens is left empty. txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "�") + // Populate word-level timestamps. Try session-based functions first + // (per-segment); fall back to parakeet-specific functions (global word + // list with no segment index — only populated on the first segment to + // avoid duplication). 
+ words := []*pb.TranscriptWord{} + wordCount := CppGetWordCount(i) + if wordCount == 0 && i == 0 { + wordCount = CppGetParakeetWordCount() + for j := 0; j < wordCount; j++ { + words = append(words, &pb.TranscriptWord{ + Start: CppGetParakeetWordT0(j) * (10000000), + End: CppGetParakeetWordT1(j) * (10000000), + Text: strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "�"), + }) + } + } else { + for j := 0; j < wordCount; j++ { + words = append(words, &pb.TranscriptWord{ + Start: CppGetWordT0(i, j) * (10000000), + End: CppGetWordT1(i, j) * (10000000), + Text: strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "�"), + }) + } + } + segment := &pb.TranscriptSegment{ Id: int32(i), Text: txt, Start: s, End: t, + Words: words, } segments = append(segments, segment) diff --git a/backend/go/crispasr/main.go b/backend/go/crispasr/main.go index c2069bd85..9f3ef14d0 100644 --- a/backend/go/crispasr/main.go +++ b/backend/go/crispasr/main.go @@ -44,6 +44,14 @@ func main() { {&CppTTSFree, "tts_free"}, {&CppTTSSetVoice, "tts_set_voice"}, {&CppTTSSetVoiceFile, "tts_set_voice_file"}, + {&CppGetWordCount, "get_word_count"}, + {&CppGetWordText, "get_word_text"}, + {&CppGetWordT0, "get_word_t0"}, + {&CppGetWordT1, "get_word_t1"}, + {&CppGetParakeetWordCount, "get_parakeet_word_count"}, + {&CppGetParakeetWordText, "get_parakeet_word_text"}, + {&CppGetParakeetWordT0, "get_parakeet_word_t0"}, + {&CppGetParakeetWordT1, "get_parakeet_word_t1"}, } for _, lf := range libFuncs { From 2e734bf56039a2f412ba9216b00ba683096c5726 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Fri, 19 Jun 2026 21:35:21 +0200 Subject: [PATCH 05/99] fix(downloader): stall timeout, resume-safe cancel, and stale-partial reaping (#10406) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(downloader): stall timeout, resume-safe cancel, and stale-partial reaping Large model installs would hang forever or 
never finish. Three defects in the HTTP download path, all hit by big GGUF pulls over a slow or flaky link: 1. No stall timeout. The shared download client sets no body deadline (correct for streaming) but also no read-idle timeout, and the transport's IdleConnTimeout does not cover an in-flight body read. A silently-dropped TCP connection (no FIN/RST) blocked the body Read forever, freezing an install at N bytes until an external reaper killed it. Add an idle-timeout reader that closes the body after a window of zero progress (DownloadStallTimeout, default 60s), turning an indefinite hang into a fast, retryable error. A read that returns data resets the clock, so a slow-but-steady transfer is unaffected. 2. Cancellation deleted the partial. On context.Canceled the code removed the .partial file, so any frontend restart (deploy, OOM) mid-download wiped all progress and the retry restarted from zero. At slow egress, files larger than the restart interval never completed. Keep the .partial on cancel so the next attempt resumes via Range. 3. Partials leaked. Cleanup only ran on the context-cancel path, never on a stall or a SIGKILL/OOM, so abandoned .partial files accumulated and could fill the models volume. Add CleanupStalePartialFiles and reap partials older than 24h on startup. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] * fix(downloader): discard the .partial on a deliberate user cancel Review follow-up. The previous commit kept the .partial on every cancellation so restarts could resume, but that also left a dangling partial when a user *intentionally* cancelled an install — the file lingered until the 24h reaper. Distinguish the two: cancel the gallery operation's context with a cause (downloader.ErrUserCancelled) so the download layer can tell a deliberate abort (discard the partial) from an incidental one such as a shutdown/restart (keep it for resume). 
Detect cancellation via the context rather than the returned error, because an HTTP request cancelled with a cause surfaces the cause error, not context.Canceled. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] * fix(downloader): resolve gosec G122 in CleanupStalePartialFiles CI's code-scanning (gosec) flagged G122 (symlink TOCTOU) for the os.Remove call inside the filepath.WalkDir callback. Collect the stale paths during the walk and delete them afterwards instead of mutating the tree from inside the callback. Behavior is unchanged; the existing specs still pass. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --------- Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- core/application/startup.go | 11 +++ core/services/galleryop/service.go | 15 ++- pkg/downloader/cancel_test.go | 148 +++++++++++++++++++++++++++++ pkg/downloader/partial.go | 69 ++++++++++++++ pkg/downloader/partial_test.go | 53 +++++++++++ pkg/downloader/stall.go | 77 +++++++++++++++ pkg/downloader/stall_test.go | 131 +++++++++++++++++++++++++ pkg/downloader/uri.go | 57 ++++++++--- 8 files changed, 547 insertions(+), 14 deletions(-) create mode 100644 pkg/downloader/cancel_test.go create mode 100644 pkg/downloader/partial.go create mode 100644 pkg/downloader/partial_test.go create mode 100644 pkg/downloader/stall.go create mode 100644 pkg/downloader/stall_test.go diff --git a/core/application/startup.go b/core/application/startup.go index 6438c7df3..fa5de5ede 100644 --- a/core/application/startup.go +++ b/core/application/startup.go @@ -25,6 +25,7 @@ import ( "github.com/mudler/LocalAI/core/services/storage" coreStartup "github.com/mudler/LocalAI/core/startup" "github.com/mudler/LocalAI/internal" + "github.com/mudler/LocalAI/pkg/downloader" "github.com/mudler/LocalAI/pkg/signals" "github.com/mudler/LocalAI/pkg/vram" @@ -71,6 +72,16 @@ func New(opts ...config.AppOption) (*Application, error) { if err != 
nil { return nil, fmt.Errorf("unable to create ModelPath: %q", err) } + + // Reap *.partial downloads abandoned by a previous run (killed mid-transfer + // by an OOM/restart, or stalled before cleanup could run). The 24h window + // is well beyond any legitimate in-flight download, so this never trims an + // active transfer; it just stops dead partials accumulating on the volume. + if removed, cErr := downloader.CleanupStalePartialFiles(options.SystemState.Model.ModelsPath, 24*time.Hour); cErr != nil { + xlog.Warn("Failed to reap stale partial downloads", "error", cErr) + } else if removed > 0 { + xlog.Info("Reaped stale partial downloads", "count", removed) + } if options.GeneratedContentDir != "" { err := os.MkdirAll(options.GeneratedContentDir, 0o750) if err != nil { diff --git a/core/services/galleryop/service.go b/core/services/galleryop/service.go index df0352e99..5b611d41e 100644 --- a/core/services/galleryop/service.go +++ b/core/services/galleryop/service.go @@ -11,6 +11,7 @@ import ( "github.com/mudler/LocalAI/core/gallery" "github.com/mudler/LocalAI/core/services/distributed" "github.com/mudler/LocalAI/core/services/messaging" + "github.com/mudler/LocalAI/pkg/downloader" "github.com/mudler/LocalAI/pkg/model" "github.com/mudler/LocalAI/pkg/system" "github.com/mudler/xlog" @@ -402,6 +403,16 @@ func (g *GalleryService) applyCancel(id string) { } } +// newUserCancellableContext returns a child context whose CancelFunc cancels +// with the downloader.ErrUserCancelled cause. This lets the download layer +// distinguish a deliberate user cancel (discard the half-downloaded .partial) +// from an incidental cancellation such as process shutdown (keep the .partial +// so the next run resumes via Range instead of restarting from zero). 
+func newUserCancellableContext(parent context.Context) (context.Context, context.CancelFunc) { + ctx, cancelCause := context.WithCancelCause(parent) + return ctx, func() { cancelCause(downloader.ErrUserCancelled) } +} + // storeCancellation stores a cancellation function for an operation func (g *GalleryService) storeCancellation(id string, cancelFunc context.CancelFunc) { g.Lock() @@ -444,7 +455,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader, case op := <-g.BackendGalleryChannel: // Create context if not provided if op.Context == nil { - op.Context, op.CancelFunc = context.WithCancel(c) + op.Context, op.CancelFunc = newUserCancellableContext(c) g.storeCancellation(op.ID, op.CancelFunc) } else if op.CancelFunc != nil { g.storeCancellation(op.ID, op.CancelFunc) @@ -472,7 +483,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.ModelConfigLoader, case op := <-g.ModelGalleryChannel: // Create context if not provided if op.Context == nil { - op.Context, op.CancelFunc = context.WithCancel(c) + op.Context, op.CancelFunc = newUserCancellableContext(c) g.storeCancellation(op.ID, op.CancelFunc) } else if op.CancelFunc != nil { g.storeCancellation(op.ID, op.CancelFunc) diff --git a/pkg/downloader/cancel_test.go b/pkg/downloader/cancel_test.go new file mode 100644 index 000000000..76f8a2df5 --- /dev/null +++ b/pkg/downloader/cancel_test.go @@ -0,0 +1,148 @@ +package downloader_test + +import ( + "context" + "crypto/rand" + "crypto/sha256" + "errors" + "fmt" + "net/http" + "net/http/httptest" + "os" + "strconv" + "strings" + "time" + + . "github.com/mudler/LocalAI/pkg/downloader" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Download cancellation", func() { + var filePath string + + // streamingRangeServer serves data one small chunk at a time with a short + // pause between chunks, so a context cancellation can land mid-transfer. 
+ // It honors a `bytes=N-` Range request so a second attempt can resume. + streamingRangeServer := func(data []byte) *httptest.Server { + return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method == "HEAD" { + w.Header().Set("Accept-Ranges", "bytes") + w.WriteHeader(http.StatusOK) + return + } + start := 0 + if rh := r.Header.Get("Range"); rh != "" { + _, _ = fmt.Sscanf(strings.TrimPrefix(rh, "bytes="), "%d-", &start) + } + w.Header().Set("Content-Length", strconv.Itoa(len(data)-start)) + if start > 0 { + w.WriteHeader(http.StatusPartialContent) + } else { + w.WriteHeader(http.StatusOK) + } + f, _ := w.(http.Flusher) + for i := start; i < len(data); i += 256 { + end := i + 256 + if end > len(data) { + end = len(data) + } + if _, err := w.Write(data[i:end]); err != nil { + return + } + if f != nil { + f.Flush() + } + time.Sleep(20 * time.Millisecond) + } + })) + } + + BeforeEach(func() { + dir, err := os.Getwd() + Expect(err).ToNot(HaveOccurred()) + filePath = dir + "/cancel_model" + }) + + AfterEach(func() { + _ = os.Remove(filePath) + _ = os.Remove(filePath + ".partial") + }) + + It("keeps the .partial file when the context is cancelled so the download can resume", func() { + data := make([]byte, 8192) + _, err := rand.Read(data) + Expect(err).ToNot(HaveOccurred()) + server := streamingRangeServer(data) + defer server.Close() + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + time.Sleep(150 * time.Millisecond) + cancel() + }() + + err = URI(server.URL).DownloadFileWithContext(ctx, filePath, "", 1, 1, func(s1, s2, s3 string, f float64) {}) + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, context.Canceled)).To(BeTrue()) + + info, statErr := os.Stat(filePath + ".partial") + Expect(statErr).ToNot(HaveOccurred(), + "a cancelled download must leave its .partial behind so the retry resumes instead of restarting from zero") + Expect(info.Size()).To(BeNumerically(">", 0)) + 
Expect(info.Size()).To(BeNumerically("<", int64(len(data)))) + }) + + It("discards the .partial when the cancellation cause is ErrUserCancelled", func() { + data := make([]byte, 8192) + _, err := rand.Read(data) + Expect(err).ToNot(HaveOccurred()) + server := streamingRangeServer(data) + defer server.Close() + + // A deliberate user abort: cancel WITH the ErrUserCancelled cause. The + // half-finished download should not linger on disk. + ctx, cancel := context.WithCancelCause(context.Background()) + go func() { + time.Sleep(150 * time.Millisecond) + cancel(ErrUserCancelled) + }() + + err = URI(server.URL).DownloadFileWithContext(ctx, filePath, "", 1, 1, func(s1, s2, s3 string, f float64) {}) + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, context.Canceled)).To(BeTrue()) + + Expect(filePath + ".partial").ToNot(BeAnExistingFile(), + "a deliberate user cancel must not leave a dangling .partial behind") + }) + + It("resumes from the preserved .partial after a cancellation and completes", func() { + data := make([]byte, 8192) + _, err := rand.Read(data) + Expect(err).ToNot(HaveOccurred()) + sum := sha256.Sum256(data) + sha := fmt.Sprintf("%x", sum) + server := streamingRangeServer(data) + defer server.Close() + + // First attempt: cancel mid-stream. + ctx, cancel := context.WithCancel(context.Background()) + go func() { + time.Sleep(150 * time.Millisecond) + cancel() + }() + err = URI(server.URL).DownloadFileWithContext(ctx, filePath, sha, 1, 1, func(s1, s2, s3 string, f float64) {}) + Expect(err).To(HaveOccurred()) + partialInfo, statErr := os.Stat(filePath + ".partial") + Expect(statErr).ToNot(HaveOccurred()) + resumedFrom := partialInfo.Size() + Expect(resumedFrom).To(BeNumerically(">", 0)) + + // Second attempt: fresh context, must resume and finish with a valid SHA. 
+ err = URI(server.URL).DownloadFileWithContext(context.Background(), filePath, sha, 1, 1, func(s1, s2, s3 string, f float64) {}) + Expect(err).ToNot(HaveOccurred()) + final, rerr := os.ReadFile(filePath) + Expect(rerr).ToNot(HaveOccurred()) + Expect(final).To(Equal(data)) + }) +}) diff --git a/pkg/downloader/partial.go b/pkg/downloader/partial.go new file mode 100644 index 000000000..f816bb09f --- /dev/null +++ b/pkg/downloader/partial.go @@ -0,0 +1,69 @@ +package downloader + +import ( + "io/fs" + "os" + "path/filepath" + "strings" + "time" + + "github.com/mudler/xlog" +) + +// PartialFileSuffix marks an in-progress download. The success path renames the +// partial to its final name, so any leftover with this suffix is an unfinished +// transfer. +const PartialFileSuffix = ".partial" + +// CleanupStalePartialFiles removes *.partial files under root whose last +// modification is older than olderThan, returning the number removed. These are +// abandoned downloads left by a process killed mid-transfer (OOM, restart) or +// by a stall whose cleanup never ran; without reaping they accumulate and can +// fill the models volume. A still-in-progress download touches its .partial on +// every write, so a generous olderThan never trims an active transfer. +// +// A missing root is not an error (nothing to clean). Unreadable entries are +// skipped so one bad file does not abort the whole sweep. +func CleanupStalePartialFiles(root string, olderThan time.Duration) (int, error) { + if _, err := os.Stat(root); err != nil { + if os.IsNotExist(err) { + return 0, nil + } + return 0, err + } + + cutoff := time.Now().Add(-olderThan) + + // Collect candidates during the walk and delete them afterwards rather than + // mutating the tree from inside the WalkDir callback (avoids the symlink + // TOCTOU class flagged by gosec G122, and never removes an entry mid-walk). 
+ var stale []string + err := filepath.WalkDir(root, func(path string, d fs.DirEntry, walkErr error) error { + if walkErr != nil { + return nil // skip unreadable subtree, keep going + } + if d.IsDir() || !strings.HasSuffix(d.Name(), PartialFileSuffix) { + return nil + } + info, err := d.Info() + if err != nil || info.ModTime().After(cutoff) { + return nil + } + stale = append(stale, path) + return nil + }) + if err != nil { + return 0, err + } + + removed := 0 + for _, path := range stale { + if err := os.Remove(path); err != nil { + xlog.Warn("failed to remove stale partial download", "file", path, "error", err) + continue + } + removed++ + xlog.Info("removed stale partial download", "file", path) + } + return removed, nil +} diff --git a/pkg/downloader/partial_test.go b/pkg/downloader/partial_test.go new file mode 100644 index 000000000..ceec8417f --- /dev/null +++ b/pkg/downloader/partial_test.go @@ -0,0 +1,53 @@ +package downloader_test + +import ( + "os" + "path/filepath" + "time" + + . "github.com/mudler/LocalAI/pkg/downloader" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("CleanupStalePartialFiles", func() { + var root string + + BeforeEach(func() { + var err error + root, err = os.MkdirTemp("", "partials") + Expect(err).ToNot(HaveOccurred()) + }) + + AfterEach(func() { + _ = os.RemoveAll(root) + }) + + It("removes stale .partial files (recursively) while keeping fresh ones and completed files", func() { + nested := filepath.Join(root, "llama-cpp", "models", "foo") + Expect(os.MkdirAll(nested, 0755)).To(Succeed()) + + stale := filepath.Join(nested, "model.gguf.partial") + fresh := filepath.Join(root, "fresh.gguf.partial") + completed := filepath.Join(root, "done.gguf") + for _, f := range []string{stale, fresh, completed} { + Expect(os.WriteFile(f, []byte("data"), 0644)).To(Succeed()) + } + old := time.Now().Add(-2 * time.Hour) + Expect(os.Chtimes(stale, old, old)).To(Succeed()) + + removed, err := CleanupStalePartialFiles(root, time.Hour) + Expect(err).ToNot(HaveOccurred()) + Expect(removed).To(Equal(1)) + + Expect(stale).ToNot(BeAnExistingFile()) + Expect(fresh).To(BeAnExistingFile()) + Expect(completed).To(BeAnExistingFile()) + }) + + It("returns no error when the root directory does not exist", func() { + removed, err := CleanupStalePartialFiles(filepath.Join(root, "does-not-exist"), time.Hour) + Expect(err).ToNot(HaveOccurred()) + Expect(removed).To(Equal(0)) + }) +}) diff --git a/pkg/downloader/stall.go b/pkg/downloader/stall.go new file mode 100644 index 000000000..697ad25d9 --- /dev/null +++ b/pkg/downloader/stall.go @@ -0,0 +1,77 @@ +package downloader + +import ( + "fmt" + "io" + "sync" + "time" +) + +// DownloadStallTimeout bounds how long an in-flight download may receive no +// data before it is aborted. A silently-dropped TCP connection (no FIN/RST) +// would otherwise block the body read forever, freezing an install at N bytes +// until an external reaper kills it. Overridable (tests set it small); a value +// <= 0 disables the guard. 
+var DownloadStallTimeout = 60 * time.Second + +// idleTimeoutReader wraps a streaming ReadCloser and aborts reads that make no +// progress within timeout. A standard io.Copy blocks indefinitely on a Read +// against a dead-but-unclosed socket; nothing in the copy loop can interrupt a +// blocked syscall. The watchdog timer closes the underlying reader on expiry, +// which unblocks the in-flight Read with an error. Each read that returns data +// resets the idle clock, so a slow-but-steady transfer never trips the guard. +type idleTimeoutReader struct { + rc io.ReadCloser + timeout time.Duration + + mu sync.Mutex + timer *time.Timer + fired bool + done bool +} + +func newIdleTimeoutReader(rc io.ReadCloser, timeout time.Duration) *idleTimeoutReader { + r := &idleTimeoutReader{rc: rc, timeout: timeout} + r.timer = time.AfterFunc(timeout, r.onStall) + return r +} + +// onStall fires when no data has arrived within the timeout. Closing the +// underlying reader is what unblocks a Read parked in the kernel. +func (r *idleTimeoutReader) onStall() { + r.mu.Lock() + if r.done { + r.mu.Unlock() + return + } + r.fired = true + r.mu.Unlock() + _ = r.rc.Close() +} + +func (r *idleTimeoutReader) Read(p []byte) (int, error) { + n, err := r.rc.Read(p) + if n > 0 { + r.timer.Reset(r.timeout) + } + if err != nil { + r.mu.Lock() + fired := r.fired + r.mu.Unlock() + if fired { + // Translate the "use of closed connection" the watchdog induced + // into an actionable stall error. This is not context.Canceled, + // so the caller keeps the .partial file for a later resume. 
+ return n, fmt.Errorf("download stalled: no data received for %s", r.timeout) + } + } + return n, err +} + +func (r *idleTimeoutReader) Close() error { + r.mu.Lock() + r.done = true + r.mu.Unlock() + r.timer.Stop() + return r.rc.Close() +} diff --git a/pkg/downloader/stall_test.go b/pkg/downloader/stall_test.go new file mode 100644 index 000000000..8e6a003c6 --- /dev/null +++ b/pkg/downloader/stall_test.go @@ -0,0 +1,131 @@ +package downloader_test + +import ( + "context" + "net/http" + "net/http/httptest" + "os" + "time" + + . "github.com/mudler/LocalAI/pkg/downloader" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Download stall timeout", func() { + var filePath string + var savedTimeout time.Duration + + BeforeEach(func() { + dir, err := os.Getwd() + Expect(err).ToNot(HaveOccurred()) + filePath = dir + "/stall_model" + savedTimeout = DownloadStallTimeout + }) + + AfterEach(func() { + DownloadStallTimeout = savedTimeout + _ = os.Remove(filePath) + _ = os.Remove(filePath + ".partial") + }) + + It("aborts a download that stalls mid-stream instead of hanging forever", func() { + // Server sends a chunk, flushes, then blocks forever without closing + // the connection — a silently-dropped TCP stream. Without a stall + // guard the body Read blocks indefinitely and DownloadFile never + // returns. 
+ release := make(chan struct{}) + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method == "HEAD" { + w.Header().Set("Accept-Ranges", "bytes") + w.WriteHeader(http.StatusOK) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write(make([]byte, 4096)) + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + <-release // hang: no more data, never close + })) + defer server.Close() + defer close(release) + + DownloadStallTimeout = 300 * time.Millisecond + + done := make(chan error, 1) + go func() { + done <- URI(server.URL).DownloadFileWithContext( + context.Background(), filePath, "", 1, 1, + func(s1, s2, s3 string, f float64) {}) + }() + + var err error + Eventually(done, "5s").Should(Receive(&err)) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("stall")) + }) + + It("preserves the .partial file when a download stalls so it can resume", func() { + release := make(chan struct{}) + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method == "HEAD" { + w.Header().Set("Accept-Ranges", "bytes") + w.WriteHeader(http.StatusOK) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write(make([]byte, 4096)) + if f, ok := w.(http.Flusher); ok { + f.Flush() + } + <-release + })) + defer server.Close() + defer close(release) + + DownloadStallTimeout = 300 * time.Millisecond + + done := make(chan error, 1) + go func() { + done <- URI(server.URL).DownloadFileWithContext( + context.Background(), filePath, "", 1, 1, + func(s1, s2, s3 string, f float64) {}) + }() + Eventually(done, "5s").Should(Receive(HaveOccurred())) + + info, statErr := os.Stat(filePath + ".partial") + Expect(statErr).ToNot(HaveOccurred(), "the .partial must survive a stall so the next attempt can resume") + Expect(info.Size()).To(BeNumerically(">", 0)) + }) + + It("does not abort a slow-but-steady download", func() { + // One byte every 100ms keeps the idle clock from ever 
expiring even + // though the total transfer outlasts the stall timeout. + payload := make([]byte, 12) + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method == "HEAD" { + w.Header().Set("Accept-Ranges", "bytes") + w.WriteHeader(http.StatusOK) + return + } + w.WriteHeader(http.StatusOK) + f, _ := w.(http.Flusher) + for i := range payload { + _, _ = w.Write(payload[i : i+1]) + if f != nil { + f.Flush() + } + time.Sleep(100 * time.Millisecond) + } + })) + defer server.Close() + + DownloadStallTimeout = 300 * time.Millisecond + + err := URI(server.URL).DownloadFileWithContext( + context.Background(), filePath, "", 1, 1, + func(s1, s2, s3 string, f float64) {}) + Expect(err).ToNot(HaveOccurred()) + }) +}) diff --git a/pkg/downloader/uri.go b/pkg/downloader/uri.go index 4be1b9081..41bdbe672 100644 --- a/pkg/downloader/uri.go +++ b/pkg/downloader/uri.go @@ -330,6 +330,18 @@ func (s URI) ResolveURL() string { return string(s) } +// ErrUserCancelled distinguishes a deliberate user abort from an incidental +// context cancellation (process shutdown, pod restart). Pass it as the cause +// when cancelling the download context: +// +// ctx, cancel := context.WithCancelCause(parent) +// cancel(downloader.ErrUserCancelled) // discards the .partial +// +// On a deliberate cancel the downloader removes the .partial (the user does not +// want a half-download lingering). On a plain cancellation it keeps the .partial +// so the next run resumes via Range instead of restarting from zero. 
+var ErrUserCancelled = errors.New("download cancelled by user") + func removePartialFile(tmpFilePath string) error { xlog.Debug("Removing temporary file", "file", tmpFilePath) if err := os.Remove(tmpFilePath); err != nil && !errors.Is(err, os.ErrNotExist) { @@ -594,11 +606,17 @@ func (uri URI) DownloadFileWithContext(ctx context.Context, filePath, sha string // Start the request resp, err := downloadClient.Do(req) if err != nil { - // Check if error is due to context cancellation - if errors.Is(err, context.Canceled) { - // Clean up partial file on cancellation - removePartialFile(tmpFilePath) - return err + // Detect cancellation via the context, not the returned error: a + // request cancelled *with a cause* surfaces the cause error (not + // context.Canceled) from the HTTP client. Keep the .partial for + // resume on an incidental cancel (shutdown, restart) — large GGUFs + // take long enough that deleting progress means they never finish — + // but discard it on a deliberate user abort (ErrUserCancelled). + if ctx.Err() != nil { + if errors.Is(context.Cause(ctx), ErrUserCancelled) { + _ = removePartialFile(tmpFilePath) + } + return ctx.Err() } return fmt.Errorf("failed to download file %q: %v", filePath, err) } @@ -608,6 +626,13 @@ func (uri URI) DownloadFileWithContext(ctx context.Context, filePath, sha string return fmt.Errorf("failed to download url %q, invalid status code %d", url, resp.StatusCode) } source = resp.Body + // Guard against a silently-stalled stream: a dropped TCP connection + // that never sends FIN/RST would otherwise block the body Read (and + // thus the whole install) forever. The watchdog aborts after a window + // of zero progress; the .partial is kept for a later resume. 
+ if DownloadStallTimeout > 0 { + source = newIdleTimeoutReader(resp.Body, DownloadStallTimeout) + } contentLength = resp.ContentLength } defer source.Close() @@ -640,19 +665,27 @@ func (uri URI) DownloadFileWithContext(ctx context.Context, filePath, sha string _, err = xio.Copy(ctx, io.MultiWriter(outFile, progress), source) if err != nil { - // Check if error is due to context cancellation - if errors.Is(err, context.Canceled) { - // Clean up partial file on cancellation - removePartialFile(tmpFilePath) - return err + // Detect cancellation via the context (a cause-cancelled read surfaces + // the cause, not context.Canceled). Keep the .partial for resume, + // except on a deliberate user abort (ErrUserCancelled), which discards + // it. A stall-guard abort leaves ctx uncancelled, so it falls through + // to the error path below and likewise preserves the partial. + if ctx.Err() != nil { + if errors.Is(context.Cause(ctx), ErrUserCancelled) { + _ = removePartialFile(tmpFilePath) + } + return ctx.Err() } return fmt.Errorf("failed to write file %q: %v", filePath, err) } - // Check for cancellation before finalizing + // Check for cancellation before finalizing. Keep the .partial for resume + // unless the user deliberately aborted. select { case <-ctx.Done(): - removePartialFile(tmpFilePath) + if errors.Is(context.Cause(ctx), ErrUserCancelled) { + _ = removePartialFile(tmpFilePath) + } return ctx.Err() default: } From 079ac0e15abb51c8330946dec5f506c7a1fdb3f7 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Fri, 19 Jun 2026 21:36:25 +0200 Subject: [PATCH 06/99] fix(realtime): raise WebRTC data-channel max-message-size + keep sendLoop alive (#10407) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(realtime): raise WebRTC data-channel max-message-size for large events Browsers advertise a conservative SCTP max-message-size in their SDP offer (Chrome uses 256 KiB). 
pion enforces the remote's advertised value on send, so a single realtime event larger than it cannot be sent over the "oai-events" data channel: SendText fails, the event is dropped, and the turn silently yields no response. Some turns legitimately produce a >256 KiB JSON event — notably tool calls with sizeable schemas or results. Browsers advertise the value conservatively but their SCTP stacks reassemble much larger messages, so raise the max-message-size honored for our own server-generated events by rewriting the attribute in the offer before SetRemoteDescription. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto * fix(realtime): keep the WebRTC sendLoop alive when one event send fails A failed SendText on the oai-events data channel exited the sender goroutine, so a single dropped event (e.g. one over the negotiated SCTP max-message-size) tore down the session and silently dropped every subsequent event. Log and skip the offending event instead and keep draining; a genuinely dead transport is still handled by the closed / connection-state path. 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- .../openai/realtime_transport_webrtc.go | 12 +++++-- core/http/endpoints/openai/realtime_webrtc.go | 7 ++-- .../endpoints/openai/realtime_webrtc_sctp.go | 29 ++++++++++++++++ .../openai/realtime_webrtc_sctp_test.go | 33 +++++++++++++++++++ 4 files changed, 76 insertions(+), 5 deletions(-) create mode 100644 core/http/endpoints/openai/realtime_webrtc_sctp.go create mode 100644 core/http/endpoints/openai/realtime_webrtc_sctp_test.go diff --git a/core/http/endpoints/openai/realtime_transport_webrtc.go b/core/http/endpoints/openai/realtime_transport_webrtc.go index b687654bd..9ddec5edb 100644 --- a/core/http/endpoints/openai/realtime_transport_webrtc.go +++ b/core/http/endpoints/openai/realtime_transport_webrtc.go @@ -113,8 +113,13 @@ func (t *WebRTCTransport) sendLoop() { return } if err := t.dc.SendText(string(data)); err != nil { - xlog.Error("data channel send failed", "error", err) - return + // Drop just this event and keep the loop alive: a single + // failed send (e.g. an event over the negotiated SCTP + // max-message-size) must not tear down the session and + // silently drop every subsequent event. A genuinely dead + // transport is handled by the <-t.closed case. 
+ xlog.Error("data channel send failed, dropping event", "error", err) + continue } case <-t.closed: // Drain any remaining queued events before exiting @@ -122,7 +127,8 @@ func (t *WebRTCTransport) sendLoop() { select { case data := <-t.outEvents: if err := t.dc.SendText(string(data)); err != nil { - return + xlog.Error("data channel send failed while draining, dropping event", "error", err) + continue } default: return diff --git a/core/http/endpoints/openai/realtime_webrtc.go b/core/http/endpoints/openai/realtime_webrtc.go index 0ac982c19..26edf94ea 100644 --- a/core/http/endpoints/openai/realtime_webrtc.go +++ b/core/http/endpoints/openai/realtime_webrtc.go @@ -128,10 +128,13 @@ func RealtimeCalls(application *application.Application) echo.HandlerFunc { handleIncomingAudioTrack(track, transport) }) - // Set the remote SDP (client's offer) + // Set the remote SDP (client's offer). Raise the data-channel + // max-message-size the browser advertised so pion permits the larger + // realtime events some turns produce (e.g. tool calls), which would + // otherwise be dropped on send. See realtime_webrtc_sctp.go. if err := pc.SetRemoteDescription(webrtc.SessionDescription{ Type: webrtc.SDPTypeOffer, - SDP: req.SDP, + SDP: raiseDataChannelMaxMessageSize(req.SDP), }); err != nil { transport.Close() xlog.Error("failed to set remote description", "error", err) diff --git a/core/http/endpoints/openai/realtime_webrtc_sctp.go b/core/http/endpoints/openai/realtime_webrtc_sctp.go new file mode 100644 index 000000000..b0355ba70 --- /dev/null +++ b/core/http/endpoints/openai/realtime_webrtc_sctp.go @@ -0,0 +1,29 @@ +package openai + +import ( + "fmt" + "regexp" +) + +// realtimeDataChannelMaxMessageSize is the SCTP max-message-size LocalAI honors +// for the "oai-events" data channel, in bytes. +// +// Browsers advertise a conservative max-message-size in their SDP offer (Chrome +// uses 262144 = 256 KiB). 
pion enforces the remote's advertised value on send, +// so a single realtime event larger than it cannot be sent: the SendText fails, +// the event is dropped, and the turn silently yields no response. Some turns +// legitimately produce a single JSON event above 256 KiB (notably tool calls +// with sizeable schemas or results). Browsers advertise this value +// conservatively but their SCTP stacks reassemble much larger messages, so we +// raise the value honored for our own server-generated events. +const realtimeDataChannelMaxMessageSize = 16 * 1024 * 1024 // 16 MiB + +var maxMessageSizeAttrRe = regexp.MustCompile(`a=max-message-size:\d+`) + +// raiseDataChannelMaxMessageSize rewrites the SCTP max-message-size attribute in +// an SDP offer to realtimeDataChannelMaxMessageSize so pion permits larger +// outbound realtime events. Offers that don't carry the attribute are returned +// unchanged. +func raiseDataChannelMaxMessageSize(sdp string) string { + return maxMessageSizeAttrRe.ReplaceAllString(sdp, fmt.Sprintf("a=max-message-size:%d", realtimeDataChannelMaxMessageSize)) +} diff --git a/core/http/endpoints/openai/realtime_webrtc_sctp_test.go b/core/http/endpoints/openai/realtime_webrtc_sctp_test.go new file mode 100644 index 000000000..92da4e706 --- /dev/null +++ b/core/http/endpoints/openai/realtime_webrtc_sctp_test.go @@ -0,0 +1,33 @@ +package openai + +import ( + "fmt" + "strings" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("raiseDataChannelMaxMessageSize", func() { + It("raises a max-message-size the browser advertised", func() { + offer := "v=0\r\nm=application 9 UDP/DTLS/SCTP webrtc-datachannel\r\na=max-message-size:262144\r\n" + out := raiseDataChannelMaxMessageSize(offer) + Expect(out).To(ContainSubstring(fmt.Sprintf("a=max-message-size:%d", realtimeDataChannelMaxMessageSize))) + Expect(out).NotTo(ContainSubstring("a=max-message-size:262144")) + }) + + It("leaves an offer without the attribute unchanged", func() { + offer := "v=0\r\nm=application 9 UDP/DTLS/SCTP webrtc-datachannel\r\n" + Expect(raiseDataChannelMaxMessageSize(offer)).To(Equal(offer)) + }) + + It("rewrites every occurrence", func() { + offer := "a=max-message-size:1024\r\na=max-message-size:262144\r\n" + out := raiseDataChannelMaxMessageSize(offer) + Expect(strings.Count(out, fmt.Sprintf("a=max-message-size:%d", realtimeDataChannelMaxMessageSize))).To(Equal(2)) + }) + + It("raises above the 256 KiB browsers advertise", func() { + Expect(realtimeDataChannelMaxMessageSize).To(BeNumerically(">", 262144)) + }) +}) From c43a752afc288bb8691a6843e9cd8f1d51fc5e31 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:36:22 +0200 Subject: [PATCH 07/99] chore: :arrow_up: Update ServeurpersoCom/omnivoice.cpp to `96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd` (#10408) :arrow_up: Update ServeurpersoCom/omnivoice.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- backend/go/omnivoice-cpp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/go/omnivoice-cpp/Makefile b/backend/go/omnivoice-cpp/Makefile index 7806ce11f..b42610aac 100644 --- a/backend/go/omnivoice-cpp/Makefile +++ b/backend/go/omnivoice-cpp/Makefile @@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1) # omnivoice.cpp 
version OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp -OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509 +OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd SO_TARGET?=libgomnivoicecpp.so CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF From dd928f0bddd699aaf051574a23ce9049200e212e Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:36:36 +0200 Subject: [PATCH 08/99] chore: :arrow_up: Update ServeurpersoCom/qwentts.cpp to `26fcea5468e4069bc72d1f2fcc812c985e7361bb` (#10409) :arrow_up: Update ServeurpersoCom/qwentts.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- backend/go/qwen3-tts-cpp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/go/qwen3-tts-cpp/Makefile b/backend/go/qwen3-tts-cpp/Makefile index e5f6a838f..84c543af6 100644 --- a/backend/go/qwen3-tts-cpp/Makefile +++ b/backend/go/qwen3-tts-cpp/Makefile @@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1) # qwentts.cpp version QWEN3TTS_REPO?=https://github.com/ServeurpersoCom/qwentts.cpp -QWEN3TTS_CPP_VERSION?=0bf4a18b22e8bb8718d95294e9f7f45c0d4270a4 +QWEN3TTS_CPP_VERSION?=26fcea5468e4069bc72d1f2fcc812c985e7361bb SO_TARGET?=libgoqwen3ttscpp.so CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF From f143d7f6885596ae228f99338604b86fe0c53f4c Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:36:51 +0200 Subject: [PATCH 09/99] chore: :arrow_up: Update ikawrakow/ik_llama.cpp to `d47f484d299cafad2e606afc0d31677a91b242d0` (#10410) :arrow_up: Update ikawrakow/ik_llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- backend/cpp/ik-llama-cpp/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/backend/cpp/ik-llama-cpp/Makefile b/backend/cpp/ik-llama-cpp/Makefile index 85b7ee4a8..39fa7fa4e 100644 --- a/backend/cpp/ik-llama-cpp/Makefile +++ b/backend/cpp/ik-llama-cpp/Makefile @@ -1,5 +1,5 @@ -IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be +IK_LLAMA_VERSION?=d47f484d299cafad2e606afc0d31677a91b242d0 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp CMAKE_ARGS?= From 8915f2ab917f90c3bb6a734b15bf95f72189ba06 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:37:06 +0200 Subject: [PATCH 10/99] chore: :arrow_up: Update ggml-org/whisper.cpp to `5ed76e9a079962f1c85cfce44edd325c27ef1f97` (#10396) :arrow_up: Update ggml-org/whisper.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- backend/go/whisper/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/go/whisper/Makefile b/backend/go/whisper/Makefile index e291e4d62..9858b1d07 100644 --- a/backend/go/whisper/Makefile +++ b/backend/go/whisper/Makefile @@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1) # whisper.cpp version WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp -WHISPER_CPP_VERSION?=86c40c3bd6fc86f1187fb751d111b49e0fc18e84 +WHISPER_CPP_VERSION?=5ed76e9a079962f1c85cfce44edd325c27ef1f97 SO_TARGET?=libgowhisper.so CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF From 11aee03a805faa5ab72034afbef06a4037f737ef Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:37:21 +0200 Subject: [PATCH 11/99] chore: :arrow_up: Update localai-org/privacy-filter.cpp to `98f52c5ef2250f207cc6b9a6aef05393a120cb7c` (#10394) :arrow_up: Update localai-org/privacy-filter.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- 
backend/cpp/privacy-filter/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/cpp/privacy-filter/Makefile b/backend/cpp/privacy-filter/Makefile index 173d4176b..774f2c433 100644 --- a/backend/cpp/privacy-filter/Makefile +++ b/backend/cpp/privacy-filter/Makefile @@ -8,7 +8,7 @@ # Local development: point at a working checkout instead of cloning, e.g. # make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server -PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d +PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp PRIVACY_FILTER_SRC?= From 93706fec57c98d689a10737cdc042c1a29ce7969 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:37:33 +0200 Subject: [PATCH 12/99] chore: :arrow_up: Update mudler/parakeet.cpp to `db755a78d39f789bb7d4e3935158a9e8105dbe36` (#10393) :arrow_up: Update mudler/parakeet.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> --- backend/go/parakeet-cpp/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/go/parakeet-cpp/Makefile b/backend/go/parakeet-cpp/Makefile index 2ea86a0c6..9a781d634 100644 --- a/backend/go/parakeet-cpp/Makefile +++ b/backend/go/parakeet-cpp/Makefile @@ -1,6 +1,6 @@ # parakeet-cpp backend Makefile. # -# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45 +# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36 # (.github/bump_deps.sh) can find and update it - matches the # whisper.cpp / ds4 / vibevoice-cpp convention. # @@ -15,7 +15,7 @@ # That's what the L0 smoke test uses. The default target below does the # proper clone-at-pin + cmake build so CI doesn't need a side-checkout. 
-PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45 +PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp GOCMD?=go From 518381278e362423ab70aaa8ad23c7b44ee13a03 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 08:22:22 +0200 Subject: [PATCH 13/99] chore: :arrow_up: Update ggml-org/llama.cpp to `e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62` (#10392) * :arrow_up: Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * fix(llama-cpp): adapt grpc-server to upstream server-schema split Upstream llama.cpp (e475fa2) extracted the JSON request-schema evaluation out of the static server_task::params_from_json_cmpl into the new server_schema::eval_llama_cmpl_schema (tools/server/server-schema.cpp). The grpc-server unity build still called the old static member, breaking every llama-cpp backend build with "no member named 'params_from_json_cmpl' in 'server_task'". Pull server-schema.cpp into the translation unit and call the new function, keeping both guarded by __has_include so forks that predate the split (e.g. llama-cpp-turboquant, which still exposes params_from_json_cmpl) keep compiling against the old static member. 
Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Ettore Di Giacinto Co-authored-by: mudler <2420543+mudler@users.noreply.github.com> Co-authored-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/Makefile | 2 +- backend/cpp/llama-cpp/grpc-server.cpp | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index 64414ec30..bf9f4f608 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -1,5 +1,5 @@ -LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950 +LLAMA_VERSION?=e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp CMAKE_ARGS?= diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 8502e9530..c2e7f22e4 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -18,6 +18,18 @@ #if __has_include("server-chat.cpp") #include "server-chat.cpp" #endif +// server-schema.cpp exists only in llama.cpp after the upstream refactor that +// extracted the JSON request-schema evaluation (previously the static +// server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema. +// server-context.cpp and grpc-server.cpp both call into it, so its definitions +// must be part of this translation unit or the link fails. __has_include keeps +// the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that +// predate the split and still expose params_from_json_cmpl (see the guarded +// call sites below). 
+#if __has_include("server-schema.cpp") +#define LOCALAI_HAS_SERVER_SCHEMA 1 +#include "server-schema.cpp" +#endif #include "server-context.cpp" // LocalAI @@ -2102,7 +2114,11 @@ public: task.index = i; task.tokens = std::move(inputs[i]); +#ifdef LOCALAI_HAS_SERVER_SCHEMA + task.params = server_schema::eval_llama_cmpl_schema( +#else task.params = server_task::params_from_json_cmpl( +#endif ctx_server.impl->vocab, params_base, ctx_server.get_meta().slot_n_ctx, @@ -2116,7 +2132,7 @@ public: // cannot detect tool calls or separate reasoning from content. task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT; task.params.oaicompat_cmpl_id = completion_id; - // oaicompat_model is already populated by params_from_json_cmpl + // oaicompat_model is already populated by eval_llama_cmpl_schema tasks.push_back(std::move(task)); } @@ -2940,7 +2956,11 @@ public: task.index = i; task.tokens = std::move(inputs[i]); +#ifdef LOCALAI_HAS_SERVER_SCHEMA + task.params = server_schema::eval_llama_cmpl_schema( +#else task.params = server_task::params_from_json_cmpl( +#endif ctx_server.impl->vocab, params_base, ctx_server.get_meta().slot_n_ctx, @@ -2952,7 +2972,7 @@ public: // reasoning, tool calls, and content are classified into ChatDeltas. task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT; task.params.oaicompat_cmpl_id = completion_id; - // oaicompat_model is already populated by params_from_json_cmpl + // oaicompat_model is already populated by eval_llama_cmpl_schema tasks.push_back(std::move(task)); } From 1be959ce30e68ed686a630932dba2754a6d5fed9 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 11:04:56 +0200 Subject: [PATCH 14/99] docs: mention apex-quant in the README (#10412) Add apex-quant (MoE per-tensor/per-layer quantization recipe) to the "Backends built by us" section as a note after the engines table, since it is a quantization recipe rather than a native inference engine. 
Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b05af2dfb..5fff7db69 100644 --- a/README.md +++ b/README.md @@ -240,6 +240,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native | [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation | | [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) | +We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp. + ## Resources - [Documentation](https://localai.io/) From b081247d95bff5bde5e5147e1add185e1fbc1a31 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 14:45:59 +0200 Subject: [PATCH 15/99] =?UTF-8?q?feat(config):=20hardware-tuned=20defaults?= =?UTF-8?q?=20=E2=80=94=20Blackwell=20batch=20+=20VRAM-scaled=20concurrenc?= =?UTF-8?q?y=20(#10411)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(config): node-aware hardware defaults — larger physical batch on Blackwell A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 -> ub2048 ~3316 t/s) and saturates around 2048. The heuristic lives in core/config alongside the other config overriders (ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the ModelConfig from heuristics, so hardware tuning is the same domain and stays in one place. 
It is parameterized on a GPU descriptor (not direct detection) so it works in both deployment shapes: - Single host: SetDefaults applies it with the LocalGPU. - Distributed: only the worker sees the GPU, so the worker reports its compute capability on registration (gpu_compute_capability -> BackendNode), and the router re-applies the SAME core/config heuristic for the SELECTED node before loading — fixing the case where the frontend has no GPU at all. Explicit `batch:` always wins (only managed default values are touched). xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto * test(config): injectable local-GPU seam + single-instance coverage Make local GPU detection an injectable package var (localGPU) so the single-instance path (SetDefaults -> ApplyHardwareDefaults) is deterministically testable without a real GPU, mirroring the distributed override's coverage. Adds specs asserting SetDefaults sets the Blackwell physical batch, leaves it unset on non-Blackwell, and never overrides an explicit batch. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto * feat(config): default concurrent serving (n_parallel) by GPU VRAM The llama.cpp backend defaults n_parallel=1, which serializes multi-user requests and leaves continuous batching off (it auto-enables only at n_parallel>1). Fold a VRAM-scaled parallel-slot default into the hardware-config path so multi-user serving works out of the box: >=32GiB->8, >=8GiB->4, >=4GiB->2, else unchanged. With the backend's unified KV the slots SHARE the context budget, so this adds concurrency without multiplying KV memory. Explicit parallel/n_parallel always wins. 
EnsureParallelOption is shared by the single-host path (ApplyHardwareDefaults with the local GPU) and the distributed router (per selected node's reported VRAM, since the frontend may have no GPU). LocalGPU now also reports VRAM. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- core/config/hardware_defaults.go | 190 ++++++++++++++++++ .../config/hardware_defaults_internal_test.go | 37 ++++ core/config/hardware_defaults_test.go | 97 +++++++++ core/config/model_config.go | 5 + core/http/endpoints/localai/nodes.go | 48 +++-- core/services/nodes/registry.go | 6 + core/services/nodes/router.go | 30 +++ .../nodes/router_hardware_internal_test.go | 46 +++++ core/services/worker/registration.go | 5 + pkg/xsysinfo/computecap_internal_test.go | 23 +++ pkg/xsysinfo/gpu.go | 98 ++++++++- 11 files changed, 553 insertions(+), 32 deletions(-) create mode 100644 core/config/hardware_defaults.go create mode 100644 core/config/hardware_defaults_internal_test.go create mode 100644 core/config/hardware_defaults_test.go create mode 100644 core/services/nodes/router_hardware_internal_test.go create mode 100644 pkg/xsysinfo/computecap_internal_test.go diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go new file mode 100644 index 000000000..2ed54265f --- /dev/null +++ b/core/config/hardware_defaults.go @@ -0,0 +1,190 @@ +package config + +import ( + "fmt" + "strconv" + "strings" + + "github.com/mudler/LocalAI/pkg/xsysinfo" + "github.com/mudler/xlog" +) + +// Hardware-driven model-config defaults. +// +// This sits alongside the other config overriders (ApplyInferenceDefaults for +// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all +// heuristically fill ModelConfig values the user left unset. 
Hardware tuning is +// the same domain — "adjust the config from the device that will run it" — so +// it lives here rather than scattered into the backend or a separate package. +// +// The heuristics are parameterized on a GPU descriptor (not on direct +// detection) so they apply in both deployment shapes: SetDefaults passes the +// LocalGPU on a single host, and the distributed router passes the *selected +// node's* reported GPU before loading there (the frontend that loaded the +// config may have no GPU at all). + +// GPU describes the device that will run a model. +type GPU struct { + // Vendor is "nvidia", "amd", … (matches xsysinfo vendor constants). + Vendor string + // ComputeCapability is the NVIDIA compute capability as "major.minor" + // (e.g. "12.1" for GB10 / DGX Spark). Empty for non-NVIDIA / unknown. + ComputeCapability string + // VRAM is total device memory in bytes (0 = unknown). + VRAM uint64 +} + +// Physical batch (n_batch / n_ubatch) defaults. +const ( + // DefaultPhysicalBatch is the conservative default when no hardware-specific + // tuning applies. Matches backend.DefaultBatchSize. + DefaultPhysicalBatch = 512 + // BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs + // (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical + // batch materially lifts MoE prefill there (per-expert GEMM tiles fill + // better); measured on a GB10 with Qwen3-30B-A3B to saturate around 2048. + BlackwellPhysicalBatch = 2048 +) + +// IsNVIDIABlackwell reports whether the GPU is in the NVIDIA Blackwell consumer +// family (sm_12x). Datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0) +// reports a different compute capability and is intentionally not matched. 
+func (g GPU) IsNVIDIABlackwell() bool { + maj, _ := parseComputeCapability(g.ComputeCapability) + return maj >= 12 +} + +// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the +// given hardware, used when the model config leaves batch unset. +func PhysicalBatch(g GPU) int { + if g.IsNVIDIABlackwell() { + return BlackwellPhysicalBatch + } + return DefaultPhysicalBatch +} + +// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns. +// Callers that re-tune a value chosen by an upstream host (the distributed +// router correcting the frontend's guess) use this to avoid clobbering an +// explicit user batch such as 1024. +func IsManagedPhysicalBatch(n int) bool { + return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch +} + +// Parallel-slot (n_parallel) VRAM tiers. llama.cpp serializes requests at +// n_parallel=1 (the backend default) and only auto-enables continuous batching +// when n_parallel > 1 — so a single-slot default makes concurrent requests +// queue. We default a slot count by GPU size so multi-user serving works out of +// the box. With the backend's unified KV cache the slots SHARE the context +// budget, so more slots add concurrency without multiplying KV memory. +const ( + parallelSlotsVRAMHigh = uint64(32) << 30 // >=32 GiB -> 8 slots + parallelSlotsVRAMMid = uint64(8) << 30 // >=8 GiB -> 4 slots + parallelSlotsVRAMLow = uint64(4) << 30 // >=4 GiB -> 2 slots +) + +// DefaultParallelSlots returns the n_parallel default for the given GPU. Returns +// 1 (no concurrency) when VRAM is unknown or too small, so we never change +// behavior on CPU-only / tiny devices. 
+func DefaultParallelSlots(g GPU) int { + switch { + case g.VRAM >= parallelSlotsVRAMHigh: + return 8 + case g.VRAM >= parallelSlotsVRAMMid: + return 4 + case g.VRAM >= parallelSlotsVRAMLow: + return 2 + default: + return 1 + } +} + +// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the +// model doesn't already set one (and the GPU warrants concurrency). Returns the +// possibly-extended options. Shared by the single-host config path +// (ApplyHardwareDefaults) and the distributed router (per selected node). +func EnsureParallelOption(opts []string, gpu GPU) []string { + if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) { + return append(opts, fmt.Sprintf("parallel:%d", slots)) + } + return opts +} + +// hasParallelOption reports whether the model already sets parallel/n_parallel +// (backend options are "name:value" strings) so we never override an explicit value. +func hasParallelOption(opts []string) bool { + for _, o := range opts { + name := o + if i := strings.IndexByte(o, ':'); i >= 0 { + name = o[:i] + } + switch strings.TrimSpace(strings.ToLower(name)) { + case "parallel", "n_parallel": + return true + } + } + return false +} + +// localGPU builds a GPU descriptor from local detection, used by SetDefaults on +// a single host (the distributed router builds it from the selected node's +// reported info instead). It is a package var so tests can inject a +// deterministic device — detection does a live nvidia-smi call. +var localGPU = func() GPU { + vendor, _ := xsysinfo.DetectGPUVendor() + vram, _ := xsysinfo.TotalAvailableVRAM() + return GPU{ + Vendor: vendor, + ComputeCapability: xsysinfo.NVIDIAComputeCapability(), + VRAM: vram, + } +} + +// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU +// and were left unset by the user. Currently: a larger physical batch on +// Blackwell. Explicit config always wins (we only touch zero values). 
+func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) { + if cfg == nil { + return + } + if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() { + cfg.Batch = BlackwellPhysicalBatch + xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch", + "batch", cfg.Batch, "compute_cap", gpu.ComputeCapability) + } + + // Enable concurrent serving by default on a capable GPU: without this the + // llama.cpp backend runs n_parallel=1 and serializes multi-user requests + // (continuous batching stays off). Unified KV means the slots share the + // context budget, so this is concurrency without extra KV memory. Explicit + // parallel/n_parallel in the model options always wins. + if before := len(cfg.Options); true { + cfg.Options = EnsureParallelOption(cfg.Options, gpu) + if len(cfg.Options) > before { + xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving", + "option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30) + } + } +} + +// parseComputeCapability splits a "major.minor" string into integer parts. +// Returns (-1, -1) when it can't be parsed. +func parseComputeCapability(cc string) (int, int) { + cc = strings.TrimSpace(cc) + if cc == "" { + return -1, -1 + } + majStr, minStr := cc, "0" + if dot := strings.IndexByte(cc, '.'); dot >= 0 { + majStr, minStr = cc[:dot], cc[dot+1:] + } + maj, err := strconv.Atoi(strings.TrimSpace(majStr)) + if err != nil { + return -1, -1 + } + min, err := strconv.Atoi(strings.TrimSpace(minStr)) + if err != nil { + min = 0 + } + return maj, min +} diff --git a/core/config/hardware_defaults_internal_test.go b/core/config/hardware_defaults_internal_test.go new file mode 100644 index 000000000..52c674c2d --- /dev/null +++ b/core/config/hardware_defaults_internal_test.go @@ -0,0 +1,37 @@ +package config + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// Single-instance path: SetDefaults applies hardware defaults from the local +// GPU. 
The detection seam (localGPU) is injected so the path is deterministic +// without a real GPU. +var _ = Describe("SetDefaults hardware defaults (single-instance)", func() { + var orig func() GPU + BeforeEach(func() { orig = localGPU }) + AfterEach(func() { localGPU = orig }) + + It("sets the physical batch on a local Blackwell GPU", func() { + localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} } + cfg := &ModelConfig{} + cfg.SetDefaults() + Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch)) + }) + + It("leaves batch unset on a non-Blackwell local GPU", func() { + localGPU = func() GPU { return GPU{ComputeCapability: "8.9"} } + cfg := &ModelConfig{} + cfg.SetDefaults() + Expect(cfg.Batch).To(Equal(0)) + }) + + It("never overrides an explicit batch", func() { + localGPU = func() GPU { return GPU{ComputeCapability: "12.1"} } + cfg := &ModelConfig{} + cfg.Batch = 1024 + cfg.SetDefaults() + Expect(cfg.Batch).To(Equal(1024)) + }) +}) diff --git a/core/config/hardware_defaults_test.go b/core/config/hardware_defaults_test.go new file mode 100644 index 000000000..ae7bf3964 --- /dev/null +++ b/core/config/hardware_defaults_test.go @@ -0,0 +1,97 @@ +package config_test + +import ( + . "github.com/mudler/LocalAI/core/config" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Hardware-driven config defaults", func() { + DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)", + func(cc string, want bool) { + Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want)) + }, + Entry("GB10 12.1", "12.1", true), + Entry("RTX 50 12.0", "12.0", true), + Entry("future 13.0", "13.0", true), + Entry("Hopper 9.0", "9.0", false), + Entry("Ada 8.9", "8.9", false), + Entry("datacenter Blackwell sm_100 10.0", "10.0", false), + Entry("unknown", "", false), + ) + + Describe("PhysicalBatch / IsManagedPhysicalBatch", func() { + It("returns the Blackwell batch on Blackwell", func() { + Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch)) + }) + It("returns the default batch otherwise", func() { + Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch)) + Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch)) + }) + It("recognizes managed defaults but not explicit values", func() { + Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue()) + Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue()) + Expect(IsManagedPhysicalBatch(1024)).To(BeFalse()) + }) + }) + + Describe("ApplyHardwareDefaults", func() { + It("raises an unset batch to 2048 on Blackwell", func() { + cfg := &ModelConfig{} + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"}) + Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch)) + }) + It("leaves batch unset on non-Blackwell", func() { + cfg := &ModelConfig{} + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"}) + Expect(cfg.Batch).To(Equal(0)) + }) + It("never overrides an explicit batch", func() { + cfg := &ModelConfig{} + cfg.Batch = 1024 + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"}) + Expect(cfg.Batch).To(Equal(1024)) + }) + It("no-ops on nil", func() { + Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic()) + }) + }) + 
+ const gib = uint64(1) << 30 + + DescribeTable("DefaultParallelSlots (by VRAM)", + func(vramGiB uint64, want int) { + Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want)) + }, + Entry("GB10 119 GiB", uint64(119), 8), + Entry("48 GiB", uint64(48), 8), + Entry("24 GiB", uint64(24), 4), + Entry("8 GiB", uint64(8), 4), + Entry("6 GiB", uint64(6), 2), + Entry("2 GiB", uint64(2), 1), + Entry("unknown 0", uint64(0), 1), + ) + + Describe("ApplyHardwareDefaults parallel slots", func() { + It("adds a VRAM-scaled parallel option on a capable GPU", func() { + cfg := &ModelConfig{} + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib}) + Expect(cfg.Options).To(ContainElement("parallel:8")) + }) + It("scales the slot count down with VRAM", func() { + cfg := &ModelConfig{} + ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib}) + Expect(cfg.Options).To(ContainElement("parallel:4")) + }) + It("adds no parallel option on small/unknown VRAM", func() { + cfg := &ModelConfig{} + ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib}) + Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel"))) + }) + It("never overrides an explicit parallel option", func() { + cfg := &ModelConfig{Options: []string{"parallel:2"}} + ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib}) + Expect(cfg.Options).To(Equal([]string{"parallel:2"})) + }) + }) +}) diff --git a/core/config/model_config.go b/core/config/model_config.go index dfe151a64..75136ec6c 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -1111,6 +1111,11 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) { // This ensures gallery-installed and runtime-loaded models get optimal parameters. ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model) + // Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell). + // Uses the local GPU here; in distributed mode the router re-applies the same + // heuristics for the selected node's GPU before loading. 
Explicit config wins. + ApplyHardwareDefaults(cfg, localGPU()) + // https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22 defaultTopP := 0.95 defaultTopK := 40 diff --git a/core/http/endpoints/localai/nodes.go b/core/http/endpoints/localai/nodes.go index 5a6edab22..820cb137f 100644 --- a/core/http/endpoints/localai/nodes.go +++ b/core/http/endpoints/localai/nodes.go @@ -70,17 +70,20 @@ func GetNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc { // RegisterNodeRequest is the request body for registering a new worker node. type RegisterNodeRequest struct { - Name string `json:"name"` - NodeType string `json:"node_type,omitempty"` // "backend" (default) or "agent" - Address string `json:"address"` - HTTPAddress string `json:"http_address,omitempty"` - Token string `json:"token,omitempty"` - TotalVRAM uint64 `json:"total_vram,omitempty"` - AvailableVRAM uint64 `json:"available_vram,omitempty"` - TotalRAM uint64 `json:"total_ram,omitempty"` - AvailableRAM uint64 `json:"available_ram,omitempty"` - GPUVendor string `json:"gpu_vendor,omitempty"` - Labels map[string]string `json:"labels,omitempty"` + Name string `json:"name"` + NodeType string `json:"node_type,omitempty"` // "backend" (default) or "agent" + Address string `json:"address"` + HTTPAddress string `json:"http_address,omitempty"` + Token string `json:"token,omitempty"` + TotalVRAM uint64 `json:"total_vram,omitempty"` + AvailableVRAM uint64 `json:"available_vram,omitempty"` + TotalRAM uint64 `json:"total_ram,omitempty"` + AvailableRAM uint64 `json:"available_ram,omitempty"` + GPUVendor string `json:"gpu_vendor,omitempty"` + // GPUComputeCapability is the worker GPU's compute capability ("major.minor", + // e.g. "12.1" for GB10). Used by the router for per-arch option tuning. 
+ GPUComputeCapability string `json:"gpu_compute_capability,omitempty"` + Labels map[string]string `json:"labels,omitempty"` // MaxReplicasPerModel is the per-node cap on replicas of any single model. // Workers older than this field omit it; we coerce 0 → 1 below to preserve // historical single-replica behavior. @@ -152,17 +155,18 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au } node := &nodes.BackendNode{ - Name: req.Name, - NodeType: nodeType, - Address: req.Address, - HTTPAddress: req.HTTPAddress, - TokenHash: tokenHash, - TotalVRAM: req.TotalVRAM, - AvailableVRAM: req.AvailableVRAM, - TotalRAM: req.TotalRAM, - AvailableRAM: req.AvailableRAM, - GPUVendor: req.GPUVendor, - MaxReplicasPerModel: maxReplicasPerModel, + Name: req.Name, + NodeType: nodeType, + Address: req.Address, + HTTPAddress: req.HTTPAddress, + TokenHash: tokenHash, + TotalVRAM: req.TotalVRAM, + AvailableVRAM: req.AvailableVRAM, + TotalRAM: req.TotalRAM, + AvailableRAM: req.AvailableRAM, + GPUVendor: req.GPUVendor, + GPUComputeCapability: req.GPUComputeCapability, + MaxReplicasPerModel: maxReplicasPerModel, } ctx := c.Request().Context() diff --git a/core/services/nodes/registry.go b/core/services/nodes/registry.go index 3d34d086c..aafee13cb 100644 --- a/core/services/nodes/registry.go +++ b/core/services/nodes/registry.go @@ -36,6 +36,11 @@ type BackendNode struct { TotalRAM uint64 `gorm:"column:total_ram" json:"total_ram"` // Total system RAM in bytes (fallback when no GPU) AvailableRAM uint64 `gorm:"column:available_ram" json:"available_ram"` // Available system RAM in bytes GPUVendor string `gorm:"column:gpu_vendor;size:32" json:"gpu_vendor"` // nvidia, amd, intel, vulkan, unknown + // GPUComputeCapability is the worker GPU's compute capability as + // "major.minor" (e.g. "12.1" for GB10 / DGX Spark). Reported by the worker + // on registration; used by the router to pick per-arch options (e.g. a + // larger physical batch on Blackwell). 
Empty when unknown / non-NVIDIA. + GPUComputeCapability string `gorm:"column:gpu_compute_capability;size:16" json:"gpu_compute_capability"` // MaxReplicasPerModel caps how many replicas of any one model can run on // this node concurrently. Default 1 preserves the historical "one // (node, model)" assumption; set higher (via worker --max-replicas-per-model) @@ -69,6 +74,7 @@ const ( ColReservedVRAM = "reserved_vram" ColAvailableRAM = "available_ram" ColGPUVendor = "gpu_vendor" + ColGPUComputeCap = "gpu_compute_capability" ColLastHeartbeat = "last_heartbeat" ColMaxReplicasPerModel = "max_replicas_per_model" ) diff --git a/core/services/nodes/router.go b/core/services/nodes/router.go index e5ce52306..ccbf48f43 100644 --- a/core/services/nodes/router.go +++ b/core/services/nodes/router.go @@ -12,6 +12,7 @@ import ( "strings" "time" + "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/services/advisorylock" "github.com/mudler/LocalAI/core/services/nodes/prefixcache" "github.com/mudler/LocalAI/pkg/distributedhdr" @@ -138,6 +139,30 @@ type scheduleLoadResult struct { ReplicaIndex int } +// applyNodeHardwareDefaults tunes node-agnostic ModelOptions to the GPU of the +// node that was actually selected to run the model, reusing the same hardware +// heuristics as single-host config loading (core/config). On Blackwell it +// raises the physical batch; on non-Blackwell it resets a hardware-default that +// an upstream host (the GPU-less frontend in distributed mode) guessed higher. +// Only values the heuristics themselves manage are touched, so an explicit user +// batch (e.g. 1024) is never overridden. 
+func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) { + if opts == nil || node == nil { + return + } + gpu := config.GPU{ + Vendor: node.GPUVendor, + ComputeCapability: node.GPUComputeCapability, + VRAM: node.TotalVRAM, + } + if config.IsManagedPhysicalBatch(int(opts.NBatch)) { + opts.NBatch = int32(config.PhysicalBatch(gpu)) + } + // Default concurrent serving for the selected node (the frontend that built + // the options may have no GPU). Only adds when no parallel option is set. + opts.Options = config.EnsureParallelOption(opts.Options, gpu) +} + // scheduleAndLoad is the shared core for loading a model on a new node. // Used by both Route() (for first-time loads) and ScheduleAndLoadModel() (for reconciler scale-ups). // @@ -153,6 +178,11 @@ func (r *SmartRouter) scheduleAndLoad(ctx context.Context, backendType, tracking return nil, fmt.Errorf("no available nodes: %w", err) } + // Tune node-agnostic options to the SELECTED node's GPU. Only now do we know + // which node (and its compute capability) will run the model — the frontend + // that built modelOpts may have no GPU at all in distributed mode. + applyNodeHardwareDefaults(modelOpts, node) + // Pre-stage model files via FileStager before loading loadOpts := modelOpts if r.fileStager != nil && modelOpts != nil { diff --git a/core/services/nodes/router_hardware_internal_test.go b/core/services/nodes/router_hardware_internal_test.go new file mode 100644 index 000000000..2418bf444 --- /dev/null +++ b/core/services/nodes/router_hardware_internal_test.go @@ -0,0 +1,46 @@ +package nodes + +import ( + "github.com/mudler/LocalAI/core/config" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("applyNodeHardwareDefaults", func() { + It("raises a managed default batch on a Blackwell node", func() { + opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch} + applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"}) + Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch)) + }) + + It("resets a Blackwell guess on a non-Blackwell node", func() { + // frontend (Blackwell) guessed high, but the selected node is not Blackwell + opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch} + applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "9.0"}) + Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch)) + }) + + It("never overrides an explicit (non-managed) batch", func() { + opts := &pb.ModelOptions{NBatch: 1024} + applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"}) + Expect(opts.NBatch).To(BeEquivalentTo(int32(1024))) + }) + + It("adds a VRAM-scaled parallel option for the selected node", func() { + // frontend may have had no GPU (no parallel option); the node has a big GPU + opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch} + applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30}) + Expect(opts.Options).To(ContainElement("parallel:8")) + }) + + It("never overrides an explicit parallel option on the node path", func() { + opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, Options: []string{"parallel:2"}} + applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30}) + Expect(opts.Options).To(Equal([]string{"parallel:2"})) + }) + + It("no-ops on nil inputs", func() { + Expect(func() { applyNodeHardwareDefaults(nil, nil) }).ToNot(Panic()) + }) +}) diff --git a/core/services/worker/registration.go b/core/services/worker/registration.go index 87a8a7966..432cc845b 100644 --- a/core/services/worker/registration.go +++ 
b/core/services/worker/registration.go @@ -73,6 +73,10 @@ func (cfg *Config) registrationBody() map[string]any { // Detect GPU info for VRAM-aware scheduling totalVRAM, _ := xsysinfo.TotalAvailableVRAM() gpuVendor, _ := xsysinfo.DetectGPUVendor() + // Compute capability (e.g. "12.1" for GB10) lets the router pick per-arch + // options (e.g. larger physical batch on Blackwell). Detected on the worker + // because only the worker sees the GPU in distributed mode. + gpuComputeCap := xsysinfo.NVIDIAComputeCapability() maxReplicas := cfg.MaxReplicasPerModel if maxReplicas < 1 { @@ -85,6 +89,7 @@ func (cfg *Config) registrationBody() map[string]any { "total_vram": totalVRAM, "available_vram": totalVRAM, // initially all VRAM is available "gpu_vendor": gpuVendor, + "gpu_compute_capability": gpuComputeCap, "max_replicas_per_model": maxReplicas, } diff --git a/pkg/xsysinfo/computecap_internal_test.go b/pkg/xsysinfo/computecap_internal_test.go new file mode 100644 index 000000000..3bf2602d0 --- /dev/null +++ b/pkg/xsysinfo/computecap_internal_test.go @@ -0,0 +1,23 @@ +package xsysinfo + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("parseComputeCap", func() { + DescribeTable("splits major.minor", + func(in string, maj, min int) { + m, n := parseComputeCap(in) + Expect(m).To(Equal(maj)) + Expect(n).To(Equal(min)) + }, + Entry("GB10 / DGX Spark", "12.1", 12, 1), + Entry("RTX 50-series", "12.0", 12, 0), + Entry("Hopper", "9.0", 9, 0), + Entry("major only", "12", 12, 0), + Entry("whitespace", " 12.1 ", 12, 1), + Entry("empty", "", -1, -1), + Entry("garbage", "abc", -1, -1), + ) +}) diff --git a/pkg/xsysinfo/gpu.go b/pkg/xsysinfo/gpu.go index a5575edb8..f0185ddeb 100644 --- a/pkg/xsysinfo/gpu.go +++ b/pkg/xsysinfo/gpu.go @@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{ // GPUMemoryInfo contains real-time GPU memory usage information type GPUMemoryInfo struct { - Index int `json:"index"` - Name string `json:"name"` - Vendor string `json:"vendor"` + Index int `json:"index"` + Name string `json:"name"` + Vendor string `json:"vendor"` // BDF is the canonical PCI bus address (dddd:bb:dd.f) when known. // Populated by detection paths that can attribute the device to a // PCI location (clinfo, future amdgpu/nvidia paths); empty for @@ -307,6 +307,84 @@ func GetGPUAggregateInfo() GPUAggregateInfo { return aggregate } +var ( + computeCapOnce sync.Once + computeCapResult string +) + +// NVIDIAComputeCapability returns the highest NVIDIA GPU compute capability on +// this host as a "major.minor" string (e.g. "12.1" for GB10 / DGX Spark), or "" +// when nvidia-smi is unavailable or reports none. Detected once and cached. +// +// This runs where the GPU actually is. In distributed mode it is reported by +// each worker on registration so the router can make per-node decisions rather +// than guessing from the (possibly GPU-less) frontend host. 
+func NVIDIAComputeCapability() string { + computeCapOnce.Do(func() { + computeCapResult = detectNVIDIAComputeCapability() + }) + return computeCapResult +} + +func detectNVIDIAComputeCapability() string { + if _, err := exec.LookPath("nvidia-smi"); err != nil { + return "" + } + + cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader") + + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String()) + return "" + } + + best := "" + bestMajor, bestMinor := -1, -1 + for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + maj, min := parseComputeCap(line) + if maj < 0 { + continue + } + if maj > bestMajor || (maj == bestMajor && min > bestMinor) { + bestMajor, bestMinor, best = maj, min, line + } + } + if best != "" { + xlog.Debug("NVIDIA compute capability detected", "compute_cap", best) + } + return best +} + +// parseComputeCap splits a "major.minor" compute-capability string into its +// integer parts. Returns (-1, -1) if it can't be parsed. 
+func parseComputeCap(cc string) (int, int) { + cc = strings.TrimSpace(cc) + if cc == "" { + return -1, -1 + } + majStr, minStr := cc, "0" + if dot := strings.IndexByte(cc, '.'); dot >= 0 { + majStr, minStr = cc[:dot], cc[dot+1:] + } + maj, err := strconv.Atoi(strings.TrimSpace(majStr)) + if err != nil { + return -1, -1 + } + min, err := strconv.Atoi(strings.TrimSpace(minStr)) + if err != nil { + min = 0 + } + return maj, min +} + // getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi func getNVIDIAGPUMemory() []GPUMemoryInfo { // Check if nvidia-smi is available @@ -866,12 +944,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo { } type vulkanGPUTextInfo struct { - index int - name string - deviceType string - totalVRAM uint64 - budgetVRAM uint64 - usageVRAM uint64 + index int + name string + deviceType string + totalVRAM uint64 + budgetVRAM uint64 + usageVRAM uint64 } func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo { @@ -909,7 +987,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo { } else if current.usageVRAM != 0 && current.budgetVRAM == 0 { current.budgetVRAM = current.totalVRAM - current.usageVRAM } else if current.usageVRAM == 0 && current.budgetVRAM == 0 { - current.usageVRAM = 0 + current.usageVRAM = 0 current.budgetVRAM = current.totalVRAM } From e19c43cf043032c47da2a370b2f4d967b89b2035 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 14:56:16 +0200 Subject: [PATCH 16/99] feat(gallery): add Depth Anything V2 models + bump native version (#10413) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(gallery): add Depth Anything V2 models + bump native version Add Depth Anything V2 (DA2) support to the depth-anything backend. DA2 is depth-only (no camera pose, no confidence) and ships both relative (relative inverse depth) and metric (depth in metres) variants. 
The Go backend is model-agnostic, so no backend code changes are required — only a native version bump and new gallery entries. - backend/go/depth-anything-cpp/Makefile: pin DEPTHANYTHING_VERSION to the depth-anything.cpp commit that adds the DA2 engine + C-API routing (e3dec57f13a52366bbc4f279ef44804915960a6b, kept alive by the upstream tag da2-support so it survives a squash-merge). - gallery/index.yaml: add 12 DA2 entries (4 base quants, small, large, plus Hypersim indoor and VKITTI outdoor metric models in S/B/L). Metric models carry the metric-depth tag; none carry camera-pose. Assisted-by: Claude:claude-opus-4-8 * chore(depth-anything-cpp): pin to merged DA2 master commit PR #1 (mudler/depth-anything.cpp) merged to master as f4e17de (squash); repoint the pin from the pre-merge commit to the canonical master commit. Assisted-by: Claude:claude-opus-4-8 --------- Co-authored-by: Ettore Di Giacinto --- backend/go/depth-anything-cpp/Makefile | 10 +- gallery/index.yaml | 227 +++++++++++++++++++++++++ 2 files changed, 233 insertions(+), 4 deletions(-) diff --git a/backend/go/depth-anything-cpp/Makefile b/backend/go/depth-anything-cpp/Makefile index 815d2b0db..f1a0b9f97 100644 --- a/backend/go/depth-anything-cpp/Makefile +++ b/backend/go/depth-anything-cpp/Makefile @@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1) # depth-anything.cpp. Pin to a specific commit for a stable build; a squash # merge upstream can orphan a branch, so the native version is pinned by SHA. -# This SHA adds the nested two-file metric C-API (abi_version 4, -# da_capi_load_nested) required by the depth-anything-3-nested gallery model; -# tag it (e.g. v0.1.3) upstream to keep the SHA alive. +# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only, +# relative + metric) on top of the nested two-file metric C-API (abi_version 4, +# da_capi_load_nested) required by the depth-anything-3-nested gallery model. 
+# It is the canonical master commit (squash-merge of mudler/depth-anything.cpp +# PR #1), so the pin no longer relies on the upstream da2-support tag. DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git -DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72 +DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118 ifeq ($(NATIVE),false) CMAKE_ARGS+=-DGGML_NATIVE=OFF diff --git a/gallery/index.yaml b/gallery/index.yaml index beede9e79..18d6b1839 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -8343,6 +8343,233 @@ - filename: depth-anything-nested-metric.gguf uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything-nested-metric.gguf sha256: "b54ed50cbc0b0c14fae1f8edd0fea8bd1cac0850485fd6e7eb2422c7a19e570e" +- &depth-anything-2-base + name: depth-anything-2-base + url: github:mudler/LocalAI/gallery/virtual.yaml@master + urls: + - https://github.com/mudler/depth-anything.cpp + - https://huggingface.co/depth-anything/Depth-Anything-V2 + - https://huggingface.co/mudler/depth-anything.cpp-gguf + description: | + Depth Anything V2 (base / ViT-B) monocular depth, served via the native + depth-anything.cpp backend (C++/ggml + purego, no Python at inference). + Given an image it returns a dense monocular depth map only — no camera pose, + no confidence. This is the relative variant (relative inverse depth). Use + GenerateImage (src -> normalized depth PNG at dst) or the Depth endpoint. + q4_k is the recommended CPU default.
+ license: apache-2.0 + icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4 + tags: + - depth-estimation + - depth-anything + - native + - cpp + - cpu + overrides: + backend: depth-anything + parameters: + model: depth-anything2-base-q4_k.gguf + files: + - filename: depth-anything2-base-q4_k.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-base-q4_k.gguf + sha256: "49e77ec7e593080111242fa76017cae3e26498d550841cf8a70dfcd36bb175f2" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-base-q8_0 + description: | + Depth Anything V2 (base / ViT-B), q8_0 — near-lossless 8-bit quant. Same + relative monocular depth output as the q4_k default at higher fidelity. Use + GenerateImage (src -> depth PNG) or the Depth endpoint. + overrides: + backend: depth-anything + parameters: + model: depth-anything2-base-q8_0.gguf + files: + - filename: depth-anything2-base-q8_0.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-base-q8_0.gguf + sha256: "11920ec7a8dfc2fa7fe8ed44811a46fafe415c641d13144bb733d1437832291b" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-base-f16 + description: | + Depth Anything V2 (base / ViT-B), f16 — half precision, no measurable + accuracy loss vs f32. Relative monocular depth only (no pose). Use + GenerateImage (src -> depth PNG) or the Depth endpoint. + overrides: + backend: depth-anything + parameters: + model: depth-anything2-base-f16.gguf + files: + - filename: depth-anything2-base-f16.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-base-f16.gguf + sha256: "e91c011fbbf90a44639fea55b8d61ca9e80dfb5541220946c8b6e6261fe67ab1" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-base-f32 + description: | + Depth Anything V2 (base / ViT-B), f32 — maximum reference fidelity. Relative + monocular depth only (no pose). Use GenerateImage (src -> depth PNG) or the + Depth endpoint. 
+ overrides: + backend: depth-anything + parameters: + model: depth-anything2-base-f32.gguf + files: + - filename: depth-anything2-base-f32.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-base-f32.gguf + sha256: "2d3d2e4d8fae9646c17577b84c870c7d77a34ded8cf8d5da0a60b2bee1530ccc" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-small + description: | + Depth Anything V2 (small / ViT-S), f32 — the smallest, fastest backbone for + relative monocular depth on CPU. Depth only (no pose). Use GenerateImage + (src -> depth PNG) or the Depth endpoint. + overrides: + backend: depth-anything + parameters: + model: depth-anything2-small-f32.gguf + files: + - filename: depth-anything2-small-f32.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-small-f32.gguf + sha256: "1f6622aa70cbd0eba34d34e2f635a156ed8c3f8e158fb149eb355366e2deb899" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-large + description: | + Depth Anything V2 (large / ViT-L), f32 — higher-quality relative monocular + depth than base. Depth only (no pose). Use GenerateImage (src -> depth PNG) + or the Depth endpoint. + overrides: + backend: depth-anything + parameters: + model: depth-anything2-large-f32.gguf + files: + - filename: depth-anything2-large-f32.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-large-f32.gguf + sha256: "e187658b01b6df62e1553b03df9973606a7287551d4771b09802fc10b26f19f3" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-metric-hypersim-small + description: | + Depth Anything V2 Metric (Hypersim, indoor / ViT-S), q4_k — metric monocular + depth in METRES (indoor, max_depth 20). Depth only (no pose). Use + GenerateImage (src -> depth PNG) or the Depth endpoint. 
+ tags: + - depth-estimation + - depth-anything + - metric-depth + - native + - cpp + - cpu + overrides: + backend: depth-anything + parameters: + model: depth-anything2-metric-hypersim-small-q4_k.gguf + files: + - filename: depth-anything2-metric-hypersim-small-q4_k.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-hypersim-small-q4_k.gguf + sha256: "decd99f5c756b564aae5ca1a1612f896e7f76889060e1d25ba610549bbc39b52" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-metric-hypersim-base + description: | + Depth Anything V2 Metric (Hypersim, indoor / ViT-B), q4_k — metric monocular + depth in METRES (indoor, max_depth 20). Depth only (no pose). Use + GenerateImage (src -> depth PNG) or the Depth endpoint. + tags: + - depth-estimation + - depth-anything + - metric-depth + - native + - cpp + - cpu + overrides: + backend: depth-anything + parameters: + model: depth-anything2-metric-hypersim-base-q4_k.gguf + files: + - filename: depth-anything2-metric-hypersim-base-q4_k.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-hypersim-base-q4_k.gguf + sha256: "c7c6a8628ac154f2ad43db09b8146518e863375be2cb03b8b46caec49a4fcf3a" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-metric-hypersim-large + description: | + Depth Anything V2 Metric (Hypersim, indoor / ViT-L), q4_k — highest-quality + metric monocular depth in METRES (indoor, max_depth 20). Depth only (no + pose). Use GenerateImage (src -> depth PNG) or the Depth endpoint. 
+ tags: + - depth-estimation + - depth-anything + - metric-depth + - native + - cpp + - cpu + overrides: + backend: depth-anything + parameters: + model: depth-anything2-metric-hypersim-large-q4_k.gguf + files: + - filename: depth-anything2-metric-hypersim-large-q4_k.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-hypersim-large-q4_k.gguf + sha256: "3664506bea55e64926fff2cb112ea5a9ad923d13647b9c69617184a89dd1e473" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-metric-vkitti-small + description: | + Depth Anything V2 Metric (Virtual KITTI, outdoor / ViT-S), q4_k — metric + monocular depth in METRES (outdoor, max_depth 80). Depth only (no pose). Use + GenerateImage (src -> depth PNG) or the Depth endpoint. + tags: + - depth-estimation + - depth-anything + - metric-depth + - native + - cpp + - cpu + overrides: + backend: depth-anything + parameters: + model: depth-anything2-metric-vkitti-small-q4_k.gguf + files: + - filename: depth-anything2-metric-vkitti-small-q4_k.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-vkitti-small-q4_k.gguf + sha256: "8dcaa5d0f8475c3dc5de59e28faacd6d46e5ef73c73ecc58e365d7751bc2279f" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-metric-vkitti-base + description: | + Depth Anything V2 Metric (Virtual KITTI, outdoor / ViT-B), q4_k — metric + monocular depth in METRES (outdoor, max_depth 80). Depth only (no pose). Use + GenerateImage (src -> depth PNG) or the Depth endpoint. 
+ tags: + - depth-estimation + - depth-anything + - metric-depth + - native + - cpp + - cpu + overrides: + backend: depth-anything + parameters: + model: depth-anything2-metric-vkitti-base-q4_k.gguf + files: + - filename: depth-anything2-metric-vkitti-base-q4_k.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-vkitti-base-q4_k.gguf + sha256: "1de5a7aae674df6afb8fa5e06d67843dccfbab92cd64b7c816c1218229446d6d" +- !!merge <<: *depth-anything-2-base + name: depth-anything-2-metric-vkitti-large + description: | + Depth Anything V2 Metric (Virtual KITTI, outdoor / ViT-L), q4_k — + highest-quality metric monocular depth in METRES (outdoor, max_depth 80). + Depth only (no pose). Use GenerateImage (src -> depth PNG) or the Depth + endpoint. + tags: + - depth-estimation + - depth-anything + - metric-depth + - native + - cpp + - cpu + overrides: + backend: depth-anything + parameters: + model: depth-anything2-metric-vkitti-large-q4_k.gguf + files: + - filename: depth-anything2-metric-vkitti-large-q4_k.gguf + uri: huggingface://mudler/depth-anything.cpp-gguf/depth-anything2-metric-vkitti-large-q4_k.gguf + sha256: "3b72e9a34262a7025ffba2fc4b760553398ac0622c26f164bff3d2c93991c757" - name: rfdetr-cpp-base url: github:mudler/LocalAI/gallery/virtual.yaml@master urls: From 9565db5f949cfc814d8bc8c4dfc60c166f899528 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 22:38:42 +0200 Subject: [PATCH 17/99] feat(models): model aliases - redirect a model name to another configured model (#10414) * feat(config): add model alias field and self-validation Add ModelConfig.Alias (yaml: alias), IsAlias(), and an alias short-circuit at the top of Validate() that rejects self-reference and forbids setting backend/parameters.model on a pure-redirect alias. 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto * feat(config): resolve and validate model alias targets in the loader Assisted-by: Claude:opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto * feat(middleware): resolve model aliases and stamp requested/served identity Signed-off-by: Ettore Di Giacinto * feat(modeladmin): reject alias configs with invalid targets on create/edit Validate alias targets at create/swap entry points (ImportModelEndpoint, EditYAML, PatchConfig) so a dangling, chained, or disabled alias target is rejected at save time rather than surfacing as a runtime error. Assisted-by: Claude:opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto * feat(api): add GET /api/aliases to list model aliases Adds an admin-gated read-only endpoint that lists every model alias config as {name, target} pairs, backed by the loader's existing GetAllModelsConfigs(). Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto * feat(mcp): add set_alias and list_aliases tools Expose model-alias management over the LocalAI Assistant MCP surface: list_aliases (read-only, GET /api/aliases) and set_alias (mutating). SetAlias is swap-first: PATCH /api/models/config-json/:name swaps an existing alias's target (validated, non-destructive) and a 404 falls back to POST /models/import to create a fresh {name, alias} config. The inproc client mirrors this via ConfigService.PatchConfig + a create path modeled on ImportModelEndpoint. Deletion reuses delete_model. Assisted-by: Claude:claude-opus-4 [Claude Code] Signed-off-by: Ettore Di Giacinto * style(mcp): replace em dashes in alias tool comments Signed-off-by: Ettore Di Giacinto * feat(config-meta): expose alias as a model-select field Add an 'alias' section to DefaultSections() and an 'alias' field override in DefaultRegistry() so the schema-driven React editor renders the new top-level ModelConfig.Alias field as a model picker in its own section. 
Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto * feat(ui): add alias template card and Manage alias badge Add an 'Alias / Routing' template to the create-flow gallery that seeds a minimal name + alias config, and a read-only 'alias -> target' badge on the Manage Models tab. The capabilities row payload does not carry the alias field, so the badge resolves targets from GET /api/aliases looked up by name. Assisted-by: Claude:claude-opus-4 [Claude Code] Signed-off-by: Ettore Di Giacinto * docs: document model aliases Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto * docs(swagger): regenerate for GET /api/aliases Adds the /api/aliases path and AliasInfo schema generated from the ListAliasesEndpoint annotation. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto * test(localai): check os.RemoveAll error in aliases_test Signed-off-by: Ettore Di Giacinto * fix: correct alias conversion docs and advertise /api/aliases in instructions Signed-off-by: Ettore Di Giacinto * fix(mcp): write alias config 0600 to satisfy gosec G306 The inproc createAlias path wrote the alias YAML with 0644, which gosec flags as a new G306 finding on the PR. The LocalAI process is the sole reader/writer of model configs, so 0600 is correct and keeps the scan clean. 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- core/config/meta/registry.go | 9 ++ core/config/meta/registry_test.go | 28 +++++ core/config/meta/types.go | 1 + core/config/model_config.go | 26 +++++ core/config/model_config_loader.go | 54 ++++++++++ core/config/model_config_loader_test.go | 48 +++++++++ core/config/model_config_test.go | 29 +++++ core/http/endpoints/localai/aliases.go | 33 ++++++ core/http/endpoints/localai/aliases_test.go | 57 ++++++++++ core/http/endpoints/localai/import_model.go | 6 ++ .../endpoints/mcp/localai_assistant_test.go | 6 ++ core/http/middleware/request.go | 21 ++++ core/http/middleware/request_test.go | 101 ++++++++++++++++++ core/http/middleware/route_model.go | 7 +- core/http/react-ui/e2e/alias-template.spec.js | 77 +++++++++++++ core/http/react-ui/src/pages/Manage.jsx | 23 +++- core/http/react-ui/src/utils/api.js | 1 + core/http/react-ui/src/utils/config.js | 1 + .../http/react-ui/src/utils/modelTemplates.js | 10 ++ core/http/routes/localai.go | 4 + core/services/modeladmin/config.go | 6 ++ core/services/modeladmin/config_test.go | 18 ++++ docs/content/features/model-aliases.md | 81 ++++++++++++++ pkg/mcp/localaitools/client.go | 8 ++ pkg/mcp/localaitools/coverage_test.go | 2 + pkg/mcp/localaitools/dto.go | 8 ++ pkg/mcp/localaitools/fakes_test.go | 18 ++++ pkg/mcp/localaitools/httpapi/client.go | 36 +++++++ pkg/mcp/localaitools/httpapi/client_test.go | 86 +++++++++++++++ pkg/mcp/localaitools/httpapi/routes.go | 2 + pkg/mcp/localaitools/inproc/client.go | 78 ++++++++++++++ pkg/mcp/localaitools/inproc/client_test.go | 77 +++++++++++++ pkg/mcp/localaitools/server.go | 1 + pkg/mcp/localaitools/server_test.go | 5 + pkg/mcp/localaitools/tools.go | 5 + pkg/mcp/localaitools/tools_aliases.go | 48 +++++++++ swagger/docs.go | 30 ++++++ swagger/swagger.json | 30 ++++++ swagger/swagger.yaml | 19 ++++ 39 files changed, 1098 
insertions(+), 2 deletions(-) create mode 100644 core/config/meta/registry_test.go create mode 100644 core/http/endpoints/localai/aliases.go create mode 100644 core/http/endpoints/localai/aliases_test.go create mode 100644 core/http/react-ui/e2e/alias-template.spec.js create mode 100644 docs/content/features/model-aliases.md create mode 100644 pkg/mcp/localaitools/tools_aliases.go diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index ca10f604c..84fc9afda 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -286,6 +286,15 @@ func DefaultRegistry() map[string]FieldMetaOverride { Order: 45, }, + // --- Alias --- + "alias": { + Section: "alias", + Label: "Alias target", + Description: "Redirect all traffic for this model to another configured model. When set, every other field on this config is ignored and requests are served by the target model.", + Component: "model-select", + Order: 0, + }, + // --- Pipeline --- "pipeline.llm": { Section: "pipeline", diff --git a/core/config/meta/registry_test.go b/core/config/meta/registry_test.go new file mode 100644 index 000000000..e9d998609 --- /dev/null +++ b/core/config/meta/registry_test.go @@ -0,0 +1,28 @@ +package meta_test + +import ( + "github.com/mudler/LocalAI/core/config/meta" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("alias field metadata", func() { + It("registers the alias field as a model-select in the alias section", func() { + reg := meta.DefaultRegistry() + f, ok := reg["alias"] + Expect(ok).To(BeTrue(), "alias field should have a registry override") + Expect(f.Section).To(Equal("alias")) + Expect(f.Component).To(Equal("model-select")) + }) + + It("defines an alias section", func() { + var found bool + for _, s := range meta.DefaultSections() { + if s.ID == "alias" { + found = true + } + } + Expect(found).To(BeTrue(), "DefaultSections should include an alias section") + }) +}) diff --git a/core/config/meta/types.go b/core/config/meta/types.go index a86b8bb69..a29e66967 100644 --- a/core/config/meta/types.go +++ b/core/config/meta/types.go @@ -69,6 +69,7 @@ type FieldMetaOverride struct { func DefaultSections() []Section { return []Section{ {ID: "general", Label: "General", Icon: "settings", Order: 0}, + {ID: "alias", Label: "Alias", Icon: "git-merge", Order: 5}, {ID: "llm", Label: "LLM", Icon: "cpu", Order: 10}, {ID: "parameters", Label: "Parameters", Icon: "sliders", Order: 20}, {ID: "templates", Label: "Templates", Icon: "file-text", Order: 30}, diff --git a/core/config/model_config.go b/core/config/model_config.go index 75136ec6c..50836b99e 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -37,6 +37,12 @@ type ModelConfig struct { schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"` Name string `yaml:"name,omitempty" json:"name,omitempty"` + // Alias, when set, makes this config a pure redirect: every request for + // Name is served by the model named here. All other fields are ignored. + // The target must be an existing, non-alias model (enforced at load and + // at create/swap time). See docs/content for Model Aliases. 
+ Alias string `yaml:"alias,omitempty" json:"alias,omitempty"` + F16 *bool `yaml:"f16,omitempty" json:"f16,omitempty"` Threads *int `yaml:"threads,omitempty" json:"threads,omitempty"` Debug *bool `yaml:"debug,omitempty" json:"debug,omitempty"` @@ -391,6 +397,10 @@ func (c *ModelConfig) HasRouter() bool { return len(c.Router.Candidates) > 0 } +// IsAlias reports whether this config is a pure redirect to another model. +// Value receiver so it is callable on non-addressable config values too. +func (c ModelConfig) IsAlias() bool { return c.Alias != "" } + // @Description PII filtering configuration. PII redaction is per-model so // that local models don't pay the latency or behaviour change of regex // scanning, while cloud-bound traffic (cloud-proxy backend) can default to @@ -1248,6 +1258,22 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) { } func (c *ModelConfig) Validate() (bool, error) { + // An alias is a pure redirect: validate only its own shape here. Target + // existence and the no-chain rule need the full config set, so the loader + // (load-time) and the create/swap endpoints enforce those. 
+ if c.IsAlias() { + if c.Name == "" { + return false, fmt.Errorf("alias config requires a name") + } + if c.Alias == c.Name { + return false, fmt.Errorf("alias %q cannot point to itself", c.Name) + } + if c.Backend != "" || c.Model != "" { + return false, fmt.Errorf("alias config %q must not set backend or parameters.model: an alias is a pure redirect", c.Name) + } + return true, nil + } + downloadedFileNames := []string{} for _, f := range c.DownloadFiles { downloadedFileNames = append(downloadedFileNames, f.Filename) diff --git a/core/config/model_config_loader.go b/core/config/model_config_loader.go index 89f4bc5cb..e2f43e83f 100644 --- a/core/config/model_config_loader.go +++ b/core/config/model_config_loader.go @@ -294,6 +294,44 @@ func (bcl *ModelConfigLoader) UpdateModelConfig(m string, updater func(*ModelCon } } +// ResolveAlias follows a one-hop alias to its target config. Returns +// (resolved, wasAlias, err). Non-alias configs return (cfg, false, nil) +// unchanged. Strict: the target must exist and must not itself be an alias +// (chains are rejected). The returned config is a copy of the target. +func (bcl *ModelConfigLoader) ResolveAlias(cfg *ModelConfig) (*ModelConfig, bool, error) { + if cfg == nil || !cfg.IsAlias() { + return cfg, false, nil + } + target, exists := bcl.GetModelConfig(cfg.Alias) + if !exists { + return nil, true, fmt.Errorf("alias %q points to unknown model %q", cfg.Name, cfg.Alias) + } + if target.IsAlias() { + return nil, true, fmt.Errorf("alias %q points to another alias %q (chains are not allowed)", cfg.Name, cfg.Alias) + } + return &target, true, nil +} + +// ValidateAliasTarget checks an alias config's target at create/swap time: +// the target must exist, must not be an alias, and must not be disabled. +// Returns nil for non-alias configs. 
+func (bcl *ModelConfigLoader) ValidateAliasTarget(cfg *ModelConfig) error { + if cfg == nil || !cfg.IsAlias() { + return nil + } + target, exists := bcl.GetModelConfig(cfg.Alias) + if !exists { + return fmt.Errorf("alias target %q does not exist", cfg.Alias) + } + if target.IsAlias() { + return fmt.Errorf("alias target %q is itself an alias (chains are not allowed)", cfg.Alias) + } + if target.IsDisabled() { + return fmt.Errorf("alias target %q is disabled", cfg.Alias) + } + return nil +} + // Preload prepare models if they are not local but url or huggingface repositories func (bcl *ModelConfigLoader) Preload(modelPath string) error { bcl.Lock() @@ -475,5 +513,21 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf } } + // Surface aliases whose targets are missing or themselves aliases. These + // resolve to a clear request-time error; warning here gives operators + // visibility without failing startup. + for name, c := range bcl.configs { + if !c.IsAlias() { + continue + } + target, ok := bcl.configs[c.Alias] + switch { + case !ok: + xlog.Warn("alias points to unknown model", "alias", name, "target", c.Alias) + case target.IsAlias(): + xlog.Warn("alias points to another alias (chains are not allowed)", "alias", name, "target", c.Alias) + } + } + return nil } diff --git a/core/config/model_config_loader_test.go b/core/config/model_config_loader_test.go index 924a4d1e4..06ab65a20 100644 --- a/core/config/model_config_loader_test.go +++ b/core/config/model_config_loader_test.go @@ -61,3 +61,51 @@ var _ = Describe("ModelConfigLoader.GetModelsConflictingWith", func() { Expect(bcl.GetModelsConflictingWith("a")).To(ConsistOf("b")) }) }) + +var _ = Describe("ModelConfigLoader alias resolution", func() { + var loader *ModelConfigLoader + + BeforeEach(func() { + loader = NewModelConfigLoader("") + loader.configs["real"] = ModelConfig{Name: "real", Backend: "llama-cpp"} + loader.configs["gpt-4"] = ModelConfig{Name: "gpt-4", Alias: "real"} + 
loader.configs["chain"] = ModelConfig{Name: "chain", Alias: "gpt-4"} + loader.configs["dangling"] = ModelConfig{Name: "dangling", Alias: "nope"} + }) + + It("returns non-alias configs unchanged", func() { + cfg := loader.configs["real"] + got, was, err := loader.ResolveAlias(&cfg) + Expect(err).ToNot(HaveOccurred()) + Expect(was).To(BeFalse()) + Expect(got.Name).To(Equal("real")) + }) + + It("resolves an alias to its target", func() { + cfg := loader.configs["gpt-4"] + got, was, err := loader.ResolveAlias(&cfg) + Expect(err).ToNot(HaveOccurred()) + Expect(was).To(BeTrue()) + Expect(got.Name).To(Equal("real")) + }) + + It("rejects an alias chain", func() { + cfg := loader.configs["chain"] + _, was, err := loader.ResolveAlias(&cfg) + Expect(was).To(BeTrue()) + Expect(err).To(MatchError(ContainSubstring("chains are not allowed"))) + }) + + It("rejects a dangling alias", func() { + cfg := loader.configs["dangling"] + _, _, err := loader.ResolveAlias(&cfg) + Expect(err).To(MatchError(ContainSubstring("unknown model"))) + }) + + It("ValidateAliasTarget passes for a real target and fails for a chain", func() { + good := loader.configs["gpt-4"] + Expect(loader.ValidateAliasTarget(&good)).ToNot(HaveOccurred()) + bad := loader.configs["chain"] + Expect(loader.ValidateAliasTarget(&bad)).To(MatchError(ContainSubstring("itself an alias"))) + }) +}) diff --git a/core/config/model_config_test.go b/core/config/model_config_test.go index 7f256354d..2f2f3fd82 100644 --- a/core/config/model_config_test.go +++ b/core/config/model_config_test.go @@ -787,3 +787,32 @@ var _ = Describe("pattern detector config", func() { Expect(err).To(MatchError(ContainSubstring("pattern \"EMAILish\""))) }) }) + +var _ = Describe("ModelConfig alias", func() { + It("reports IsAlias when alias is set", func() { + c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"} + Expect(c.IsAlias()).To(BeTrue()) + Expect(ModelConfig{Name: "real"}.IsAlias()).To(BeFalse()) + }) + + It("validates a minimal alias config", 
func() { + c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3"} + ok, err := c.Validate() + Expect(err).ToNot(HaveOccurred()) + Expect(ok).To(BeTrue()) + }) + + It("rejects an alias pointing to itself", func() { + c := ModelConfig{Name: "loop", Alias: "loop"} + ok, err := c.Validate() + Expect(ok).To(BeFalse()) + Expect(err).To(MatchError(ContainSubstring("itself"))) + }) + + It("rejects an alias that also sets a backend", func() { + c := ModelConfig{Name: "gpt-4", Alias: "my-llama-3", Backend: "llama-cpp"} + ok, err := c.Validate() + Expect(ok).To(BeFalse()) + Expect(err).To(MatchError(ContainSubstring("pure redirect"))) + }) +}) diff --git a/core/http/endpoints/localai/aliases.go b/core/http/endpoints/localai/aliases.go new file mode 100644 index 000000000..923e22c63 --- /dev/null +++ b/core/http/endpoints/localai/aliases.go @@ -0,0 +1,33 @@ +package localai + +import ( + "net/http" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/config" +) + +// AliasInfo is one alias -> target pair. +type AliasInfo struct { + Name string `json:"name"` + Target string `json:"target"` +} + +// ListAliasesEndpoint returns every configured model alias and its target. +// +// @Summary List model aliases +// @Tags models +// @Success 200 {array} AliasInfo +// @Router /api/aliases [get] +func ListAliasesEndpoint(cl *config.ModelConfigLoader) echo.HandlerFunc { + return func(c echo.Context) error { + // Non-nil so an empty result marshals as [] rather than null. 
+ out := []AliasInfo{} + for _, cfg := range cl.GetAllModelsConfigs() { + if cfg.IsAlias() { + out = append(out, AliasInfo{Name: cfg.Name, Target: cfg.Alias}) + } + } + return c.JSON(http.StatusOK, out) + } +} diff --git a/core/http/endpoints/localai/aliases_test.go b/core/http/endpoints/localai/aliases_test.go new file mode 100644 index 000000000..e1c44898a --- /dev/null +++ b/core/http/endpoints/localai/aliases_test.go @@ -0,0 +1,57 @@ +package localai_test + +import ( + "net/http" + "net/http/httptest" + "os" + "path/filepath" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/config" + . "github.com/mudler/LocalAI/core/http/endpoints/localai" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("ListAliasesEndpoint", func() { + var tempDir string + + BeforeEach(func() { + var err error + tempDir, err = os.MkdirTemp("", "localai-aliases-test") + Expect(err).ToNot(HaveOccurred()) + }) + AfterEach(func() { + _ = os.RemoveAll(tempDir) + }) + + It("returns only alias configs as name/target pairs", func() { + // Seed one real model and one alias pointing at it. + Expect(os.WriteFile( + filepath.Join(tempDir, "real.yaml"), + []byte("name: real\nbackend: llama-cpp\nmodel: foo\n"), + 0644, + )).To(Succeed()) + Expect(os.WriteFile( + filepath.Join(tempDir, "gpt-4.yaml"), + []byte("name: gpt-4\nalias: real\n"), + 0644, + )).To(Succeed()) + + loader := config.NewModelConfigLoader(tempDir) + Expect(loader.LoadModelConfigsFromPath(tempDir)).To(Succeed()) + + app := echo.New() + app.GET("/api/aliases", ListAliasesEndpoint(loader)) + + req := httptest.NewRequest("GET", "/api/aliases", nil) + rec := httptest.NewRecorder() + app.ServeHTTP(rec, req) + + Expect(rec.Code).To(Equal(http.StatusOK)) + Expect(rec.Body.String()).To(ContainSubstring(`"name":"gpt-4"`)) + Expect(rec.Body.String()).To(ContainSubstring(`"target":"real"`)) + // The real model must not appear as an alias entry. 
+ Expect(rec.Body.String()).ToNot(ContainSubstring(`"name":"real"`)) + }) +}) diff --git a/core/http/endpoints/localai/import_model.go b/core/http/endpoints/localai/import_model.go index dc225abdd..54a80a9cc 100644 --- a/core/http/endpoints/localai/import_model.go +++ b/core/http/endpoints/localai/import_model.go @@ -181,6 +181,12 @@ func ImportModelEndpoint(cl *config.ModelConfigLoader, appConfig *config.Applica return c.JSON(http.StatusBadRequest, ModelResponse{Success: false, Error: msg}) } + // Reject aliases whose target is missing, chained, or disabled so a + // dangling alias can't be persisted and surface as a runtime error later. + if err := cl.ValidateAliasTarget(&modelConfig); err != nil { + return c.JSON(http.StatusBadRequest, ModelResponse{Success: false, Error: err.Error()}) + } + // Create the configuration file configPath := filepath.Join(appConfig.SystemState.Model.ModelsPath, modelConfig.Name+".yaml") if err := utils.VerifyPath(modelConfig.Name+".yaml", appConfig.SystemState.Model.ModelsPath); err != nil { diff --git a/core/http/endpoints/mcp/localai_assistant_test.go b/core/http/endpoints/mcp/localai_assistant_test.go index 26cd2878f..8de7355c6 100644 --- a/core/http/endpoints/mcp/localai_assistant_test.go +++ b/core/http/endpoints/mcp/localai_assistant_test.go @@ -51,6 +51,12 @@ func (stubClient) EditModelConfig(_ context.Context, _ string, _ map[string]any) return nil } func (stubClient) ReloadModels(_ context.Context) error { return nil } +func (stubClient) SetAlias(_ context.Context, _, _ string) error { + return nil +} +func (stubClient) ListAliases(_ context.Context) ([]localaitools.AliasInfo, error) { + return nil, nil +} func (stubClient) ListBackends(_ context.Context) ([]localaitools.Backend, error) { return []localaitools.Backend{{Name: "stub-backend", Installed: true}}, nil } diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go index ff0d929ac..74f7e8565 100644 --- a/core/http/middleware/request.go +++ 
b/core/http/middleware/request.go @@ -167,6 +167,27 @@ func (re *RequestExtractor) SetModelAndConfig(initializer func() schema.LocalAIR } } + // Resolve a model alias to its target before the disabled check and + // before storing MODEL_CONFIG, so every modality (chat, embeddings, + // tts, image, ...) inherits redirection. The response keeps echoing + // the alias name (input.ModelName is left unchanged); usage accounting + // records requested=alias / served=target. + if cfg != nil && cfg.IsAlias() { + resolved, _, aliasErr := re.modelConfigLoader.ResolveAlias(cfg) + if aliasErr != nil { + return c.JSON(http.StatusBadRequest, schema.ErrorResponse{ + Error: &schema.APIError{ + Message: aliasErr.Error(), + Code: http.StatusBadRequest, + Type: "invalid_request_error", + }, + }) + } + c.Set(ContextKeyRequestedModel, modelName) + c.Set(ContextKeyServedModel, resolved.Name) + cfg = resolved + } + // Check if the model is disabled if cfg != nil && cfg.IsDisabled() { return c.JSON(http.StatusForbidden, schema.ErrorResponse{ diff --git a/core/http/middleware/request_test.go b/core/http/middleware/request_test.go index fe9fc926c..010379714 100644 --- a/core/http/middleware/request_test.go +++ b/core/http/middleware/request_test.go @@ -151,6 +151,107 @@ var _ = Describe("SetModelAndConfig middleware", func() { }) }) +// --------------------------------------------------------------------------- +// SetModelAndConfig - model alias resolution +// --------------------------------------------------------------------------- +// +// An alias config (`alias: `) is a pure redirect: the middleware must +// swap MODEL_CONFIG to the target config before the disabled check and before +// storing it, while leaving the response-facing model name as the alias. It +// also stamps routing.requested_model = alias and routing.served_model = +// target so usage accounting records both identities. 
+var _ = Describe("SetModelAndConfig alias resolution", func() { + var ( + modelDir string + capturedConfig *config.ModelConfig + capturedReq any + capturedServed any + app *echo.Echo + ) + + BeforeEach(func() { + var err error + modelDir, err = os.MkdirTemp("", "localai-alias-*") + Expect(err).ToNot(HaveOccurred()) + }) + + AfterEach(func() { + _ = os.RemoveAll(modelDir) + }) + + // buildApp seeds the loader from every YAML in modelDir (so an alias's + // target is present in the loader map) and wires a handler that captures + // the resolved config plus the stamped identity keys. + buildApp := func() *echo.Echo { + ss := &system.SystemState{Model: system.Model{ModelsPath: modelDir}} + appConfig := config.NewApplicationConfig() + appConfig.SystemState = ss + + mcl := config.NewModelConfigLoader(modelDir) + Expect(mcl.LoadModelConfigsFromPath(modelDir)).To(Succeed()) + ml := model.NewModelLoader(ss) + re := NewRequestExtractor(mcl, ml, appConfig) + + capturedConfig = nil + capturedReq = nil + capturedServed = nil + e := echo.New() + e.POST("/v1/chat/completions", + func(c echo.Context) error { + if cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig); ok { + capturedConfig = cfg + } + capturedReq = c.Get(ContextKeyRequestedModel) + capturedServed = c.Get(ContextKeyServedModel) + return c.String(http.StatusOK, "ok") + }, + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + ) + return e + } + + It("serves the target config but keeps the alias name and stamps identity", func() { + Expect(os.WriteFile(filepath.Join(modelDir, "real.yaml"), + []byte("name: real\nbackend: llama-cpp\n"), 0644)).To(Succeed()) + Expect(os.WriteFile(filepath.Join(modelDir, "gpt-4.yaml"), + []byte("name: gpt-4\nalias: real\n"), 0644)).To(Succeed()) + app = buildApp() + + req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", + strings.NewReader(`{"model":"gpt-4","messages":[{"role":"user","content":"hi"}]}`)) + 
req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + app.ServeHTTP(rec, req) + + Expect(rec.Code).To(Equal(http.StatusOK)) + Expect(capturedConfig).ToNot(BeNil()) + // MODEL_CONFIG must be the target, not the alias stub. + Expect(capturedConfig.Name).To(Equal("real")) + Expect(capturedConfig.IsAlias()).To(BeFalse()) + // Identity stamps: requested = alias, served = target. + Expect(capturedReq).To(Equal("gpt-4")) + Expect(capturedServed).To(Equal("real")) + }) + + It("returns 400 when the alias target is missing", func() { + Expect(os.WriteFile(filepath.Join(modelDir, "gpt-4.yaml"), + []byte("name: gpt-4\nalias: nope\n"), 0644)).To(Succeed()) + app = buildApp() + + req := httptest.NewRequest(http.MethodPost, "/v1/chat/completions", + strings.NewReader(`{"model":"gpt-4","messages":[{"role":"user","content":"hi"}]}`)) + req.Header.Set("Content-Type", "application/json") + rec := httptest.NewRecorder() + app.ServeHTTP(rec, req) + + Expect(rec.Code).To(Equal(http.StatusBadRequest)) + var resp schema.ErrorResponse + Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed()) + Expect(resp.Error).ToNot(BeNil()) + Expect(resp.Error.Type).To(Equal("invalid_request_error")) + }) +}) + // --------------------------------------------------------------------------- // MergeOpenResponsesConfig — tool_choice parsing // --------------------------------------------------------------------------- diff --git a/core/http/middleware/route_model.go b/core/http/middleware/route_model.go index 7ff286af4..470bd05f5 100644 --- a/core/http/middleware/route_model.go +++ b/core/http/middleware/route_model.go @@ -189,7 +189,12 @@ func RouteModel(loader *config.ModelConfigLoader, appConfig *config.ApplicationC } c.Set(CONTEXT_LOCALS_KEY_MODEL_CONFIG, result.ChosenConfig) - c.Set(ContextKeyRequestedModel, result.RouterModel) + // Preserve an upstream requested model (e.g. 
an alias that points + // at this router model) so accounting keeps the name the client + // actually sent. Served always reflects the final candidate. + if c.Get(ContextKeyRequestedModel) == nil { + c.Set(ContextKeyRequestedModel, result.RouterModel) + } c.Set(ContextKeyServedModel, result.ChosenModel) if store != nil { diff --git a/core/http/react-ui/e2e/alias-template.spec.js b/core/http/react-ui/e2e/alias-template.spec.js new file mode 100644 index 000000000..f3b1a0ca0 --- /dev/null +++ b/core/http/react-ui/e2e/alias-template.spec.js @@ -0,0 +1,77 @@ +import { test, expect } from './coverage-fixtures.js' + +// Alias / Routing template + Manage alias badge regression tests. +// +// An alias is a model config with `alias: ` that redirects traffic to +// the target model. This covers the two discoverability surfaces: +// - the create-flow template gallery exposes an "Alias / Routing" card that +// seeds a minimal name + alias config +// - the Manage Models tab renders a read-only "alias -> target" badge on +// rows that resolve to an alias (looked up via GET /api/aliases, since the +// capabilities row payload doesn't carry the alias field) + +// Minimal metadata so the editor renders the alias field once the template +// loads. Mirrors the Task 7 config-meta registry, which surfaces `alias` as a +// model-select component. 
+const ALIAS_METADATA = { + sections: [ + { id: 'general', label: 'General', icon: 'settings', order: 0 }, + { id: 'other', label: 'Other', icon: 'more-horizontal', order: 100 }, + ], + fields: [ + { path: 'name', yaml_key: 'name', go_type: 'string', ui_type: 'string', + section: 'general', label: 'Model Name', component: 'input', order: 0 }, + { path: 'alias', yaml_key: 'alias', go_type: 'string', ui_type: 'string', + section: 'general', label: 'Alias', component: 'model-select', autocomplete_provider: 'models', + description: 'Redirect this model name to another configured model.', order: 1 }, + ], +} + +test.describe('Alias template - create flow', () => { + test.beforeEach(async ({ page }) => { + await page.route('**/api/auth/status', (route) => + route.fulfill({ contentType: 'application/json', body: JSON.stringify({ authEnabled: false, staticApiKeyRequired: false, providers: [] }) })) + await page.route('**/api/models/config-metadata*', (route) => + route.fulfill({ contentType: 'application/json', body: JSON.stringify(ALIAS_METADATA) })) + await page.route('**/api/models/config-metadata/autocomplete/**', (route) => + route.fulfill({ contentType: 'application/json', body: JSON.stringify({ values: [] }) })) + + page.on('pageerror', (err) => { + throw new Error(`uncaught page error: ${err.message}`) + }) + }) + + test('template gallery exposes the Alias / Routing card', async ({ page }) => { + await page.goto('/app/model-editor') + await expect(page.getByRole('button', { name: /Alias \/ Routing/i })).toBeVisible({ timeout: 10_000 }) + }) + + test('alias template loads the editor with the alias field', async ({ page }) => { + await page.goto('/app/model-editor?template=alias') + await expect(page.getByText(/Unexpected Application Error/i)).toHaveCount(0) + await expect(page.locator('h1.page-title')).toBeVisible({ timeout: 10_000 }) + await expect(page.getByText('Alias').first()).toBeVisible() + }) +}) + +test.describe('Manage - alias badge', () => { + 
test.beforeEach(async ({ page }) => { + await page.route('**/api/auth/status', (route) => + route.fulfill({ contentType: 'application/json', body: JSON.stringify({ authEnabled: false, staticApiKeyRequired: false, providers: [] }) })) + await page.route('**/api/models/capabilities', (route) => + route.fulfill({ contentType: 'application/json', body: JSON.stringify({ data: [ + { id: 'fast-llm', capabilities: ['chat'], backend: 'llama-cpp' }, + { id: 'gpt-4', capabilities: ['chat'], backend: 'llama-cpp' }, + ] }) })) + await page.route('**/api/aliases', (route) => + route.fulfill({ contentType: 'application/json', body: JSON.stringify([{ name: 'gpt-4', target: 'fast-llm' }]) })) + }) + + test('renders a read-only alias -> target badge on aliased rows', async ({ page }) => { + await page.goto('/app/manage') + await expect(page.locator('.table')).toBeVisible({ timeout: 10_000 }) + + // The aliased row shows the target; the plain model row does not. + await expect(page.getByText('alias -> fast-llm')).toBeVisible({ timeout: 10_000 }) + }) +}) diff --git a/core/http/react-ui/src/pages/Manage.jsx b/core/http/react-ui/src/pages/Manage.jsx index 48d18c33c..16d04f709 100644 --- a/core/http/react-ui/src/pages/Manage.jsx +++ b/core/http/react-ui/src/pages/Manage.jsx @@ -133,6 +133,10 @@ export default function Manage() { const { enrichModel, enrichBackend } = useGalleryEnrichment() const { operations } = useOperations() const [loadedModelIds, setLoadedModelIds] = useState(new Set()) + // Map of alias name -> target. The capabilities endpoint that feeds the row + // list doesn't carry the alias field, so we fetch it once and look rows up by + // name to render the read-only "alias -> target" badge. 
+ const [aliasTargets, setAliasTargets] = useState({}) const [backends, setBackends] = useState([]) const [backendsLoading, setBackendsLoading] = useState(true) const [reloading, setReloading] = useState(false) @@ -228,12 +232,24 @@ export default function Manage() { } }, []) + const fetchAliases = useCallback(async () => { + try { + const data = await modelsApi.listAliases() + const map = {} + for (const a of Array.isArray(data) ? data : []) map[a.name] = a.target + setAliasTargets(map) + } catch { + setAliasTargets({}) + } + }, []) + useEffect(() => { fetchLoadedModels() fetchBackends() + fetchAliases() // Detect distributed mode (nodes API returns 503 when not enabled) nodesApi.list().then(() => setDistributedMode(true)).catch(() => {}) - }, [fetchLoadedModels, fetchBackends]) + }, [fetchLoadedModels, fetchBackends, fetchAliases]) // Auto-refresh the Models tab every 10s in distributed mode so ghost models // (loaded on a worker but absent from this frontend's in-memory cache) @@ -636,6 +652,11 @@ export default function Manage() { Pinned )} + {aliasTargets[model.id] && ( + ${aliasTargets[model.id]}`}> + alias -> {aliasTargets[model.id]} + + )} diff --git a/core/http/react-ui/src/utils/api.js b/core/http/react-ui/src/utils/api.js index a8ffa2f04..20bb90363 100644 --- a/core/http/react-ui/src/utils/api.js +++ b/core/http/react-ui/src/utils/api.js @@ -84,6 +84,7 @@ export const modelsApi = { list: (params) => fetchJSON(buildUrl(API_CONFIG.endpoints.models, params)), listV1: () => fetchJSON(API_CONFIG.endpoints.modelsList), listCapabilities: () => fetchJSON(API_CONFIG.endpoints.modelsCapabilities), + listAliases: () => fetchJSON(API_CONFIG.endpoints.modelsAliases), install: (id) => postJSON(API_CONFIG.endpoints.installModel(id), {}), delete: (id) => postJSON(API_CONFIG.endpoints.deleteModel(id), {}), estimate: (id, contexts) => fetchJSON( diff --git a/core/http/react-ui/src/utils/config.js b/core/http/react-ui/src/utils/config.js index cf83d590f..65797fe41 100644 
--- a/core/http/react-ui/src/utils/config.js +++ b/core/http/react-ui/src/utils/config.js @@ -95,6 +95,7 @@ export const API_CONFIG = { modelsList: '/v1/models', modelsCapabilities: '/api/models/capabilities', + modelsAliases: '/api/aliases', // Realtime / WebRTC realtimeCalls: '/v1/realtime/calls', diff --git a/core/http/react-ui/src/utils/modelTemplates.js b/core/http/react-ui/src/utils/modelTemplates.js index 54d34aecc..c3675f9db 100644 --- a/core/http/react-ui/src/utils/modelTemplates.js +++ b/core/http/react-ui/src/utils/modelTemplates.js @@ -142,6 +142,16 @@ const MODEL_TEMPLATES = [ ], }, }, + { + id: 'alias', + label: 'Alias / Routing', + icon: 'fa-arrow-right-arrow-left', + description: 'Point a model name at another configured model. Clients keep calling the alias; you swap the target anytime.', + fields: { + 'name': '', + 'alias': '', + }, + }, { id: 'mitm', label: 'MITM Intercept', diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go index a66801556..1df1d5d8c 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -80,6 +80,9 @@ func RegisterLocalAIRoutes(router *echo.Echo, // Custom model edit endpoint router.POST("/models/edit/:name", localai.EditModelEndpoint(cl, ml, appConfig), adminMiddleware) + // List model aliases endpoint + router.GET("/api/aliases", localai.ListAliasesEndpoint(cl), adminMiddleware) + // Toggle model enable/disable endpoint router.PUT("/models/toggle-state/:name/:action", localai.ToggleStateModelEndpoint(cl, ml, appConfig), adminMiddleware) @@ -303,6 +306,7 @@ func RegisterLocalAIRoutes(router *echo.Echo, "edit": "/models/edit/:name", "import": "/models/import", "reload": "/models/reload", + "list_aliases": "/api/aliases", }, "ai_functions": map[string]string{ "tts": "/tts", diff --git a/core/services/modeladmin/config.go b/core/services/modeladmin/config.go index c01e2fb4c..f4fc53d97 100644 --- a/core/services/modeladmin/config.go +++ b/core/services/modeladmin/config.go @@ -130,6 
+130,9 @@ func (s *ConfigService) PatchConfig(_ context.Context, name string, patch map[st } return nil, ErrInvalidConfig } + if err := s.Loader.ValidateAliasTarget(&updated); err != nil { + return nil, fmt.Errorf("%w: %v", ErrInvalidConfig, err) + } if err := writeFileAtomic(configPath, yamlData, 0644); err != nil { return nil, fmt.Errorf("write config file: %w", err) } @@ -215,6 +218,9 @@ func (s *ConfigService) EditYAML(_ context.Context, name string, body []byte, ml if valid, _ := req.Validate(); !valid { return nil, ErrInvalidConfig } + if err := s.Loader.ValidateAliasTarget(&req); err != nil { + return nil, fmt.Errorf("%w: %v", ErrInvalidConfig, err) + } configPath := existing.GetModelConfigFile() modelsPath := s.modelsPath() diff --git a/core/services/modeladmin/config_test.go b/core/services/modeladmin/config_test.go index d4157047d..36569c19b 100644 --- a/core/services/modeladmin/config_test.go +++ b/core/services/modeladmin/config_test.go @@ -211,5 +211,23 @@ var _ = Describe("ConfigService", func() { _, err := svc.EditYAML(ctx, "alpha", nil, nil) Expect(err).To(MatchError(ErrEmptyBody)) }) + + It("rejects editing a config into an alias with a missing target", func() { + writeModelYAML(svc, dir, "base", map[string]any{"backend": "llama-cpp"}) + + body := []byte("name: base\nalias: ghost\n") + _, err := svc.EditYAML(ctx, "base", body, nil) + Expect(err).To(MatchError(ErrInvalidConfig)) + Expect(err.Error()).To(ContainSubstring("ghost")) + }) + + It("accepts editing a config into an alias with a real target", func() { + writeModelYAML(svc, dir, "base", map[string]any{"backend": "llama-cpp"}) + writeModelYAML(svc, dir, "target", map[string]any{"backend": "llama-cpp"}) + + body := []byte("name: base\nalias: target\n") + _, err := svc.EditYAML(ctx, "base", body, nil) + Expect(err).ToNot(HaveOccurred()) + }) }) }) diff --git a/docs/content/features/model-aliases.md b/docs/content/features/model-aliases.md new file mode 100644 index 000000000..8c4bd977d --- 
/dev/null +++ b/docs/content/features/model-aliases.md @@ -0,0 +1,81 @@ + ++++ +disableToc = false +title = "Model Aliases" +weight = 24 +url = "/features/model-aliases/" ++++ + +A **model alias** is a model name that redirects all traffic to another +configured model. Declare `gpt-4` as an alias of `my-llama-3` and every client +calling `gpt-4` is served by `my-llama-3` with no client reconfiguration: the +clients keep their existing model name while you control what answers them on +the server side. + +## Declaring an alias + +Create a minimal config file in your models directory: + +```yaml +name: gpt-4 +alias: my-llama-3 +``` + +That is the whole config: a `name` (the alias clients call) and an `alias` key +(the target that actually serves the request). + +## Rules and behavior + +- The target (`my-llama-3`) must be an existing, non-alias, enabled model. You + cannot point an alias at a missing model, a disabled model, or another alias + (no chains). +- Aliases are 1:1. One alias maps to exactly one target. +- The target can be swapped live by editing the config file, calling the API, + using the UI, or asking the assistant. No restart is required. +- Both `gpt-4` and `my-llama-3` appear in `GET /v1/models`. +- Responses echo the requested alias: a call to `gpt-4` returns `gpt-4` in the + response `model` field, not the target name. +- Usage accounting records both sides: requested `gpt-4`, served `my-llama-3`. +- Aliases work for every modality (chat, embeddings, audio, images, and so on). + +## Managing aliases + +You can create, swap, and remove aliases from any of the management surfaces. + +### Web UI + +Open **Add Model** and pick the **Alias / Routing** template, then set a name +and a target. To re-point an existing alias, edit it and change the target. 
+ +### REST API + +- Create: `POST /models/import` +- Swap the target: `PATCH /api/models/config-json/:name` +- List all aliases: `GET /api/aliases` +- Delete: `POST /models/delete/:name` + +### Assistant and MCP + +The LocalAI Assistant (and the MCP server) expose the same operations as tools: +`set_alias`, `list_aliases`, and `delete_model`. + +{{% notice note %}} +**You cannot turn an existing real model into an alias.** If you run `set_alias` +(or `PATCH /api/models/config-json/:name`) against a name that is already a real, +non-alias model, the request is **rejected**. An alias is a pure redirect, so it +must not carry a `backend` or `parameters.model`; a real model does, and merging +an `alias` onto it produces an invalid config that validation refuses with +`alias config ... must not set backend or parameters.model`. This is intentional: +it stops a stray `set_alias` call from clobbering a model that is serving. + +To add an alias, point a **new** name at the target instead of reusing an +existing model's name. Re-pointing an **existing alias** at a different target +is fully supported and is the live-swap path: the alias config has no backend of +its own, so swapping its target stays a valid pure redirect. +{{% /notice %}} + +## Limits + +Aliases are a static 1:1 redirect. For classifier-based or load-balanced +selection across several downstream models, use the intelligent router in the +[Middleware]({{%relref "features/middleware" %}}) feature instead. diff --git a/pkg/mcp/localaitools/client.go b/pkg/mcp/localaitools/client.go index 5ac519aca..f6f6114be 100644 --- a/pkg/mcp/localaitools/client.go +++ b/pkg/mcp/localaitools/client.go @@ -38,6 +38,14 @@ type LocalAIClient interface { ReloadModels(ctx context.Context) error ImportModelURI(ctx context.Context, req ImportModelURIRequest) (*ImportModelURIResponse, error) + // ---- Model aliases ---- + // SetAlias creates the alias `name` pointing at `target`, or swaps an + // existing alias's target. 
The server validates that `target` is an + // existing, non-alias, enabled model. Deletion reuses DeleteModel. + SetAlias(ctx context.Context, name, target string) error + // ListAliases returns every configured alias and its target. + ListAliases(ctx context.Context) ([]AliasInfo, error) + // ---- Backends ---- // ListBackends returns installed backends. The shape stays a thin // localaitools.Backend rather than gallery.SystemBackend because the diff --git a/pkg/mcp/localaitools/coverage_test.go b/pkg/mcp/localaitools/coverage_test.go index ddf5e9c1d..39a2ab544 100644 --- a/pkg/mcp/localaitools/coverage_test.go +++ b/pkg/mcp/localaitools/coverage_test.go @@ -41,6 +41,7 @@ var toolToHTTPRoute = map[string]string{ ToolGetPIIEvents: "GET /api/pii/events", ToolGetMiddlewareStatus: "GET /api/middleware/status", ToolGetRouterDecisions: "GET /api/router/decisions", + ToolListAliases: "GET /api/aliases", // Mutating tools. ToolInstallModel: "POST /models/apply", @@ -53,6 +54,7 @@ var toolToHTTPRoute = map[string]string{ ToolToggleModelState: "PUT /models/toggle-state/:name/:action", ToolToggleModelPinned: "PUT /models/toggle-pinned/:name/:action", ToolSetBranding: "POST /api/settings (instance_name, instance_tagline)", + ToolSetAlias: "PATCH /api/models/config-json/:name (swap) or POST /models/import (create)", } // allKnownTools is the union of expectedFullCatalog (defined in diff --git a/pkg/mcp/localaitools/dto.go b/pkg/mcp/localaitools/dto.go index 77e9a9065..f8aa98eee 100644 --- a/pkg/mcp/localaitools/dto.go +++ b/pkg/mcp/localaitools/dto.go @@ -52,6 +52,14 @@ type ModelConfigView struct { JSON map[string]any `json:"json,omitempty" jsonschema:"Parsed JSON view of the same config (convenience for diffing)."` } +// AliasInfo is one alias -> target pair, the shape list_aliases returns and +// GET /api/aliases emits. Kept aligned with localai.AliasInfo so the +// MCP wire output matches the REST endpoint by construction. 
+type AliasInfo struct { + Name string `json:"name"` + Target string `json:"target"` +} + // InstallModelRequest is the input for install_model. type InstallModelRequest struct { GalleryName string `json:"gallery_name,omitempty" jsonschema:"The gallery the model lives in (from gallery_search). Optional when ModelName is unique across galleries."` diff --git a/pkg/mcp/localaitools/fakes_test.go b/pkg/mcp/localaitools/fakes_test.go index 3d76ae8b9..388245ad2 100644 --- a/pkg/mcp/localaitools/fakes_test.go +++ b/pkg/mcp/localaitools/fakes_test.go @@ -32,6 +32,8 @@ type fakeClient struct { importModelURI func(ImportModelURIRequest) (*ImportModelURIResponse, error) deleteModel func(string) error editModelConfig func(string, map[string]any) error + setAlias func(string, string) error + listAliases func() ([]AliasInfo, error) reloadModels func() error listBackends func() ([]Backend, error) listKnownBackends func() ([]schema.KnownBackend, error) @@ -143,6 +145,22 @@ func (f *fakeClient) EditModelConfig(_ context.Context, name string, patch map[s return nil } +func (f *fakeClient) SetAlias(_ context.Context, name, target string) error { + f.record("SetAlias", []any{name, target}) + if f.setAlias != nil { + return f.setAlias(name, target) + } + return nil +} + +func (f *fakeClient) ListAliases(_ context.Context) ([]AliasInfo, error) { + f.record("ListAliases", nil) + if f.listAliases != nil { + return f.listAliases() + } + return []AliasInfo{}, nil +} + func (f *fakeClient) ReloadModels(_ context.Context) error { f.record("ReloadModels", nil) if f.reloadModels != nil { diff --git a/pkg/mcp/localaitools/httpapi/client.go b/pkg/mcp/localaitools/httpapi/client.go index d2947a5b1..90ec332e2 100644 --- a/pkg/mcp/localaitools/httpapi/client.go +++ b/pkg/mcp/localaitools/httpapi/client.go @@ -338,6 +338,42 @@ func (c *Client) ReloadModels(ctx context.Context) error { return c.do(ctx, http.MethodPost, routeModelsReload, nil, nil) } +// ---- Model aliases ---- + +// SetAlias is 
swap-first: it PATCHes the alias config (a deep-merge that +// validates the target and preserves any other fields), and only creates a +// fresh config when the PATCH reports the model doesn't exist yet. We prefer +// PATCH over POST /models/import for existing names because import rewrites +// the whole file, whereas PATCH gives a reliable 404 not-found signal +// (ErrHTTPNotFound) to branch on and never clobbers an existing config. +func (c *Client) SetAlias(ctx context.Context, name, target string) error { + if name == "" { + return errors.New("name is required") + } + if target == "" { + return errors.New("target is required") + } + err := c.do(ctx, http.MethodPatch, routeModelConfigJSON(name), map[string]any{"alias": target}, nil) + if err == nil { + return nil + } + if !errors.Is(err, ErrHTTPNotFound) { + return err + } + // No such config yet: create it. The import endpoint validates the alias + // target server-side, same as the PATCH path. + return c.do(ctx, http.MethodPost, routeModelImport, map[string]any{"name": name, "alias": target}, nil) +} + +func (c *Client) ListAliases(ctx context.Context) ([]localaitools.AliasInfo, error) { + // /api/aliases returns []{name,target} directly - pass it through. 
+ var out []localaitools.AliasInfo + if err := c.do(ctx, http.MethodGet, routeAliases, nil, &out); err != nil { + return nil, err + } + return out, nil +} + // ---- Backends ---- func (c *Client) ListBackends(ctx context.Context) ([]localaitools.Backend, error) { diff --git a/pkg/mcp/localaitools/httpapi/client_test.go b/pkg/mcp/localaitools/httpapi/client_test.go index 6e6fc3972..319ceffee 100644 --- a/pkg/mcp/localaitools/httpapi/client_test.go +++ b/pkg/mcp/localaitools/httpapi/client_test.go @@ -166,6 +166,92 @@ var _ = Describe("httpapi.Client against the LocalAI admin REST surface", func() }) }) +var _ = Describe("Model aliases", func() { + Describe("ListAliases", func() { + It("passes the GET /api/aliases payload through unchanged", func() { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + Expect(r.Method).To(Equal(http.MethodGet)) + Expect(r.URL.Path).To(Equal("/api/aliases")) + _ = json.NewEncoder(w).Encode([]map[string]any{ + {"name": "gpt-4", "target": "qwen"}, + }) + })) + DeferCleanup(srv.Close) + + out, err := New(srv.URL, "").ListAliases(context.Background()) + Expect(err).ToNot(HaveOccurred()) + Expect(out).To(HaveLen(1)) + Expect(out[0].Name).To(Equal("gpt-4")) + Expect(out[0].Target).To(Equal("qwen")) + }) + }) + + Describe("SetAlias", func() { + It("swaps an existing alias via PATCH without falling back to import", func() { + var patched, imported bool + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodPatch && r.URL.Path == "/api/models/config-json/gpt-4": + patched = true + var body map[string]any + Expect(json.NewDecoder(r.Body).Decode(&body)).To(Succeed()) + Expect(body).To(HaveKeyWithValue("alias", "qwen")) + _ = json.NewEncoder(w).Encode(map[string]any{"success": true}) + case r.URL.Path == "/models/import": + imported = true + w.WriteHeader(http.StatusOK) + default: + http.Error(w, "unexpected", http.StatusTeapot) 
+ } + })) + DeferCleanup(srv.Close) + + Expect(New(srv.URL, "").SetAlias(context.Background(), "gpt-4", "qwen")).To(Succeed()) + Expect(patched).To(BeTrue(), "PATCH should be attempted first") + Expect(imported).To(BeFalse(), "import must not run when PATCH succeeds") + }) + + It("creates a fresh alias via import when PATCH reports the model is missing", func() { + var imported bool + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodPatch: + http.Error(w, "model configuration not found", http.StatusNotFound) + case r.Method == http.MethodPost && r.URL.Path == "/models/import": + imported = true + var body map[string]any + Expect(json.NewDecoder(r.Body).Decode(&body)).To(Succeed()) + Expect(body).To(HaveKeyWithValue("name", "gpt-4")) + Expect(body).To(HaveKeyWithValue("alias", "qwen")) + _ = json.NewEncoder(w).Encode(map[string]any{"success": true}) + default: + http.Error(w, "unexpected", http.StatusTeapot) + } + })) + DeferCleanup(srv.Close) + + Expect(New(srv.URL, "").SetAlias(context.Background(), "gpt-4", "qwen")).To(Succeed()) + Expect(imported).To(BeTrue(), "import should create the alias on a 404") + }) + + It("surfaces a non-404 PATCH error without attempting import", func() { + var imported bool + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/models/import" { + imported = true + } + http.Error(w, "target is an alias", http.StatusBadRequest) + })) + DeferCleanup(srv.Close) + + err := New(srv.URL, "").SetAlias(context.Background(), "gpt-4", "bad") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("target is an alias")) + Expect(imported).To(BeFalse(), "a 400 swap error must not trigger create") + }) + }) +}) + var _ = Describe("ErrHTTPNotFound", func() { Context("on a clean 404 status", func() { var ( diff --git a/pkg/mcp/localaitools/httpapi/routes.go 
b/pkg/mcp/localaitools/httpapi/routes.go index 79504dc1b..cc552b728 100644 --- a/pkg/mcp/localaitools/httpapi/routes.go +++ b/pkg/mcp/localaitools/httpapi/routes.go @@ -16,6 +16,8 @@ const ( routeModelsAvail = "/models/available" routeModelsGall = "/models/galleries" routeModelsImport = "/models/import-uri" + routeModelImport = "/models/import" + routeAliases = "/api/aliases" routeModelsReload = "/models/reload" routeBackends = "/backends" routeBackendsKnown = "/backends/known" diff --git a/pkg/mcp/localaitools/inproc/client.go b/pkg/mcp/localaitools/inproc/client.go index 6e047d751..e62934ccc 100644 --- a/pkg/mcp/localaitools/inproc/client.go +++ b/pkg/mcp/localaitools/inproc/client.go @@ -9,6 +9,8 @@ import ( "encoding/json" "errors" "fmt" + "os" + "path/filepath" "github.com/google/uuid" "github.com/mudler/LocalAI/core/config" @@ -25,7 +27,9 @@ import ( localaitools "github.com/mudler/LocalAI/pkg/mcp/localaitools" "github.com/mudler/LocalAI/pkg/model" "github.com/mudler/LocalAI/pkg/system" + "github.com/mudler/LocalAI/pkg/utils" "github.com/mudler/LocalAI/pkg/vram" + "gopkg.in/yaml.v3" ) // Client implements localaitools.LocalAIClient by calling LocalAI services @@ -298,6 +302,80 @@ func (c *Client) ReloadModels(_ context.Context) error { return c.ConfigLoader.LoadModelConfigsFromPath(c.SystemState.Model.ModelsPath) } +// ---- Model aliases ---- + +// SetAlias is swap-first to match the httpapi client: PatchConfig swaps an +// existing alias's target (validating it and preserving other fields) and +// returns ErrNotFound when the config doesn't exist yet, which is the signal +// to create it. createAlias mirrors the create path of ImportModelEndpoint. 
+func (c *Client) SetAlias(ctx context.Context, name, target string) error { + if name == "" { + return errors.New("name is required") + } + if target == "" { + return errors.New("target is required") + } + _, err := c.modelAdmin.PatchConfig(ctx, name, map[string]any{"alias": target}) + if err == nil { + return nil + } + if !errors.Is(err, modeladmin.ErrNotFound) { + return err + } + return c.createAlias(name, target) +} + +// createAlias writes a fresh `{name, alias}` config to disk and reloads, +// mirroring localai.ImportModelEndpoint's create path: validate, validate the +// alias target, verify the path is trusted, write, reload, best-effort preload. +func (c *Client) createAlias(name, target string) error { + if c.SystemState == nil { + return errors.New("system state not available") + } + cfg := config.ModelConfig{Name: name, Alias: target} + if valid, vErr := cfg.Validate(); !valid { + if vErr != nil { + return vErr + } + return errors.New("invalid alias configuration") + } + if err := c.ConfigLoader.ValidateAliasTarget(&cfg); err != nil { + return err + } + modelsPath := c.SystemState.Model.ModelsPath + if err := utils.VerifyPath(name+".yaml", modelsPath); err != nil { + return fmt.Errorf("model path not trusted: %w", err) + } + // Marshal only the user-provided fields (not the full struct with Go + // zero values), matching what the import endpoint persists for an alias. + yamlData, err := yaml.Marshal(map[string]any{"name": name, "alias": target}) + if err != nil { + return fmt.Errorf("marshal alias config: %w", err) + } + // 0600: the LocalAI process is the sole reader/writer of model configs, + // and a tighter mode keeps the gosec G306 scan clean for this new write. 
+ if err := os.WriteFile(filepath.Join(modelsPath, name+".yaml"), yamlData, 0600); err != nil { + return fmt.Errorf("write alias config: %w", err) + } + if err := c.ConfigLoader.LoadModelConfigsFromPath(modelsPath, c.AppConfig.ToConfigLoaderOptions()...); err != nil { + return fmt.Errorf("reload configs: %w", err) + } + // Preload is best-effort - a failure here doesn't undo the create. + _ = c.ConfigLoader.Preload(modelsPath) + return nil +} + +func (c *Client) ListAliases(_ context.Context) ([]localaitools.AliasInfo, error) { + // Mirror localai.ListAliasesEndpoint: every config whose Alias is set. + out := []localaitools.AliasInfo{} + for _, cfg := range c.ConfigLoader.GetAllModelsConfigs() { + if cfg.IsAlias() { + out = append(out, localaitools.AliasInfo{Name: cfg.Name, Target: cfg.Alias}) + } + } + return out, nil +} + // ---- Backends ---- func (c *Client) ListBackends(_ context.Context) ([]localaitools.Backend, error) { diff --git a/pkg/mcp/localaitools/inproc/client_test.go b/pkg/mcp/localaitools/inproc/client_test.go index 1da00602a..e385897c7 100644 --- a/pkg/mcp/localaitools/inproc/client_test.go +++ b/pkg/mcp/localaitools/inproc/client_test.go @@ -3,6 +3,8 @@ package inproc import ( "context" "errors" + "os" + "path/filepath" "time" . 
"github.com/onsi/ginkgo/v2" @@ -47,3 +49,78 @@ var _ = Describe("inproc.Client cancellation", func() { Expect(errors.Is(err, context.Canceled)).To(BeTrue(), "got: %v", err) }) }) + +var _ = Describe("inproc.Client model aliases", func() { + var ( + ctx context.Context + tempDir string + cl *config.ModelConfigLoader + c *Client + seedModel func(name, body string) + ) + + BeforeEach(func() { + ctx = context.Background() + tempDir = GinkgoT().TempDir() + systemState, err := system.GetSystemState(system.WithModelPath(tempDir)) + Expect(err).ToNot(HaveOccurred()) + appConfig := config.NewApplicationConfig(config.WithSystemState(systemState)) + cl = config.NewModelConfigLoader(tempDir) + // Gallery/model loaders are unused by the alias methods, so nil is fine. + c = New(appConfig, systemState, cl, nil, nil) + + seedModel = func(name, body string) { + Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(body), 0644)).To(Succeed()) + Expect(cl.LoadModelConfigsFromPath(tempDir)).To(Succeed()) + } + }) + + Describe("ListAliases", func() { + It("returns only configs whose alias field is set", func() { + seedModel("real", "name: real\nbackend: llama-cpp\n") + seedModel("gpt-4", "name: gpt-4\nalias: real\n") + + out, err := c.ListAliases(ctx) + Expect(err).ToNot(HaveOccurred()) + Expect(out).To(ConsistOf(localaitools.AliasInfo{Name: "gpt-4", Target: "real"})) + }) + + It("returns an empty slice when there are no aliases", func() { + seedModel("real", "name: real\nbackend: llama-cpp\n") + out, err := c.ListAliases(ctx) + Expect(err).ToNot(HaveOccurred()) + Expect(out).To(BeEmpty()) + }) + }) + + Describe("SetAlias", func() { + It("creates a new alias config on disk when the name is unused", func() { + seedModel("real", "name: real\nbackend: llama-cpp\n") + + Expect(c.SetAlias(ctx, "gpt-4", "real")).To(Succeed()) + + Expect(filepath.Join(tempDir, "gpt-4.yaml")).To(BeAnExistingFile()) + out, err := c.ListAliases(ctx) + Expect(err).ToNot(HaveOccurred()) + 
Expect(out).To(ConsistOf(localaitools.AliasInfo{Name: "gpt-4", Target: "real"})) + }) + + It("swaps an existing alias's target in place", func() { + seedModel("real", "name: real\nbackend: llama-cpp\n") + seedModel("other", "name: other\nbackend: llama-cpp\n") + seedModel("gpt-4", "name: gpt-4\nalias: real\n") + + Expect(c.SetAlias(ctx, "gpt-4", "other")).To(Succeed()) + + out, err := c.ListAliases(ctx) + Expect(err).ToNot(HaveOccurred()) + Expect(out).To(ConsistOf(localaitools.AliasInfo{Name: "gpt-4", Target: "other"})) + }) + + It("rejects an alias whose target does not exist", func() { + err := c.SetAlias(ctx, "gpt-4", "missing") + Expect(err).To(HaveOccurred()) + Expect(filepath.Join(tempDir, "gpt-4.yaml")).ToNot(BeAnExistingFile()) + }) + }) +}) diff --git a/pkg/mcp/localaitools/server.go b/pkg/mcp/localaitools/server.go index fd9f5da00..4b662f66b 100644 --- a/pkg/mcp/localaitools/server.go +++ b/pkg/mcp/localaitools/server.go @@ -43,6 +43,7 @@ func NewServer(client LocalAIClient, opts Options) *mcp.Server { }) registerModelTools(srv, client, opts) + registerAliasTools(srv, client, opts) registerBackendTools(srv, client, opts) registerConfigTools(srv, client, opts) registerSystemTools(srv, client, opts) diff --git a/pkg/mcp/localaitools/server_test.go b/pkg/mcp/localaitools/server_test.go index eb1579449..052ca1e8b 100644 --- a/pkg/mcp/localaitools/server_test.go +++ b/pkg/mcp/localaitools/server_test.go @@ -88,10 +88,12 @@ var expectedFullCatalog = sortedStrings( ToolInstallModel, ToolListBackends, ToolListGalleries, + ToolListAliases, ToolListInstalledModels, ToolListKnownBackends, ToolListNodes, ToolReloadModels, + ToolSetAlias, ToolSetBranding, ToolSystemInfo, ToolToggleModelPinned, @@ -110,6 +112,7 @@ var expectedReadOnlyCatalog = sortedStrings( ToolGetPIIEvents, ToolGetRouterDecisions, ToolGetUsageStats, + ToolListAliases, ToolListBackends, ToolListGalleries, ToolListInstalledModels, @@ -165,6 +168,8 @@ var _ = Describe("Tool dispatch", func() { 
{ToolReloadModels, struct{}{}, "ReloadModels"}, {ToolToggleModelState, map[string]any{"name": "foo", "action": "enable"}, "ToggleModelState"}, {ToolToggleModelPinned, map[string]any{"name": "foo", "action": "pin"}, "ToggleModelPinned"}, + {ToolSetAlias, map[string]any{"name": "gpt-4", "target": "real"}, "SetAlias"}, + {ToolListAliases, struct{}{}, "ListAliases"}, } for _, c := range cases { diff --git a/pkg/mcp/localaitools/tools.go b/pkg/mcp/localaitools/tools.go index c7bf620c3..263bd791e 100644 --- a/pkg/mcp/localaitools/tools.go +++ b/pkg/mcp/localaitools/tools.go @@ -36,6 +36,11 @@ const ( ToolToggleModelState = "toggle_model_state" ToolToggleModelPinned = "toggle_model_pinned" ToolSetBranding = "set_branding" + ToolSetAlias = "set_alias" + + // ToolListAliases is read-only but lives here so the alias tools stay + // grouped; the catalog tests assert its read-only placement. + ToolListAliases = "list_aliases" ) // DefaultServerName is the MCP Implementation.Name surfaced when diff --git a/pkg/mcp/localaitools/tools_aliases.go b/pkg/mcp/localaitools/tools_aliases.go new file mode 100644 index 000000000..6b75619c1 --- /dev/null +++ b/pkg/mcp/localaitools/tools_aliases.go @@ -0,0 +1,48 @@ +package localaitools + +import ( + "context" + + "github.com/modelcontextprotocol/go-sdk/mcp" +) + +// registerAliasTools wires the conversational alias-management tools. An +// alias redirects all traffic for one model name to another configured +// model; list_aliases enumerates them, set_alias creates or swaps the +// target. Deletion reuses the existing delete_model tool, which works on +// any config including an alias. 
+func registerAliasTools(s *mcp.Server, client LocalAIClient, opts Options) { + mcp.AddTool(s, &mcp.Tool{ + Name: ToolListAliases, + Description: "List every configured model alias and the target model it routes to.", + }, func(ctx context.Context, _ *mcp.CallToolRequest, _ struct{}) (*mcp.CallToolResult, any, error) { + aliases, err := client.ListAliases(ctx) + if err != nil { + return errorResult(err), nil, nil + } + return jsonResult(aliases), nil, nil + }) + + if opts.DisableMutating { + return + } + + mcp.AddTool(s, &mcp.Tool{ + Name: ToolSetAlias, + Description: "Create a model alias (name -> target) or swap an existing alias's target. The target must be an existing, non-alias, enabled model. Requires user confirmation per safety rule 1.", + }, func(ctx context.Context, _ *mcp.CallToolRequest, args struct { + Name string `json:"name" jsonschema:"The alias name clients will call."` + Target string `json:"target" jsonschema:"The existing model the alias routes to."` + }) (*mcp.CallToolResult, any, error) { + if args.Name == "" { + return errorResultf("name is required"), nil, nil + } + if args.Target == "" { + return errorResultf("target is required"), nil, nil + } + if err := client.SetAlias(ctx, args.Name, args.Target); err != nil { + return errorResult(err), nil, nil + } + return jsonResult(AliasInfo{Name: args.Name, Target: args.Target}), nil, nil + }) +} diff --git a/swagger/docs.go b/swagger/docs.go index 19cb95fd2..20a1f5a3f 100644 --- a/swagger/docs.go +++ b/swagger/docs.go @@ -500,6 +500,25 @@ const docTemplate = `{ } } }, + "/api/aliases": { + "get": { + "tags": [ + "models" + ], + "summary": "List model aliases", + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "array", + "items": { + "$ref": "#/definitions/localai.AliasInfo" + } + } + } + } + } + }, "/api/backend-logs": { "get": { "description": "Returns a sorted list of model IDs that have captured backend process output", @@ -3486,6 +3505,17 @@ const docTemplate = `{ } } 
}, + "localai.AliasInfo": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "target": { + "type": "string" + } + } + }, "localai.BrandingResponse": { "type": "object", "properties": { diff --git a/swagger/swagger.json b/swagger/swagger.json index e23b81cea..09e03581b 100644 --- a/swagger/swagger.json +++ b/swagger/swagger.json @@ -497,6 +497,25 @@ } } }, + "/api/aliases": { + "get": { + "tags": [ + "models" + ], + "summary": "List model aliases", + "responses": { + "200": { + "description": "OK", + "schema": { + "type": "array", + "items": { + "$ref": "#/definitions/localai.AliasInfo" + } + } + } + } + } + }, "/api/backend-logs": { "get": { "description": "Returns a sorted list of model IDs that have captured backend process output", @@ -3483,6 +3502,17 @@ } } }, + "localai.AliasInfo": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "target": { + "type": "string" + } + } + }, "localai.BrandingResponse": { "type": "object", "properties": { diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml index 719b72f6c..a25674539 100644 --- a/swagger/swagger.yaml +++ b/swagger/swagger.yaml @@ -281,6 +281,13 @@ definitions: type: string type: array type: object + localai.AliasInfo: + properties: + name: + type: string + target: + type: string + type: object localai.BrandingResponse: properties: favicon_url: @@ -2780,6 +2787,18 @@ paths: summary: Execute an agent task by name tags: - agent-jobs + /api/aliases: + get: + responses: + "200": + description: OK + schema: + items: + $ref: '#/definitions/localai.AliasInfo' + type: array + summary: List model aliases + tags: + - models /api/backend-logs: get: description: Returns a sorted list of model IDs that have captured backend process From aef10723c9d92f5ef8c3fa1219de4af91ce675a9 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 22:44:44 +0200 Subject: [PATCH 18/99] feat(config): prefix caching default 
+ consolidate scattered defaults (#10415) * feat(config): enable cross-request prefix caching for serving (Phase 2) The llama.cpp backend ships n_cache_reuse=0 (cross-request KV prefix reuse via shifting disabled). Enable it by default (256) so repeated prefixes - system prompts, RAG context, agent scaffolds, multi-turn chat - aren't recomputed. This is the universally-useful part of 'paged attention' (shared-prefix reuse, which the upstream maintainers themselves identify as where paged attn actually helps) and needs none of the block-KV machinery. Lives in a serving_defaults.go sibling to hardware_defaults.go (device-driven vs serving-policy defaults); both run from SetDefaults and only fill unset values. Explicit cache_reuse/n_cache_reuse always wins. Device-independent, so it propagates to distributed nodes via the model options with no router change. Shares the backendOptionSet helper with the Phase-1 parallel default. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto * refactor(config): extract generic fallback defaults into ApplyGenericDefaults Behavior-preserving: move the inline sampling-param + runtime-flag fallbacks out of SetDefaults into ApplyGenericDefaults, completing the domain-grouped tiers (ApplyInferenceDefaults=family, ApplyHardwareDefaults=device, ApplyServingDefaults =serving, ApplyGenericDefaults=generic fallbacks). SetDefaults is now a clean orchestrator. Same order (runs after the family/hardware/serving tiers so those win) and same conditions (TopK gated on UsesLlamaSamplerDefaults, MMap on XPU). No behavior change; full config suite green. (NGPULayers stays in the GGUF-read path for now - it's device-driven but coupled to model-size detection; a separate follow-up.) 
Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- core/config/generic_defaults.go | 115 +++++++++++++++++++++++++++ core/config/generic_defaults_test.go | 36 +++++++++ core/config/hardware_defaults.go | 14 +--- core/config/model_config.go | 104 ++---------------------- core/config/serving_defaults.go | 56 +++++++++++++ core/config/serving_defaults_test.go | 30 +++++++ 6 files changed, 246 insertions(+), 109 deletions(-) create mode 100644 core/config/generic_defaults.go create mode 100644 core/config/generic_defaults_test.go create mode 100644 core/config/serving_defaults.go create mode 100644 core/config/serving_defaults_test.go diff --git a/core/config/generic_defaults.go b/core/config/generic_defaults.go new file mode 100644 index 000000000..57cfba514 --- /dev/null +++ b/core/config/generic_defaults.go @@ -0,0 +1,115 @@ +package config + +import "os" + +// ApplyGenericDefaults fills the generic fallback values applied after the +// higher-priority tiers (ApplyInferenceDefaults for the model family, +// ApplyHardwareDefaults for the device, ApplyServingDefaults for serving +// policy): sampling parameters and a few runtime flags. Like the other tiers it +// only fills values still left unset, so model-family / explicit config wins. 
+func ApplyGenericDefaults(cfg *ModelConfig) { + if cfg == nil { + return + } + + // https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22 + defaultTopP := 0.95 + defaultTopK := 40 + defaultMinP := 0.0 + defaultTemp := 0.9 + // https://github.com/mudler/LocalAI/issues/2780 + defaultMirostat := 0 + defaultMirostatTAU := 5.0 + defaultMirostatETA := 0.1 + defaultTypicalP := 1.0 + defaultTFZ := 1.0 + defaultZero := 0 + + trueV := true + falseV := false + + if cfg.Seed == nil { + // random number generator seed + defaultSeed := RAND_SEED + cfg.Seed = &defaultSeed + } + + // top_k=40 is llama.cpp's sampling default and is wrong for backends whose + // native default differs (issue #6632). Only inject it for the llama.cpp + // family and the empty/auto backend; leave TopK nil for known non-llama + // backends (e.g. mlx, whose intended default is top_k=0) so the wire value + // is 0 rather than a silently-changed 40. + if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) { + cfg.TopK = &defaultTopK + } + + if cfg.MinP == nil { + cfg.MinP = &defaultMinP + } + + if cfg.TypicalP == nil { + cfg.TypicalP = &defaultTypicalP + } + + if cfg.TFZ == nil { + cfg.TFZ = &defaultTFZ + } + + if cfg.MMap == nil { + // MMap is enabled by default + + // Only exception is for Intel GPUs + if os.Getenv("XPU") != "" { + cfg.MMap = &falseV + } else { + cfg.MMap = &trueV + } + } + + if cfg.MMlock == nil { + // MMlock is disabled by default + cfg.MMlock = &falseV + } + + if cfg.TopP == nil { + cfg.TopP = &defaultTopP + } + if cfg.Temperature == nil { + cfg.Temperature = &defaultTemp + } + + if cfg.Maxtokens == nil { + cfg.Maxtokens = &defaultZero + } + + if cfg.Mirostat == nil { + cfg.Mirostat = &defaultMirostat + } + + if cfg.MirostatETA == nil { + cfg.MirostatETA = &defaultMirostatETA + } + + if cfg.MirostatTAU == nil { + cfg.MirostatTAU = &defaultMirostatTAU + } + + if cfg.LowVRAM == nil { + cfg.LowVRAM = &falseV + } + + if 
cfg.Embeddings == nil { + cfg.Embeddings = &falseV + } + + if cfg.Reranking == nil { + cfg.Reranking = &falseV + } + + if cfg.PromptCacheAll == nil { + // Match upstream llama.cpp's default (common/common.h: cache_prompt = true) + // and let cache_idle_slots / kv_unified actually do useful work; users can + // opt out with an explicit `prompt_cache_all: false` in the model YAML. + cfg.PromptCacheAll = &trueV + } +} diff --git a/core/config/generic_defaults_test.go b/core/config/generic_defaults_test.go new file mode 100644 index 000000000..7cb080c0b --- /dev/null +++ b/core/config/generic_defaults_test.go @@ -0,0 +1,36 @@ +package config_test + +import ( + . "github.com/mudler/LocalAI/core/config" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("ApplyGenericDefaults (generic fallback tier)", func() { + It("fills sampling + runtime fallbacks when unset", func() { + cfg := &ModelConfig{} // empty backend uses the llama sampler defaults + ApplyGenericDefaults(cfg) + Expect(cfg.TopP).ToNot(BeNil()) + Expect(*cfg.TopP).To(Equal(0.95)) + Expect(*cfg.TopK).To(Equal(40)) + Expect(*cfg.Temperature).To(Equal(0.9)) + Expect(*cfg.MMap).To(BeTrue()) + Expect(*cfg.MMlock).To(BeFalse()) + Expect(*cfg.PromptCacheAll).To(BeTrue()) + }) + + It("never overrides explicit values", func() { + tk := 7 + tp := 0.5 + cfg := &ModelConfig{} + cfg.TopK = &tk + cfg.TopP = &tp + ApplyGenericDefaults(cfg) + Expect(*cfg.TopK).To(Equal(7)) + Expect(*cfg.TopP).To(Equal(0.5)) + }) + + It("no-ops on nil", func() { + Expect(func() { ApplyGenericDefaults(nil) }).ToNot(Panic()) + }) +}) diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go index 2ed54265f..114785ce4 100644 --- a/core/config/hardware_defaults.go +++ b/core/config/hardware_defaults.go @@ -111,19 +111,9 @@ func EnsureParallelOption(opts []string, gpu GPU) []string { } // hasParallelOption reports whether the model already sets parallel/n_parallel -// (backend options are 
"name:value" strings) so we never override an explicit value. +// so we never override an explicit value (helper shared with serving_defaults.go). func hasParallelOption(opts []string) bool { - for _, o := range opts { - name := o - if i := strings.IndexByte(o, ':'); i >= 0 { - name = o[:i] - } - switch strings.TrimSpace(strings.ToLower(name)) { - case "parallel", "n_parallel": - return true - } - } - return false + return backendOptionSet(opts, "parallel", "n_parallel") } // localGPU builds a GPU descriptor from local detection, used by SetDefaults on diff --git a/core/config/model_config.go b/core/config/model_config.go index 50836b99e..9586beea3 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -1126,107 +1126,17 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) { // heuristics for the selected node's GPU before loading. Explicit config wins. ApplyHardwareDefaults(cfg, localGPU()) - // https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22 - defaultTopP := 0.95 - defaultTopK := 40 - defaultMinP := 0.0 - defaultTemp := 0.9 - // https://github.com/mudler/LocalAI/issues/2780 - defaultMirostat := 0 - defaultMirostatTAU := 5.0 - defaultMirostatETA := 0.1 - defaultTypicalP := 1.0 - defaultTFZ := 1.0 - defaultZero := 0 + // Apply serving-policy defaults (device-independent): cross-request prefix + // caching. Propagates to distributed nodes via the model options. + ApplyServingDefaults(cfg) + + // Generic fallback defaults (sampling params + runtime flags), applied after + // the model-family / hardware / serving tiers above. Only fills unset values. + ApplyGenericDefaults(cfg) trueV := true falseV := false - if cfg.Seed == nil { - // random number generator seed - defaultSeed := RAND_SEED - cfg.Seed = &defaultSeed - } - - // top_k=40 is llama.cpp's sampling default and is wrong for backends whose - // native default differs (issue #6632). 
Only inject it for the llama.cpp - // family and the empty/auto backend; leave TopK nil for known non-llama - // backends (e.g. mlx, whose intended default is top_k=0) so the wire value - // is 0 rather than a silently-changed 40. - if cfg.TopK == nil && UsesLlamaSamplerDefaults(cfg.Backend) { - cfg.TopK = &defaultTopK - } - - if cfg.MinP == nil { - cfg.MinP = &defaultMinP - } - - if cfg.TypicalP == nil { - cfg.TypicalP = &defaultTypicalP - } - - if cfg.TFZ == nil { - cfg.TFZ = &defaultTFZ - } - - if cfg.MMap == nil { - // MMap is enabled by default - - // Only exception is for Intel GPUs - if os.Getenv("XPU") != "" { - cfg.MMap = &falseV - } else { - cfg.MMap = &trueV - } - } - - if cfg.MMlock == nil { - // MMlock is disabled by default - cfg.MMlock = &falseV - } - - if cfg.TopP == nil { - cfg.TopP = &defaultTopP - } - if cfg.Temperature == nil { - cfg.Temperature = &defaultTemp - } - - if cfg.Maxtokens == nil { - cfg.Maxtokens = &defaultZero - } - - if cfg.Mirostat == nil { - cfg.Mirostat = &defaultMirostat - } - - if cfg.MirostatETA == nil { - cfg.MirostatETA = &defaultMirostatETA - } - - if cfg.MirostatTAU == nil { - cfg.MirostatTAU = &defaultMirostatTAU - } - - if cfg.LowVRAM == nil { - cfg.LowVRAM = &falseV - } - - if cfg.Embeddings == nil { - cfg.Embeddings = &falseV - } - - if cfg.Reranking == nil { - cfg.Reranking = &falseV - } - - if cfg.PromptCacheAll == nil { - // Match upstream llama.cpp's default (common/common.h: cache_prompt = true) - // and let cache_idle_slots / kv_unified actually do useful work; users can - // opt out with an explicit `prompt_cache_all: false` in the model YAML. 
- cfg.PromptCacheAll = &trueV - } - if threads == 0 { // Threads can't be 0 threads = 4 diff --git a/core/config/serving_defaults.go b/core/config/serving_defaults.go new file mode 100644 index 000000000..3b10e7000 --- /dev/null +++ b/core/config/serving_defaults.go @@ -0,0 +1,56 @@ +package config + +import ( + "fmt" + "strings" + + "github.com/mudler/xlog" +) + +// Serving-policy model-config defaults. +// +// Sibling to hardware_defaults.go: those fill values driven by the target +// *device* (Blackwell batch, VRAM-scaled parallel slots); these fill values +// that improve multi-request / multi-user *serving* regardless of the GPU. They +// run together from SetDefaults and only ever fill values the user left unset. + +// DefaultCacheReuse is the minimum shared-prefix chunk (in tokens) the backend +// reuses across requests via KV-cache shifting. The llama.cpp backend ships this +// disabled (n_cache_reuse = 0); we enable it so repeated prefixes (system +// prompts, RAG context, agent scaffolds, multi-turn chat) are not recomputed. +// This is the universally-useful part of "paged attention" (cross-request prefix +// sharing) and needs none of the block-KV machinery. +const DefaultCacheReuse = 256 + +// ApplyServingDefaults fills serving-policy ModelConfig values the user left +// unset. Currently: enable cross-request prefix caching. Explicit +// cache_reuse/n_cache_reuse in the model options always wins. +func ApplyServingDefaults(cfg *ModelConfig) { + if cfg == nil { + return + } + if !backendOptionSet(cfg.Options, "cache_reuse", "n_cache_reuse") { + cfg.Options = append(cfg.Options, fmt.Sprintf("cache_reuse:%d", DefaultCacheReuse)) + xlog.Debug("[serving_defaults] enabling cross-request prefix cache", + "cache_reuse", DefaultCacheReuse) + } +} + +// backendOptionSet reports whether the backend options already set any of names. +// Options are "name:value" strings (or bare "name"); used so we never override +// an explicit value. 
Shared with hardware_defaults.go. +func backendOptionSet(opts []string, names ...string) bool { + for _, o := range opts { + name := o + if i := strings.IndexByte(o, ':'); i >= 0 { + name = o[:i] + } + name = strings.TrimSpace(strings.ToLower(name)) + for _, n := range names { + if name == n { + return true + } + } + } + return false +} diff --git a/core/config/serving_defaults_test.go b/core/config/serving_defaults_test.go new file mode 100644 index 000000000..2a5bba72a --- /dev/null +++ b/core/config/serving_defaults_test.go @@ -0,0 +1,30 @@ +package config_test + +import ( + . "github.com/mudler/LocalAI/core/config" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("Serving-policy config defaults", func() { + Describe("ApplyServingDefaults (cross-request prefix cache)", func() { + It("enables cache_reuse when unset", func() { + cfg := &ModelConfig{} + ApplyServingDefaults(cfg) + Expect(cfg.Options).To(ContainElement("cache_reuse:256")) + }) + It("never overrides an explicit cache_reuse", func() { + cfg := &ModelConfig{Options: []string{"cache_reuse:0"}} + ApplyServingDefaults(cfg) + Expect(cfg.Options).To(Equal([]string{"cache_reuse:0"})) + }) + It("recognizes the n_cache_reuse alias", func() { + cfg := &ModelConfig{Options: []string{"n_cache_reuse:512"}} + ApplyServingDefaults(cfg) + Expect(cfg.Options).To(Equal([]string{"n_cache_reuse:512"})) + }) + It("no-ops on nil", func() { + Expect(func() { ApplyServingDefaults(nil) }).ToNot(Panic()) + }) + }) +}) From 23f225260c530f1bac8fcba6c3321a022333060f Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 22:58:36 +0200 Subject: [PATCH 19/99] refactor(config): single source of truth for default values (#10418) refactor(config): single source of truth for default values across config + backend Defaults were decided in two areas with duplicated/drifted literals: the config SetDefaults tiers vs core/backend/options.go's 
grpcModelOpts (which translates a ModelConfig to the backend wire format and supplied its own fallbacks). They had drifted - n_gpu_layers 9999999 (options.go) vs 99999999 (gguf.go), two 512 batch constants, context 1024 (gguf) vs 4096 (backend) scattered as bare literals. Introduce core/config/defaults.go as the canonical home (DefaultContextSize=4096, GGUFFallbackContextSize=1024, DefaultNGPULayers=99999999, DefaultFlashAttention= auto). gguf.go / hooks_llamacpp.go use them directly; core/backend references them (backend imports config, never the reverse) so DefaultContextSize/DefaultBatchSize and the flash-attn / n_gpu_layers fallbacks resolve to one place. The two context values (1024 GGUF-no-estimate vs 4096 general) are kept distinct but now named + documented, not blind literals. Behavior-preserving; config + backend suites green. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- core/backend/options.go | 11 ++++++----- core/config/defaults.go | 30 ++++++++++++++++++++++++++++++ core/config/gguf.go | 9 ++------- core/config/hardware_defaults.go | 2 +- core/config/hooks_llamacpp.go | 2 +- 5 files changed, 40 insertions(+), 14 deletions(-) create mode 100644 core/config/defaults.go diff --git a/core/backend/options.go b/core/backend/options.go index efe6c649f..18c3b7f27 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -90,10 +90,11 @@ func getSeed(c config.ModelConfig) int32 { // DefaultContextSize and DefaultBatchSize are the backend's fallbacks when a // model config leaves them unset. Exported so callers that must respect the // effective decode window — notably the router's prompt trimmer — resolve the -// same numbers grpcModelOpts does instead of guessing. +// same numbers grpcModelOpts does instead of guessing. The values are owned by +// core/config (single source of truth shared with the config default tiers). 
const ( - DefaultContextSize = 4096 - DefaultBatchSize = 512 + DefaultContextSize = config.DefaultContextSize + DefaultBatchSize = config.DefaultPhysicalBatch ) // EffectiveContextSize is the context window the backend will run with: the @@ -129,7 +130,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { ctxSize := EffectiveContextSize(c) b := EffectiveBatchSize(c) - flashAttention := "auto" + flashAttention := config.DefaultFlashAttention if c.FlashAttention != nil { flashAttention = *c.FlashAttention @@ -175,7 +176,7 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { mmlock = *c.MMlock } - nGPULayers := 9999999 + nGPULayers := config.DefaultNGPULayers if c.NGPULayers != nil { nGPULayers = *c.NGPULayers } diff --git a/core/config/defaults.go b/core/config/defaults.go new file mode 100644 index 000000000..18625fab3 --- /dev/null +++ b/core/config/defaults.go @@ -0,0 +1,30 @@ +package config + +// Canonical default values. +// +// These are owned here so the two layers that need them share a single source +// of truth: the config tiers (ApplyInference/Hardware/Serving/Generic — which +// *decide* defaults) and core/backend/options.go (which *translates* a +// ModelConfig to the backend wire format and supplies the same fallbacks +// defensively). Previously these were duplicated as literals across both +// packages and had drifted (e.g. n_gpu_layers 9999999 vs 99999999, two batch +// constants of 512). core/backend imports core/config, so backend references +// these; config never imports backend. +const ( + // DefaultContextSize is the fallback context window when none is configured + // or estimable from the model. + DefaultContextSize = 4096 + + // GGUFFallbackContextSize is the context window for a GGUF model whose + // metadata yields no usable estimate (see guessGGUFFromFile). Deliberately + // smaller than DefaultContextSize to stay conservative on memory there. 
+ GGUFFallbackContextSize = 1024 + + // DefaultNGPULayers means "offload all layers"; the backend (fit_params) + // clamps to what actually fits in device memory. + DefaultNGPULayers = 99999999 + + // DefaultFlashAttention is the flash-attention mode default; "auto" lets the + // backend enable it when the model + backend support it. + DefaultFlashAttention = "auto" +) diff --git a/core/config/gguf.go b/core/config/gguf.go index 5e04f5693..16e43c914 100644 --- a/core/config/gguf.go +++ b/core/config/gguf.go @@ -14,11 +14,6 @@ import ( "github.com/gpustack/gguf-parser-go/util/ptr" ) -const ( - defaultContextSize = 1024 - defaultNGPULayers = 99999999 -) - // reservedNonChatModel reports whether the operator reserved this model for an // internal primitive — the router score classifier or the PII NER // token_classify tier. Such a model has no chat template and must not be @@ -38,7 +33,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { cSize := int(ctxSize) cfg.ContextSize = &cSize } else { - defaultCtx = defaultContextSize + defaultCtx = GGUFFallbackContextSize cfg.ContextSize = &defaultCtx } } @@ -52,7 +47,7 @@ func guessGGUFFromFile(cfg *ModelConfig, f *gguf.GGUFFile, defaultCtx int) { if cfg.NGPULayers == nil { // we assume we want to offload all layers - defaultHigh := defaultNGPULayers + defaultHigh := DefaultNGPULayers cfg.NGPULayers = &defaultHigh } diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go index 114785ce4..18c321639 100644 --- a/core/config/hardware_defaults.go +++ b/core/config/hardware_defaults.go @@ -37,7 +37,7 @@ type GPU struct { // Physical batch (n_batch / n_ubatch) defaults. const ( // DefaultPhysicalBatch is the conservative default when no hardware-specific - // tuning applies. Matches backend.DefaultBatchSize. + // tuning applies. core/backend.DefaultBatchSize references this (single source). 
DefaultPhysicalBatch = 512 // BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs // (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical diff --git a/core/config/hooks_llamacpp.go b/core/config/hooks_llamacpp.go index 4ced8a9b1..09bdbe868 100644 --- a/core/config/hooks_llamacpp.go +++ b/core/config/hooks_llamacpp.go @@ -34,7 +34,7 @@ func llamaCppDefaults(cfg *ModelConfig, modelPath string) { // Default context size if not set, regardless of whether GGUF parsing succeeds defer func() { if cfg.ContextSize == nil { - ctx := defaultContextSize + ctx := GGUFFallbackContextSize cfg.ContextSize = &ctx } }() From 3e96d811b7c507dd5e093f0625c00fff16d4a514 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 20 Jun 2026 23:25:29 +0200 Subject: [PATCH 20/99] fix(ui): keep row action menu anchored and stop scroll snap on /app/manage (#10419) Opening a model row's kebab (ActionMenu) on the Manage dashboard snapped the page scroll to the top and rendered the menu detached from its trigger, making it impossible to operate. Two compounding causes: - The menu auto-focus called el.focus() without preventScroll, so the browser scrolled the focused element into view, yanking the page to the top. - The position:fixed Popover was rendered inline inside the table row. The editorial UI overhaul added hover transforms to rows/cards, and a transformed ancestor re-anchors position:fixed to itself instead of the viewport, so the menu (positioned from the trigger's viewport rect) landed in the wrong place. Fix: portal the Popover to document.body so position:fixed always resolves against the viewport, position it before paint with useLayoutEffect (no {0,0} flash), and pass preventScroll:true to both focus calls. Adds an e2e regression test that reproduces the symptom (scroll jumped from 564 to 0 on the old code) and asserts the menu tracks its trigger. 
Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- .../e2e/manage-action-menu-position.spec.js | 50 +++++++++++++++++++ .../react-ui/src/components/ActionMenu.jsx | 6 ++- core/http/react-ui/src/components/Popover.jsx | 21 ++++++-- 3 files changed, 70 insertions(+), 7 deletions(-) create mode 100644 core/http/react-ui/e2e/manage-action-menu-position.spec.js diff --git a/core/http/react-ui/e2e/manage-action-menu-position.spec.js b/core/http/react-ui/e2e/manage-action-menu-position.spec.js new file mode 100644 index 000000000..3f4301abe --- /dev/null +++ b/core/http/react-ui/e2e/manage-action-menu-position.spec.js @@ -0,0 +1,50 @@ +import { test, expect } from './coverage-fixtures.js' + +// Regression: opening a row's kebab (ActionMenu) on /app/manage used to snap +// the page scroll to the top and render the menu detached from its trigger, +// making it impossible to operate. Two causes: the menu auto-focus scrolled +// the page (no preventScroll), and the position:fixed popover was rendered +// inside a row whose hover `transform` re-anchored it. Fix portals the popover +// to document.body, positions it before paint, and focuses without scrolling. +test.describe('Manage Page - Action menu positioning', () => { + test('opening a row menu keeps scroll stable and places the menu by its trigger', async ({ page }) => { + // Small viewport so the page is scrollable and a scroll jump is observable. + await page.setViewportSize({ width: 1024, height: 500 }) + await page.goto('/app/manage') + await expect(page.locator('.table')).toBeVisible({ timeout: 10_000 }) + + const trigger = page.locator('button.action-menu__trigger').first() + await expect(trigger).toBeVisible() + + // Bring the trigger into view ourselves first, so the only scroll we then + // measure is the one the menu would (wrongly) cause - not Playwright's own + // scroll-into-view before the click. 
+ await trigger.scrollIntoViewIfNeeded() + const scrollBefore = await page.evaluate(() => window.scrollY) + await trigger.click() + + const menu = page.locator('[role="menu"]') + await expect(menu).toBeVisible() + + // Behavioural symptom 1: focusing the menu must not yank the page scroll. + const scrollAfter = await page.evaluate(() => window.scrollY) + expect(scrollAfter).toBe(scrollBefore) + + // Behavioural symptom 2: the menu must sit next to its trigger, not float + // at the top of the window where it can't be operated. + const triggerBox = await trigger.boundingBox() + const menuBox = await menu.boundingBox() + expect(triggerBox).not.toBeNull() + expect(menuBox).not.toBeNull() + // Menu top is within ~24px of the trigger's bottom (below) or above it + // (flipped) — in all cases it tracks the trigger, never floating at y≈0. + const tracksTrigger = + Math.abs(menuBox.y - (triggerBox.y + triggerBox.height)) < 24 || + Math.abs((menuBox.y + menuBox.height) - triggerBox.y) < 24 + expect(tracksTrigger).toBe(true) + + // Mechanism: the popover must be portaled to document.body so position:fixed + // resolves against the viewport, not a transformed ancestor row. + await expect(page.locator('body > .popover')).toHaveCount(1) + }) +}) diff --git a/core/http/react-ui/src/components/ActionMenu.jsx b/core/http/react-ui/src/components/ActionMenu.jsx index 5c58ecd78..55010102c 100644 --- a/core/http/react-ui/src/components/ActionMenu.jsx +++ b/core/http/react-ui/src/components/ActionMenu.jsx @@ -95,9 +95,11 @@ export default function ActionMenu({ items, ariaLabel = 'Actions', triggerLabel, className="action-menu" onKeyDown={handleMenuKeyDown} // Capture focus when the menu opens so arrow keys work without the - // user clicking inside first. + // user clicking inside first. preventScroll: the popover is portaled + // and positioned by the trigger rect, so focusing it must not scroll + // the page (that yanked the view to the top before it was placed). 
tabIndex={-1} - ref={el => { if (el && open) el.focus() }} + ref={el => { if (el && open) el.focus({ preventScroll: true }) }} > {visible.map((item, i) => { if (item.divider) { diff --git a/core/http/react-ui/src/components/Popover.jsx b/core/http/react-ui/src/components/Popover.jsx index 96a9e217e..7d002d348 100644 --- a/core/http/react-ui/src/components/Popover.jsx +++ b/core/http/react-ui/src/components/Popover.jsx @@ -1,10 +1,17 @@ -import { useEffect, useRef, useState, useCallback } from 'react' +import { useEffect, useLayoutEffect, useRef, useState, useCallback } from 'react' +import { createPortal } from 'react-dom' // Minimal popover: positions itself below-right of the trigger's bounding box, // flips above when there isn't room below, closes on outside click or Escape, // returns focus to the trigger. Uses the existing .card surface so it picks // up theme/border/shadow automatically — no new theming work. // +// Rendered through a portal on document.body: the popover is position:fixed and +// positioned from the trigger's viewport rect, so it must escape any ancestor +// that establishes a containing block (a row/card with a hover `transform` +// would otherwise re-anchor `position:fixed` to itself, throwing the menu to +// the wrong spot and making it unusable). +// // Props: // anchor: ref to the trigger DOMElement (required) // open: boolean @@ -30,7 +37,9 @@ export default function Popover({ anchor, open, onClose, children, ariaLabel }) setPos({ top, left: Math.max(8, left), flipped }) }, [anchor]) - useEffect(() => { + // useLayoutEffect so we measure + place the popover before the browser + // paints — otherwise it flashes at its initial {0,0} for a frame. 
+ useLayoutEffect(() => { if (!open) return reposition() window.addEventListener('resize', reposition) @@ -65,14 +74,15 @@ export default function Popover({ anchor, open, onClose, children, ariaLabel }) if (!open && anchor?.current) { // requestAnimationFrame so the close is painted before focus jumps; // otherwise screen readers announce the trigger mid-transition. - const raf = requestAnimationFrame(() => anchor.current?.focus?.()) + // preventScroll: focusing the trigger must not yank the page scroll. + const raf = requestAnimationFrame(() => anchor.current?.focus?.({ preventScroll: true })) return () => cancelAnimationFrame(raf) } }, [open, anchor]) if (!open) return null - return ( + return createPortal(
{children} -
+ , + document.body ) } From c6303104c77040c8d16e7ae226b3b783d25b1e3e Mon Sep 17 00:00:00 2001 From: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> Date: Sun, 21 Jun 2026 17:02:31 +0200 Subject: [PATCH 21/99] fix(vllm): structured outputs silently ignored on vLLM >= 0.23 (GuidedDecodingParams removed) (#10343) fix(vllm): structured outputs silently ignored on vLLM >= 0.23 vLLM >= 0.23 removed GuidedDecodingParams (now StructuredOutputsParams) and renamed the SamplingParams field guided_decoding -> structured_outputs. The import failed, HAS_GUIDED_DECODING became False, and the whole guided-decoding block was skipped, so response_format / grammar constraints were silently ignored. Adapt the existing request.Grammar path to the new class/field. Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> --- backend/python/vllm/backend.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 5d5662857..20064e233 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -48,8 +48,10 @@ try: except ImportError: HAS_REASONING_PARSERS = False +# vLLM >= 0.23 renamed GuidedDecodingParams -> StructuredOutputsParams and the +# SamplingParams field guided_decoding -> structured_outputs. 
try: - from vllm.sampling_params import GuidedDecodingParams + from vllm.sampling_params import StructuredOutputsParams HAS_GUIDED_DECODING = True except ImportError: HAS_GUIDED_DECODING = False @@ -536,13 +538,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): if value not in (None, 0, [], False): setattr(sampling_params, param_field, value) - # Guided decoding: use Grammar field to pass JSON schema or BNF + # Structured-output decoding: use Grammar field to pass JSON schema or BNF if HAS_GUIDED_DECODING and request.Grammar: try: json.loads(request.Grammar) # valid JSON = JSON schema - sampling_params.guided_decoding = GuidedDecodingParams(json=request.Grammar) + sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar) except json.JSONDecodeError: - sampling_params.guided_decoding = GuidedDecodingParams(grammar=request.Grammar) + sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar) # Extract image paths and process images prompt = request.Prompt From cf7f9573a2a1e3d01927c3ebb785623e47822684 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=95=AA=E8=8C=84=E6=91=94=E6=88=90=E7=95=AA=E8=8C=84?= =?UTF-8?q?=E9=85=B1?= <68098251+fqscfqj@users.noreply.github.com> Date: Sun, 21 Jun 2026 23:03:33 +0800 Subject: [PATCH 22/99] fix(crispasr): filter garbage words from parakeet word-level timestamps (#10421) The parakeet-specific word accessors can return stale initialisation data (model name, binary blobs) for segments with no real speech. Add isValidWord() to filter out words that have: - empty or whitespace-only text - U+FFFD replacement characters (from binary data scrubbing) - negative timestamps - zero duration (end <= start) Also skip empty segments entirely when they have no recognisable content (empty text AND no valid words), preventing spurious subtitle entries like '00:45:33,592 --> 00:45:33,592 parakeet@rH\u000b\ufffdI'. Applies to both AudioTranscription and AudioTranscriptionStream. 
Signed-off-by: fqscfqj --- backend/go/crispasr/gocrispasr.go | 54 +++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/backend/go/crispasr/gocrispasr.go b/backend/go/crispasr/gocrispasr.go index af1f1a95c..2cbfb0d4a 100644 --- a/backend/go/crispasr/gocrispasr.go +++ b/backend/go/crispasr/gocrispasr.go @@ -224,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) { }, nil } +// isValidWord reports whether a TranscriptWord contains recognisable speech +// content. The parakeet-specific word accessors can return stale initialisation +// data (model name, binary blobs) when a segment has no real speech. A word is +// considered valid only when: +// - the text is non-empty after trimming, +// - it contains no U+FFFD replacement characters (from binary data scrubbing), +// - both timestamps are non-negative, +// - the word has positive duration (end > start). +func isValidWord(w *pb.TranscriptWord) bool { + txt := strings.TrimSpace(w.Text) + if txt == "" { + return false + } + if strings.ContainsRune(txt, '\uFFFD') { + return false + } + if w.Start < 0 || w.End < 0 || w.End <= w.Start { + return false + } + return true +} + func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) { if err := ctx.Err(); err != nil { return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled") @@ -311,22 +333,35 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe if wordCount == 0 && i == 0 { wordCount = CppGetParakeetWordCount() for j := 0; j < wordCount; j++ { - words = append(words, &pb.TranscriptWord{ + w := &pb.TranscriptWord{ Start: CppGetParakeetWordT0(j) * (10000000), End: CppGetParakeetWordT1(j) * (10000000), Text: strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "�"), - }) + } + if isValidWord(w) { + words = append(words, w) + } } } else { for j := 0; j < wordCount; j++ { - words = 
append(words, &pb.TranscriptWord{ + w := &pb.TranscriptWord{ Start: CppGetWordT0(i, j) * (10000000), End: CppGetWordT1(i, j) * (10000000), Text: strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "�"), - }) + } + if isValidWord(w) { + words = append(words, w) + } } } + // Skip empty segments with no recognisable content (e.g. trailing + // silence segments that parakeet emits with stale init data). + trimmed := strings.TrimSpace(txt) + if trimmed == "" && len(words) == 0 { + continue + } + segment := &pb.TranscriptSegment{ Id: int32(i), Text: txt, @@ -336,7 +371,7 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe segments = append(segments, segment) - text += " " + strings.TrimSpace(txt) + text += " " + trimmed } return pb.TranscriptResult{ @@ -428,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc s := CppGetSegmentStart(i) * 10000000 t := CppGetSegmentEnd(i) * 10000000 txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "�") + + // Skip empty segments (e.g. trailing silence that parakeet emits + // with stale init data). 
+ trimmed := strings.TrimSpace(txt) + if trimmed == "" && s == t { + continue + } + segments = append(segments, &pb.TranscriptSegment{ Id: int32(i), Text: txt, Start: s, End: t, }) - trimmed := strings.TrimSpace(txt) if trimmed == "" { continue } From 01fa12e0dee93fedf777922996aa63995fb7495a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=95=AA=E8=8C=84=E6=91=94=E6=88=90=E7=95=AA=E8=8C=84?= =?UTF-8?q?=E9=85=B1?= <68098251+fqscfqj@users.noreply.github.com> Date: Sun, 21 Jun 2026 23:04:19 +0800 Subject: [PATCH 23/99] feat(nemo): enable word-level timestamps for ASR models (#10297) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(nemo): enable word-level timestamps for ASR models The nemo backend ignored timestamp_granularities and always returned a single segment with start=0 end=0, making word-level timestamps impossible to obtain even though the NeMo models (parakeet-tdt, etc.) fully support them. Changes: - Add _get_stride_seconds() to compute frame duration from the model's preprocessor window_stride and encoder subsampling_factor. - Add _build_segments_with_words() that extracts word offsets from the NeMo Hypothesis.timestamp dict and converts frame indices to nanosecond timestamps. - Support 'word' granularity (one segment per word) and 'segment' granularity (merge at time-gap boundaries using a dynamic threshold). - Populate TranscriptSegment.words with TranscriptWord entries so callers get both segment-level and word-level timing. - Only request timestamps from NeMo when the caller actually asks for them (timestamp_granularities is non-empty), keeping the fast path unchanged for callers that don't need timestamps. Tested with nvidia/parakeet-tdt-0.6b-v3 on the JFK "ask not" clip: curl -X POST /v1/audio/transcriptions \ -F file=@jfk.wav -F model=nemo-parakeet-tdt-0.6b \ -F 'timestamp_granularities[]=word' -F response_format=verbose_json → each word has correct start/end times in seconds. 
Signed-off-by: fqscfqj * fix(nemo): address Copilot review feedback - Narrow exception handling in _get_stride_seconds to catch only AttributeError, KeyError, TypeError instead of bare Exception, and emit a warning when falling back to the hardcoded stride. - Remove explicit return_hypotheses=False when timestamps are requested; timestamps=True already forces NeMo to return Hypothesis objects. - Add a warning when NeMo does not return Hypothesis objects despite timestamps being requested. Signed-off-by: fqscfqj --------- Signed-off-by: fqscfqj --- backend/python/nemo/backend.py | 202 ++++++++++++++++++++++++++++++--- 1 file changed, 186 insertions(+), 16 deletions(-) diff --git a/backend/python/nemo/backend.py b/backend/python/nemo/backend.py index ccbff7cd2..a5c30694e 100644 --- a/backend/python/nemo/backend.py +++ b/backend/python/nemo/backend.py @@ -84,6 +84,135 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): return backend_pb2.Result(message="Model loaded successfully", success=True) + def _get_stride_seconds(self): + """Compute the seconds-per-frame stride for the loaded model. + + stride = preprocessor_window_stride * encoder_subsampling_factor + """ + try: + preprocessor = self.model.preprocessor + window_stride = preprocessor._cfg.get('window_stride', 0.01) + subsampling_factor = getattr(self.model.encoder, 'subsampling_factor', 8) + return window_stride * subsampling_factor + except (AttributeError, KeyError, TypeError) as err: + print( + f"Warning: could not compute stride from model config ({err}), " + f"falling back to 0.08s/frame", + file=sys.stderr, + ) + return 0.08 + + def _build_segments_with_words(self, hypothesis, stride, timestamp_granularities=None): + """Build TranscriptSegment list from a NeMo Hypothesis with timestamps. 
+ + Supports two granularity modes: + - "word": one TranscriptSegment per word, each with a single TranscriptWord entry + - "segment" (default): merge consecutive words into sentence-level segments, + splitting at word-level time gaps that exceed a dynamic threshold. + """ + if not hypothesis or not isinstance(hypothesis.timestamp, dict): + return [] + + word_offsets = hypothesis.timestamp.get('word', []) + if not word_offsets: + return [] + + granularities = list(timestamp_granularities) if timestamp_granularities else [] + granularity = "word" if "word" in granularities else "segment" + + # Build a flat list of (text, start_ns, end_ns) from NeMo word offsets + transcript_words = [] + for wo in word_offsets: + word_text = wo.get('word', '') + if not word_text: + continue + start_offset = wo.get('start_offset', 0) + end_offset = wo.get('end_offset', start_offset) + start_ns = int(start_offset * stride * 1_000_000_000) + end_ns = int(end_offset * stride * 1_000_000_000) + transcript_words.append({ + 'text': word_text, + 'start': start_ns, + 'end': end_ns, + }) + + if not transcript_words: + return [] + + if granularity == "word": + # One segment per word + result = [] + for idx, tw in enumerate(transcript_words): + word = backend_pb2.TranscriptWord( + start=tw['start'], end=tw['end'], text=tw['text'] + ) + result.append(backend_pb2.TranscriptSegment( + id=idx, + start=tw['start'], + end=tw['end'], + text=tw['text'], + words=[word], + )) + return result + + # segment mode — merge at word-level time-gap boundaries + # Compute gap threshold: median inter-word gap * 3, clamped to [0.3, 2.0]s + gaps = [] + for i in range(1, len(transcript_words)): + gap = (transcript_words[i]['start'] - transcript_words[i - 1]['end']) / 1_000_000_000 + if gap > 0: + gaps.append(gap) + if gaps: + gaps.sort() + median_gap = gaps[len(gaps) // 2] + threshold_ns = int(max(0.3, min(median_gap * 3, 2.0)) * 1_000_000_000) + else: + threshold_ns = int(0.5 * 1_000_000_000) + + result = [] + 
buf_words = [] # list of TranscriptWord protobuf + buf_start = None + buf_end = 0 + buf_text = [] + prev_end = None + + for tw in transcript_words: + # Detect word-level time gap + if prev_end is not None and (tw['start'] - prev_end) >= threshold_ns and buf_text: + seg_text = ' '.join(buf_text) + result.append(backend_pb2.TranscriptSegment( + id=len(result), + start=buf_start, + end=buf_end, + text=seg_text, + words=list(buf_words), + )) + buf_words = [] + buf_text = [] + buf_start = None + + if buf_start is None: + buf_start = tw['start'] + buf_end = tw['end'] + buf_text.append(tw['text']) + buf_words.append(backend_pb2.TranscriptWord( + start=tw['start'], end=tw['end'], text=tw['text'] + )) + prev_end = tw['end'] + + # flush remaining + if buf_text and buf_start is not None: + seg_text = ' '.join(buf_text) + result.append(backend_pb2.TranscriptSegment( + id=len(result), + start=buf_start, + end=buf_end, + text=seg_text, + words=list(buf_words), + )) + + return result + def AudioTranscription(self, request, context): result_segments = [] text = "" @@ -93,26 +222,67 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): print(f"Error: Audio file not found: {audio_path}", file=sys.stderr) return backend_pb2.TranscriptResult(segments=[], text="") - # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts - results = self.model.transcribe([audio_path]) + # Determine requested timestamp granularity + timestamp_granularities = list(request.timestamp_granularities) if request.timestamp_granularities else [] + want_timestamps = bool(timestamp_granularities) - if not results or len(results) == 0: - return backend_pb2.TranscriptResult(segments=[], text="") + if want_timestamps: + # Request timestamps from NeMo. + # timestamps=True forces NeMo to return Hypothesis objects with + # the timestamp dict populated, so we omit return_hypotheses to + # let NeMo choose the correct return type. 
+ results = self.model.transcribe([audio_path], timestamps=True) - # Get the transcript text from the first result. - # CTC models return List[str], TDT/RNNT models return List[Hypothesis] - # where the actual text lives in Hypothesis.text. - result = results[0] - if isinstance(result, str): - text = result + if results and len(results) > 0: + hypotheses = results[0] if isinstance(results[0], list) else results + if hypotheses and len(hypotheses) > 0: + hypothesis = hypotheses[0] + + # Hypothesis object should have .timestamp populated + if not hasattr(hypothesis, 'timestamp') or not isinstance(hypothesis.timestamp, dict): + print( + "Warning: timestamps were requested but NeMo did not return " + "Hypothesis objects; falling back to untimestamped output", + file=sys.stderr, + ) + + # Extract text + if hasattr(hypothesis, 'text'): + text = hypothesis.text or "" + elif isinstance(hypothesis, str): + text = hypothesis + + # Build segments with word-level timestamps + stride = self._get_stride_seconds() + result_segments = self._build_segments_with_words( + hypothesis, stride, timestamp_granularities + ) + + # If no word offsets but we have text, fall back to single segment + if not result_segments and text: + result_segments.append(backend_pb2.TranscriptSegment( + id=0, start=0, end=0, text=text + )) else: - text = getattr(result, 'text', None) or "" + # Simple transcription without timestamps + # NEMO's transcribe method accepts a list of audio paths and returns a list of transcripts + results = self.model.transcribe([audio_path]) - if text: - # Create a single segment with the full transcription - result_segments.append(backend_pb2.TranscriptSegment( - id=0, start=0, end=0, text=text - )) + if results and len(results) > 0: + # Get the transcript text from the first result. + # CTC models return List[str], TDT/RNNT models return List[Hypothesis] + # where the actual text lives in Hypothesis.text. 
+ result = results[0] + if isinstance(result, str): + text = result + else: + text = getattr(result, 'text', None) or "" + + if text: + # Create a single segment with the full transcription + result_segments.append(backend_pb2.TranscriptSegment( + id=0, start=0, end=0, text=text + )) except Exception as err: print(f"Error in AudioTranscription: {err}", file=sys.stderr) From b4c0dc67fe8471176bd7bb702333b27527e8d59f Mon Sep 17 00:00:00 2001 From: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> Date: Sun, 21 Jun 2026 17:07:15 +0200 Subject: [PATCH 24/99] feat(vllm): progressive streaming via parser.extract_tool_calls_streaming (follow-up to #10346) (#10351) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(vllm): don't stream raw tool-call markup as content when a tool parser is active When a tool_parser is configured and the request carries tools, the streaming loop emitted every text delta as delta.content — including the model's raw tool-call markup (e.g. ...) — because extract_tool_calls only runs on the full output after the stream. Clients streaming a tool call therefore saw the unparsed tool-call syntax as assistant content. Buffer the text while a tool parser is active for the request; the existing end-of-stream chat_delta already carries the parsed tool_calls (or the cleaned content), which the Go side converts to SSE deltas. Non-tool-parser streaming is unchanged. Add a server-less regression test covering both the tool-call case (no raw markup leaked as content) and the plain-text case (content delivered exactly once — guards against double-emitting the buffered content). 
Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> * test(vllm): add expectedFailure test for progressive streaming with tool parser (Case 3, #582) Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> * test(vllm): add Cases 4+5 — marker split across chunks + false-positive prefix (TDD, Option B state machine, #582) Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> * feat(vllm): progressive streaming via parser.extract_tool_calls_streaming When a tool parser is active for a tool-enabled streaming request, #10346 buffers the entire generation and surfaces it on the final chunk to prevent raw tool-call markup from leaking as delta.content. This is correct but turns the request into effectively non-streaming for plain-text responses — the client sees nothing until the model stops. Every concrete tool parser shipped with vLLM 0.23+ already implements extract_tool_calls_streaming (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes2Pro, llama3_json, mistral, …). Use it: instantiate the parser before the streaming loop and call its streaming method per delta, emitting DeltaMessage(content=…) or DeltaMessage(tool_calls=[…]) when the parser is ready. Falls back to the existing #10346 buffer path when: - the parser does not have extract_tool_calls_streaming, OR - extract_tool_calls_streaming raises mid-stream (logged, the rest of the request finishes via post-loop extract_tool_calls). Tests (TestStreamingToolParser): 1. Buffer path: no markup leaked, no content duplication 2. Native streaming: plain-text response streams progressively 3. Native streaming: tool_call structured, no markup leaked 4. Native streaming exception → graceful fallback, no markup, no crash 5. No tool parser → unchanged per-delta content stream E2E verified against qwen3_coder on vLLM 0.23.0 (NVIDIA GB10 / arm64 / CUDA 13). 
Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> * docs(vllm): add server-side TTFT benchmark for the streaming tool-parser path Self-contained stdlib-only script that measures time-to-first-token (TTFT) for the vLLM backend's two streaming scenarios: - tool_call: request mentions a tool; model is expected to call it - plain_text: request offers a tool but explicitly asks for prose Use this to compare: - the buffer-all path (#10346) → plain_text TTFT ≈ total response time - the native-streaming path (this PR) → plain_text TTFT ≈ true first-token time python examples/vllm-bench/ttft_streaming_tool_parser.py \\ --url http://localhost:8080 --model my-coder --runs 3 Lives under examples/ so it does not interfere with the test suite. Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> * examples/vllm-bench: add long-text scenario (8 paragraphs, 1500 tokens) The long-text scenario shows the buffering vs streaming difference most dramatically: with the buffer-all path, the client receives nothing for 20+ seconds and then the entire 1500-token response at once. With native streaming, the first token arrives in tens of milliseconds and the response flows progressively. 
Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> --------- Signed-off-by: pos-ei-don <1822533+pos-ei-don@users.noreply.github.com> Co-authored-by: Philipp Wacker --- backend/python/vllm/backend.py | 161 +++++++++-- backend/python/vllm/test.py | 259 +++++++++++++++++- examples/vllm-bench/README.md | 54 ++++ .../vllm-bench/ttft_streaming_tool_parser.py | 175 ++++++++++++ 4 files changed, 631 insertions(+), 18 deletions(-) create mode 100644 examples/vllm-bench/README.md create mode 100755 examples/vllm-bench/ttft_streaming_tool_parser.py diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 20064e233..a38849137 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -598,23 +598,124 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): # Stream the results generated_text = "" + generated_token_ids: list[int] = [] last_output = None + + # Tool-parsing strategy decision (made once, before the loop): + # + # When a tool parser is active, the model's raw tool-call markup + # (e.g. ...) must not be streamed verbatim as delta.content + # — clients would see the unparsed syntax. Two paths: + # + # (A) native streaming via parser.extract_tool_calls_streaming. All + # concrete tool parsers shipped with vLLM 0.23+ implement this + # (Granite4, Qwen3Coder, DeepSeekV31, Jamba, Ernie45, Hermes, + # llama3_json, mistral, …). The parser decides per-delta whether + # to emit content or suppress tool-call markup, and emits a + # structured DeltaMessage(tool_calls=[...]) when a call is ready. + # (B) buffer fallback — used only when the parser surprisingly lacks + # the streaming method or it raises mid-stream. The post-loop + # extract_tool_calls assembles the final chat_delta. Same correctness + # guarantee as a non-streaming response, at the cost of a delayed + # final chunk. 
+ has_tool_parser = bool(self.tool_parser_cls and request.Tools) + tp_instance = None + tp_request = None + native_streaming = False + native_streaming_error = False + if has_tool_parser: + try: + tools_for_parser = json.loads(request.Tools) + except json.JSONDecodeError: + tools_for_parser = [] + try: + tp_instance = self.tool_parser_cls(self.tokenizer, tools=tools_for_parser) + except TypeError: + tp_instance = self.tool_parser_cls(self.tokenizer) + # Build a minimal ChatCompletionRequest so the streaming method + # sees the tools list. We do not need any other request fields — + # parsers only read .tools (and sometimes .tool_choice, which we + # leave at default). + try: + from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest as _CCR, + ) + tp_request = _CCR( + model="local", + messages=[{"role": "user", "content": ""}], + tools=tools_for_parser or None, + ) + except Exception as e: + print(f"Could not build ChatCompletionRequest for streaming parser: {e}", + file=sys.stderr) + tp_request = None + native_streaming = ( + tp_request is not None + and hasattr(tp_instance, "extract_tool_calls_streaming") + ) + try: async for request_output in outputs: iteration_text = request_output.outputs[0].text last_output = request_output if streaming: - # Remove text already sent as vllm concatenates the text from previous yields delta_iteration_text = iteration_text.removeprefix(generated_text) - # Send the partial result - yield backend_pb2.Reply( - message=bytes(delta_iteration_text, encoding='utf-8'), - chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)], - ) + new_token_ids = list(request_output.outputs[0].token_ids) + delta_token_ids = new_token_ids[len(generated_token_ids):] - # Keep track of text generated + if not has_tool_parser: + # Plain streaming — unchanged from pre-tool-parser path. 
+ yield backend_pb2.Reply( + message=bytes(delta_iteration_text, encoding='utf-8'), + chat_deltas=[backend_pb2.ChatDelta(content=delta_iteration_text)], + ) + elif native_streaming and not native_streaming_error: + # (A) Native vLLM extract_tool_calls_streaming. + try: + msg = tp_instance.extract_tool_calls_streaming( + previous_text=generated_text, + current_text=iteration_text, + delta_text=delta_iteration_text, + previous_token_ids=generated_token_ids, + current_token_ids=new_token_ids, + delta_token_ids=delta_token_ids, + request=tp_request, + ) + except Exception as e: + print(f"Streaming tool parser error (falling back to " + f"buffer for the rest of the stream): {e}", + file=sys.stderr) + native_streaming_error = True + msg = None + if msg is not None: + tc_protos = [] + for tc in (msg.tool_calls or []): + fn = tc.function or None + tc_protos.append(backend_pb2.ToolCallDelta( + index=tc.index, + id=tc.id or "", + name=(fn.name if fn and fn.name else "") or "", + arguments=(fn.arguments if fn and fn.arguments else "") or "", + )) + cd_kwargs = {} + if msg.content: + cd_kwargs["content"] = msg.content + if msg.reasoning: + cd_kwargs["reasoning_content"] = msg.reasoning + if tc_protos: + cd_kwargs["tool_calls"] = tc_protos + if cd_kwargs: + yield backend_pb2.Reply( + message=bytes(msg.content or "", encoding='utf-8'), + chat_deltas=[backend_pb2.ChatDelta(**cd_kwargs)], + ) + # (B) buffer fallback — emit nothing during the stream. + # The post-loop extract_tool_calls block builds the final chunk. 
+ + # Keep track of text + token_ids generated generated_text = iteration_text + generated_token_ids = list(request_output.outputs[0].token_ids) finally: await outputs.aclose() @@ -639,16 +740,19 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): except Exception as e: print(f"Reasoning parser error: {e}", file=sys.stderr) - if self.tool_parser_cls and request.Tools: + # When (A) native streaming ran cleanly, per-delta yields above already + # delivered everything — do NOT extract again on the full text or we'd + # duplicate content/tool_calls into the final chunk. + if has_tool_parser and not (native_streaming and not native_streaming_error): try: - tools = json.loads(request.Tools) - # Some concrete parsers only accept the tokenizer; only the - # abstract base declares the tools kwarg. Try with tools first, - # fall back to tokenizer-only. - try: - tp = self.tool_parser_cls(self.tokenizer, tools=tools) - except TypeError: - tp = self.tool_parser_cls(self.tokenizer) + tp = tp_instance + if tp is None: + # Defensive: tp_instance build failed earlier; reconstruct. + tools = json.loads(request.Tools) + try: + tp = self.tool_parser_cls(self.tokenizer, tools=tools) + except TypeError: + tp = self.tool_parser_cls(self.tokenizer) info = tp.extract_tool_calls(content, request=None) if info.tools_called: content = info.content or "" @@ -661,6 +765,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): )) except Exception as e: print(f"Tool parser error: {e}", file=sys.stderr) + elif native_streaming and not native_streaming_error: + # Per-delta path already emitted content + tool_calls; the final + # chat_delta should carry only metadata (token counts, logprobs). + content = "" # Extract token counts prompt_tokens = 0 @@ -700,7 +808,26 @@ class BackendServicer(backend_pb2_grpc.BackendServicer): ) if streaming: - # Final chunk with structured data + # Final chunk with structured data. 
+ # + # If we used the buffer fallback (has_tool_parser=True AND native + # streaming did NOT run cleanly) and the parser found no tool call, + # flush the buffered content as ONE content delta — and clear the + # final chat_delta's content so the metadata chunk does not repeat + # what we just sent. This is the plain-text-with-tool-parser path. + buffered_fallback = ( + has_tool_parser + and not (native_streaming and not native_streaming_error) + ) + if buffered_fallback and not tool_calls_proto and content: + yield backend_pb2.Reply( + message=bytes(content, encoding='utf-8'), + chat_deltas=[backend_pb2.ChatDelta(content=content)], + ) + chat_delta = backend_pb2.ChatDelta( + reasoning_content=reasoning_content, + tool_calls=tool_calls_proto, + ) yield backend_pb2.Reply( message=b"", prompt_tokens=prompt_tokens, diff --git a/backend/python/vllm/test.py b/backend/python/vllm/test.py index 25a7f54e6..d00595f01 100644 --- a/backend/python/vllm/test.py +++ b/backend/python/vllm/test.py @@ -278,4 +278,261 @@ class TestBackendServicer(unittest.TestCase): print(err) self.fail("Embedding service failed") finally: - self.tearDown() \ No newline at end of file + self.tearDown() + + +class TestStreamingToolParser(unittest.TestCase): + """ + Server-less unit tests for the streaming + tool-parser machinery in + BackendServicer._predict. These tests instantiate BackendServicer + directly and mock the vLLM engine + tool parser, so they do not need + a GPU, a model, or a running gRPC server. Kept in a separate class to + avoid the parent setUp() which spawns a subprocess. + + Covers #582 (follow-up to #10346): + 1. Markup-leak prevention with a non-streaming parser (buffer fallback) + 2. No content duplication on the plain-text path with the buffer fallback + 3. Native streaming progressive plain-text emission + 4. Native streaming structured tool_call, no markup leak + 5. Parser exception → graceful fallback to buffer, still no markup + 6. 
No-tool-parser regression: unchanged per-delta content stream + """ + + @staticmethod + def _make_generate(chunks): + """Build a fake vLLM engine.generate that yields cumulative chunks.""" + from types import SimpleNamespace + async def gen(*a, **k): + for i, t in enumerate(chunks): + yield SimpleNamespace( + outputs=[SimpleNamespace( + text=t, + token_ids=list(range(i + 1)), + logprobs=None, + )], + prompt_token_ids=[0], + ) + return lambda *a, **k: gen() + + @staticmethod + def _collect(servicer, req): + import asyncio + async def run(): + return [r async for r in servicer._predict(req, None, streaming=True)] + return asyncio.run(run()) + + def _new_servicer(self): + import sys, os + sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + from backend import BackendServicer + s = BackendServicer() + s.reasoning_parser_cls = None + s.tool_parser_cls = None + s.tokenizer = None + return s + + # ── Case 1+2: parser without streaming method → buffer fallback ── + def test_buffer_path_no_markup_no_duplication(self): + from types import SimpleNamespace + + def parser_cls(called, content_text, calls): + class _P: + def __init__(self, tokenizer, tools=None): + pass + # NOTE: NO extract_tool_calls_streaming → takes the buffer path + def extract_tool_calls(self, c, request=None): + return SimpleNamespace( + tools_called=called, content=content_text, tool_calls=calls, + ) + return _P + + tools_json = '[{"type":"function","function":{"name":"calc","parameters":{}}}]' + + # Tool-call case: no raw markup in any delta.content + s = self._new_servicer() + s.llm = SimpleNamespace(generate=self._make_generate([ + '\n{"name": "calc"', + '\n{"name": "calc", "arguments": {"x": 1}}\n', + ])) + call = SimpleNamespace(id="call_1", + function=SimpleNamespace(name="calc", arguments='{"x": 1}')) + s.tool_parser_cls = parser_cls(True, "", [call]) + req = backend_pb2.PredictOptions(Prompt="x", Tools=tools_json) + replies = self._collect(s, req) + contents = [cd.content for r in 
replies for cd in r.chat_deltas if cd.content] + self.assertFalse( + any(" 0, + "Plain-text response not streamed progressively (native streaming inactive?)", + ) + assembled = "".join( + cd.content for r in replies for cd in r.chat_deltas if cd.content + ) + self.assertEqual( + assembled, "Paris is the capital of France.", + f"Assembled content wrong: {assembled!r}", + ) + + # ── Case 4: native streaming, structured tool_call, no markup ── + def test_native_streaming_tool_call_no_markup_leak(self): + from types import SimpleNamespace + + class _DeltaMsg: + def __init__(self, content=None, reasoning=None, tool_calls=None): + self.content = content + self.reasoning = reasoning + self.tool_calls = tool_calls or [] + + class _ToolCallStreamer: + def __init__(self, tokenizer, tools=None): + self._emitted = False + def extract_tool_calls(self, c, request=None): + raise AssertionError("extract_tool_calls invoked on native-streaming path") + def extract_tool_calls_streaming( + self, previous_text, current_text, delta_text, + previous_token_ids, current_token_ids, delta_token_ids, request, + ): + if "" in current_text and not self._emitted: + self._emitted = True + fn = SimpleNamespace(name="calc", arguments='{"x": 1}') + tc = SimpleNamespace(id="call_1", type="function", index=0, function=fn) + return _DeltaMsg(tool_calls=[tc]) + return None + + s = self._new_servicer() + s.llm = SimpleNamespace(generate=self._make_generate([ + '\n', + '\n{"name": "calc"', + '\n{"name": "calc", "arguments": {"x": 1}}\n', + ])) + s.tool_parser_cls = _ToolCallStreamer + req = backend_pb2.PredictOptions( + Prompt="x", + Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]', + ) + replies = self._collect(s, req) + + contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content] + self.assertFalse( + any("" in c for c in contents), + f"markup leaked as content: {contents!r}", + ) + names = [tc.name for r in replies for cd in r.chat_deltas for tc in 
cd.tool_calls if tc.name] + args = [tc.arguments for r in replies for cd in r.chat_deltas for tc in cd.tool_calls if tc.arguments] + self.assertIn("calc", names, f"tool_call name missing; got {names!r}") + self.assertIn('{"x": 1}', args, f"tool_call args missing; got {args!r}") + + # ── Case 5: parser exception → fallback to buffer, no leak ── + def test_native_streaming_parser_exception_falls_back_to_buffer(self): + from types import SimpleNamespace + call = SimpleNamespace(id="call_1", + function=SimpleNamespace(name="calc", arguments='{"x": 1}')) + + class _BrokenStreamer: + def __init__(self, tokenizer, tools=None): + pass + def extract_tool_calls(self, c, request=None): + return SimpleNamespace(tools_called=True, content="", tool_calls=[call]) + def extract_tool_calls_streaming(self, *a, **kw): + raise RuntimeError("simulated parser bug") + + s = self._new_servicer() + s.llm = SimpleNamespace(generate=self._make_generate([ + '\n{"name": "calc"', + '\n{"name": "calc", "arguments": {"x": 1}}\n', + ])) + s.tool_parser_cls = _BrokenStreamer + req = backend_pb2.PredictOptions( + Prompt="x", + Tools='[{"type":"function","function":{"name":"calc","parameters":{}}}]', + ) + replies = self._collect(s, req) + + contents = [cd.content for r in replies for cd in r.chat_deltas if cd.content] + self.assertFalse( + any("...`) from leaking as `delta.content`. That was correct +for tool-call responses, but it turned plain-text responses into effectively +non-streaming — the client received nothing until the model finished. + +With native parser-side streaming (`parser.extract_tool_calls_streaming`, +implemented by every concrete vLLM 0.23+ tool parser), each delta can be +classified per-token: emit as content, emit as a structured tool_call, or +suppress. + +## Three scenarios + +| Scenario | Request | Expected outcome | +|---|---|---| +| `tool_call` | "What is the weather in Paris? Please use the tool." | Model calls `get_weather`. `delta.tool_calls` chunks; no content leak. 
| +| `plain_text_short` | "Explain in 3 short sentences what a hash table is. Do NOT call any tool." | Model writes ~3 sentences. | +| `plain_text_long` | "Write a thorough 8-paragraph explanation of how Python's GIL works…" | Model writes ~1500 tokens of prose. | + +The **long scenario** is where the streaming/buffering difference is most +dramatic: with the buffer-all path, the client sees nothing for 20+ seconds +and then everything at once; with native streaming, the first token arrives +in <100ms and the response flows progressively. + +## What the script reports + +For each scenario, across N runs: + +- `ttf_content_s` — time until the first `delta.content` chunk +- `ttf_tool_s` — time until the first `delta.tool_calls` chunk +- `n_content_chunks` — total content deltas (1 = bundled, >>1 = streamed) +- `n_tool_chunks` — total tool_call deltas +- `total_s` — total wall-clock until `[DONE]` +- `finish_reason` — `tool_calls` / `stop` / `length` + +The big tell is **`n_content_chunks` vs `total_s` ratio**: +- Buffer-all: `n_content_chunks` ≈ 1, `ttf_content_s` ≈ `total_s` (one chunk at end) +- Streaming: `n_content_chunks` ≈ token count, `ttf_content_s` ≈ first-token latency + +## Usage + +```bash +python ttft_streaming_tool_parser.py --url http://localhost:8080 --model my-coder --runs 3 +``` + +JSON results are written to `ttft_bench_