Merge branch 'master' into worktree-feat+paged-attention (llama.cpp pin -> 9d5d882d)

Sync to master (12 commits) + the llama.cpp pin bump 8be759e6 -> 9d5d882d.

Conflicts resolved:
- Makefile .NOTPARALLEL: union (keep both backends/llama-cpp-localai-paged and master's backends/privacy-filter-darwin).
- gallery/index.yaml: our 2 base NVFP4 entries (qwen3.6-27b-nvfp4, qwen3.6-35b-a3b-nvfp4) for the paged backend prepended to master's full list; master keeps its own *-nvfp4-mtp variants (distinct entries).

Go build + YAML validated; the 8 duplicate gallery names are pre-existing in master, not introduced here. The patchset still needs re-verification against the new tip (pin-sync, next step).

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-26 17:37:07 -04:00 · 2026-06-26 13:16:13 +00:00
parent 167768cac3 5b3572f8b8
commit 30a2b590d9
55 changed files with 711 additions and 246 deletions
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=8be759e6f70d629638a7eb70db3824cbdcea370b
+LLAMA_VERSION?=9d5d882d8cd0f0a9283d87ed5e6fe3ee0d925fb1
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 # LLAMA_PAGED controls whether the vendored paged-attention patch series
 # (patches/paged/) is applied on top of the pinned llama.cpp. Default on; set
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -2,7 +2,7 @@
 set -ex

 # Get the absolute current dir where the script is located
-CURDIR=$(dirname "$(realpath $0)")
+CURDIR=$(dirname "$(realpath "$0")")

 cd /

@@ -16,37 +16,37 @@ BINARY=llama-cpp-fallback
 # CPU_ALL_VARIANTS: ggml's backend registry dlopens the best libggml-cpu-*.so for this
 # host, so no shell-side AVX probing. GPU images (cublas/sycl/vulkan/hipblas) ship only
 # llama-cpp-fallback (the accelerator does the compute), so fall back to it when absent.
-if [ -e $CURDIR/llama-cpp-cpu-all ]; then
+if [ -e "$CURDIR"/llama-cpp-cpu-all ]; then
 	BINARY=llama-cpp-cpu-all
 fi

 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
-	if [ -e $CURDIR/llama-cpp-grpc ]; then
+	if [ -e "$CURDIR"/llama-cpp-grpc ]; then
 		BINARY=llama-cpp-grpc
 	fi
 fi
 
 # Extend ld library path with the dir where this script is located/lib
 if [ "$(uname)" == "Darwin" ]; then
-	export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
-	#export DYLD_FALLBACK_LIBRARY_PATH=$CURDIR/lib:$DYLD_FALLBACK_LIBRARY_PATH
+	export DYLD_LIBRARY_PATH="$CURDIR"/lib:$DYLD_LIBRARY_PATH
+	#export DYLD_FALLBACK_LIBRARY_PATH="$CURDIR"/lib:$DYLD_FALLBACK_LIBRARY_PATH
 else
-	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+	export LD_LIBRARY_PATH="$CURDIR"/lib:$LD_LIBRARY_PATH
 	# Tell rocBLAS where to find TensileLibrary data (GPU kernel tuning files)
 	if [ -d "$CURDIR/lib/rocblas/library" ]; then
-		export ROCBLAS_TENSILE_LIBPATH=$CURDIR/lib/rocblas/library
+		export ROCBLAS_TENSILE_LIBPATH="$CURDIR"/lib/rocblas/library
 	fi
 fi

 # If there is a lib/ld.so, use it
-if [ -f $CURDIR/lib/ld.so ]; then
+if [ -f "$CURDIR"/lib/ld.so ]; then
 	echo "Using lib/ld.so"
 	echo "Using binary: $BINARY"
-	exec $CURDIR/lib/ld.so $CURDIR/$BINARY "$@"
+	exec "$CURDIR"/lib/ld.so "$CURDIR"/$BINARY "$@"
 fi

 echo "Using binary: $BINARY"
-exec $CURDIR/$BINARY "$@"
+exec "$CURDIR"/$BINARY "$@"

 # We should never reach this point, however just in case we do, run fallback
-exec $CURDIR/llama-cpp-fallback "$@"
+exec "$CURDIR"/llama-cpp-fallback "$@"