diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index bcbc4aebd..40c5855dd 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -5073,7 +5073,7 @@ includeDarwin: lang: "go" # llama-cpp-localai-paged on Darwin: same bespoke CPU_ALL_VARIANTS + Metal build # as stock llama-cpp (driven by make backends/llama-cpp-localai-paged-darwin), - # reusing backend/cpp/llama-cpp sources with LLAMA_PAGED=on. lang=go selects the + # reusing backend/cpp/llama-cpp sources, with the paged patch series applied by the wrapper. lang=go selects the # runner/toolchain only; the source path is C++. Metal delivers paged-KV (the # NVFP4 FP4-MMA fast path is CUDA/Blackwell-only) and the GDN/conv fused ops have # no Metal kernel, so a gated-DeltaNet (qwen35) model falls back to the CPU diff --git a/.github/scripts/paged-canary-apply.sh b/.github/scripts/paged-canary-apply.sh index 548e29249..1de59a9df 100755 --- a/.github/scripts/paged-canary-apply.sh +++ b/.github/scripts/paged-canary-apply.sh @@ -1,14 +1,14 @@ #!/usr/bin/env bash # # paged-canary-apply.sh - apply the vendored paged-attention patch series -# (backend/cpp/llama-cpp/patches/paged/0001-0030) to a llama.cpp checkout, the +# (backend/cpp/llama-cpp-localai-paged/patches/paged/0001-0030) to a llama.cpp checkout, the # same way the build does, but tolerating the ONE known-benign pre-existing # quirk in the series. Used by the early-warning canary # (.github/workflows/llama-cpp-paged-canary.yml) so it only goes red on a REAL # upstream break, never on that quirk. # # Usage: paged-canary-apply.sh -# is normally backend/cpp/llama-cpp/patches (it holds the +# is normally backend/cpp/llama-cpp-localai-paged/patches (it holds the # top-level base series 0*.patch, currently empty, and the paged/ subseries). # # Exit 0 = the whole series applied -> patches still fit upstream. @@ -27,7 +27,7 @@ # missing-file hunk rejects the whole patch - and because 0021/0022/0026/0028 # build on 0019's code, the rejection cascades to them too. This is a # PRE-EXISTING shipped-series defect, present identically on every pin, NOT an -# upstream break (see backend/cpp/llama-cpp/patches/paged/PIN_SYNC_c299a92c.md +# upstream break (see backend/cpp/llama-cpp-localai-paged/patches/paged/PIN_SYNC_c299a92c.md # and README.md). We exclude ONLY that dev-doc path and still # apply 0019's real code hunks atomically, so a genuine code-hunk break in 0019 # still fails the canary. prepare.sh tolerates the same hunk via @@ -53,7 +53,7 @@ apply_one() { echo "paged-canary: applying $(basename "$p")" if ! git apply --verbose "$@" "$p"; then echo "::error::paged patch no longer applies to the upstream llama.cpp tip: $(basename "$p")" - echo "::error::upstream drifted past the vendored paged series - run a PIN_SYNC (backend/cpp/llama-cpp/patches/paged/PIN_SYNC_c299a92c.md), do NOT bump the pin blindly" + echo "::error::upstream drifted past the vendored paged series - run a PIN_SYNC (backend/cpp/llama-cpp-localai-paged/patches/paged/PIN_SYNC_c299a92c.md), do NOT bump the pin blindly" exit 1 fi } diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index f3df0750d..3ba1053cf 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -11,7 +11,7 @@ jobs: matrix: # NOTE: there is intentionally NO entry for the llama-cpp-localai-paged # backend. It carries a vendored paged-attention patch series - # (backend/cpp/llama-cpp/patches/paged/) hand-verified bit-exact against + # (backend/cpp/llama-cpp-localai-paged/patches/paged/) hand-verified bit-exact against # ONE specific llama.cpp tip; a naive nightly bump would move the tip out # from under the patches and break `git apply` at build time. Its pin is # therefore decoupled (its own LLAMA_VERSION in diff --git a/.github/workflows/llama-cpp-paged-canary.yml b/.github/workflows/llama-cpp-paged-canary.yml index 4bae7a4b7..46ed6940e 100644 --- a/.github/workflows/llama-cpp-paged-canary.yml +++ b/.github/workflows/llama-cpp-paged-canary.yml @@ -1,7 +1,7 @@ name: 'llama.cpp paged patches: upstream canary' # EARLY-WARNING CANARY for the vendored paged-attention patch series -# (backend/cpp/llama-cpp/patches/paged/0001-0030). +# (backend/cpp/llama-cpp-localai-paged/patches/paged/0001-0030). # # WHY THIS EXISTS # The paged backend (backend/cpp/llama-cpp-localai-paged) pins its OWN verified @@ -17,7 +17,7 @@ name: 'llama.cpp paged patches: upstream canary' # RED HERE means: time to run a PIN_SYNC (rebase the patches onto the new tip, # pass the bit-exact gate on the GPU, re-export the .patch files, THEN advance # the pin in backend/cpp/llama-cpp-localai-paged/Makefile). See -# backend/cpp/llama-cpp/patches/paged/PIN_SYNC_c299a92c.md. +# backend/cpp/llama-cpp-localai-paged/patches/paged/PIN_SYNC_c299a92c.md. # # SIGNAL-ONLY: this workflow moves no pinned version, ships nothing, and is fully # decoupled from bump_deps - so the main dep-bump PR stays green regardless. A @@ -91,7 +91,7 @@ jobs: run: | bash .github/scripts/paged-canary-apply.sh \ /tmp/llama.cpp \ - "$PWD/backend/cpp/llama-cpp/patches" + "$PWD/backend/cpp/llama-cpp-localai-paged/patches" echo "- apply: full paged series applies to the upstream tip :white_check_mark:" >> "$GITHUB_STEP_SUMMARY" compile: @@ -141,12 +141,16 @@ jobs: cp -a /opt/grpc/. /usr/local/ # Pre-populate the llama.cpp checkout at the latest tip with the - # paged series applied via the tolerant canary apply (so the benign - # 0019 dev-doc hunk does not abort the build). Because - # backend/cpp/llama-cpp/llama.cpp now exists, the Makefile - # llama.cpp target (strict clone + git apply) is skipped and - # prepare.sh sees the paged sentinel and skips re-applying - so we - # drive the REAL grpc-server build path on top of our apply. + # paged series applied via the tolerant canary apply. Because + # backend/cpp/llama-cpp/llama.cpp now exists, the stock Makefile's + # llama.cpp target (clone + base-patch apply) is skipped and the + # now patch-free prepare.sh only copies the grpc-server sources - + # so we drive the REAL grpc-server build path on top of our paged + # apply. The stock llama-cpp backend no longer carries the paged + # series (it lives in backend/cpp/llama-cpp-localai-paged/patches/ + # paged); we build it here in the stock dir only because that is + # where the shared build infra (Makefile / grpc-server.cpp / + # CMakeLists.txt / prepare.sh) lives. cd backend/cpp/llama-cpp/ mkdir -p llama.cpp cd llama.cpp @@ -157,15 +161,16 @@ jobs: cd /LocalAI bash .github/scripts/paged-canary-apply.sh \ backend/cpp/llama-cpp/llama.cpp \ - "$PWD/backend/cpp/llama-cpp/patches" + "$PWD/backend/cpp/llama-cpp-localai-paged/patches" # Cheapest real CUDA build that proves the patches compile: one - # CUDA arch, cublas, paged on. CMAKE_ARGS is passed via the - # environment (not as a make arg) so the Makefile += flags are - # still appended, exactly like .docker/llama-cpp-localai-paged-compile.sh. + # CUDA arch, cublas. CMAKE_ARGS is passed via the environment (not + # as a make arg) so the Makefile += flags are still appended, + # exactly like .docker/llama-cpp-localai-paged-compile.sh. The paged + # series is already applied to the checkout above, so the stock + # build just compiles the patched tree. cd backend/cpp/llama-cpp/ BUILD_TYPE=cublas \ - LLAMA_PAGED=on \ CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=80" \ make grpc-server test -x grpc-server diff --git a/Makefile b/Makefile index 9f01273ed..e6a073bf1 100644 --- a/Makefile +++ b/Makefile @@ -1142,8 +1142,8 @@ backends/llama-cpp-darwin: build ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" # llama-cpp-localai-paged on Darwin: same bespoke CPU_ALL_VARIANTS + Metal build as -# stock llama-cpp (otool dylib bundling), driven through the paged wrapper Makefile -# with LLAMA_PAGED=on. Mirrors backends/llama-cpp-darwin. +# stock llama-cpp (otool dylib bundling), driven through the paged wrapper Makefile, +# which applies its own vendored paged patch series. Mirrors backends/llama-cpp-darwin. backends/llama-cpp-localai-paged-darwin: build bash ./scripts/build/llama-cpp-localai-paged-darwin.sh ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp-localai-paged.tar)" @@ -1198,7 +1198,7 @@ BACKEND_IK_LLAMA_CPP = ik-llama-cpp|ik-llama-cpp|.|false|false # Reuses backend/cpp/llama-cpp grpc-server sources via a thin wrapper Makefile. BACKEND_TURBOQUANT = turboquant|turboquant|.|false|false # llama-cpp-localai-paged = stock llama.cpp grpc-server + the LocalAI paged-attention -# patch series (LLAMA_PAGED=on). Reuses backend/cpp/llama-cpp sources via a thin +# patch series (vendored in this wrapper backend). Reuses backend/cpp/llama-cpp sources via a thin # wrapper Makefile (same upstream pin as stock llama-cpp; no fork, no patch-grpc-server). BACKEND_LLAMA_CPP_LOCALAI_PAGED = llama-cpp-localai-paged|llama-cpp-localai-paged|.|false|false # ds4 is antirez/ds4, a DeepSeek V4 Flash-specific inference engine. diff --git a/backend/cpp/llama-cpp-localai-paged/Makefile b/backend/cpp/llama-cpp-localai-paged/Makefile index fd4d0d1f5..13921220c 100644 --- a/backend/cpp/llama-cpp-localai-paged/Makefile +++ b/backend/cpp/llama-cpp-localai-paged/Makefile @@ -1,33 +1,36 @@ # llama-cpp-localai-paged is LocalAI's paged-attention llama.cpp variant. It # builds upstream llama.cpp with the LocalAI paged-attention patch series -# (backend/cpp/llama-cpp/patches/paged/) applied on top (LLAMA_PAGED=on). It -# reuses backend/cpp/llama-cpp's grpc-server.cpp / CMakeLists.txt / prepare.sh -# sources verbatim via a thin wrapper. +# (patches/paged/, vendored in THIS backend) applied on top. It reuses +# backend/cpp/llama-cpp's grpc-server.cpp / CMakeLists.txt / prepare.sh / Makefile +# sources verbatim via a thin wrapper - the stock llama-cpp backend is pure +# upstream and carries NONE of the paged patches; this backend OWNS them. # # Pin handling (mirrors the turboquant wrapper, the precedent this is modelled # on): the paged patch series is hand-verified bit-exact against ONE specific # llama.cpp tip and re-exported by the manual PIN_SYNC process -# (backend/cpp/llama-cpp/patches/paged/PIN_SYNC_*.md). A naive pin bump would -# move the tip out from under the patches and break `git apply` at build time, -# so this backend OWNS its pin (LLAMA_VERSION below) instead of inheriting the -# auto-bumped stock pin from backend/cpp/llama-cpp/Makefile. The override is -# forced into every copied build via `LLAMA_VERSION=$(LLAMA_VERSION)`. There is -# deliberately NO bump_deps.yaml entry for it: it is advanced ONLY by PIN_SYNC, -# never nightly. (turboquant CAN auto-bump because its fork branch carries the -# patches; the paged series is vendored as .patch files here, so it cannot.) +# (patches/paged/PIN_SYNC_*.md). A naive pin bump would move the tip out from +# under the patches and break `git apply` at build time, so this backend OWNS +# its pin (LLAMA_VERSION below) instead of inheriting the auto-bumped stock pin +# from backend/cpp/llama-cpp/Makefile. The override is forced into every copied +# build via `LLAMA_VERSION=$(LLAMA_VERSION)`. There is deliberately NO +# bump_deps.yaml entry for it: it is advanced ONLY by PIN_SYNC, never nightly. +# (turboquant CAN auto-bump because its fork branch carries the patches; the +# paged series is vendored as .patch files here, so it cannot.) # -# - NO patch-grpc-server.sh and NO apply-patches.sh: the shared -# grpc-server.cpp already carries the (runtime-gated) paged option hooks, -# and the paged patch series is applied by the copied llama-cpp Makefile's -# own `llama.cpp` target whenever LLAMA_PAGED=on (which we force below). +# - NO patch-grpc-server.sh and NO apply-patches.sh: the shared grpc-server.cpp +# already carries the (runtime-gated) paged option hooks, and the paged patch +# series (patches/paged/) is applied by THIS Makefile's own apply step onto +# the freshly cloned tree, using the same strict `git apply` method the stock +# build uses for base patches. The stock llama-cpp Makefile applies only its +# own (currently empty) base patches/ series, never the paged one. # Manually pin-synced llama.cpp tip the paged patch series is verified against. # Decoupled from the auto-bumped stock pin in backend/cpp/llama-cpp/Makefile so # the nightly llama.cpp bump cannot silently break the vendored paged patches. # Advance ONLY via the PIN_SYNC process (rebase patches + bit-exact gate + # re-export), then update this value. See: -# backend/cpp/llama-cpp/patches/paged/PIN_SYNC_*.md +# backend/cpp/llama-cpp-localai-paged/patches/paged/PIN_SYNC_*.md # # This pin = the manual, verified sync. The signal telling you WHEN to do the # next sync is the early-warning canary @@ -47,28 +50,49 @@ ARCH?=$(shell uname -m) CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) LLAMA_CPP_DIR := $(CURRENT_MAKEFILE_DIR)/../llama-cpp +# OUR vendored paged-attention patch series. Owned by this backend; the stock +# llama-cpp backend no longer carries it. Applied onto each freshly cloned +# llama.cpp tree by apply-paged-patches below (strict git apply). +PAGED_PATCHES_DIR := $(CURRENT_MAKEFILE_DIR)/patches/paged GREEN := \033[0;32m RESET := \033[0m +# Apply OUR vendored paged-attention patch series (patches/paged/0*.patch) onto a +# freshly cloned llama.cpp tree ($(1)) using the SAME strict git-apply method the +# stock build uses for its base patches (backend/cpp/llama-cpp/Makefile `llama.cpp` +# target). Strict: any patch that no longer applies aborts the build (exit 1) - +# that is the signal to run a PIN_SYNC, never to bump the pin blindly. The series +# is owned by THIS backend, not by the now-pure stock llama-cpp backend. +define apply-paged-patches + cd $(1) && \ + for p in $(PAGED_PATCHES_DIR)/0*.patch; do \ + [ -e "$$p" ] || continue; \ + echo "applying llama.cpp PAGED patch: $$p"; \ + git apply --verbose "$$p" || { echo "paged patch failed: $$p"; exit 1; }; \ + done +endef + # Each flavor target: # 1. copies backend/cpp/llama-cpp/ (grpc-server.cpp + prepare.sh + # CMakeLists.txt + Makefile) into a sibling # llama-cpp-localai-paged--build directory; -# 2. clones the SAME upstream llama.cpp pin into that copy and applies the -# base AND paged patch series via the copy's own `llama.cpp` target with -# LLAMA_PAGED=on; -# 3. runs the copy's `grpc-server` target (LLAMA_PAGED=on) and copies the -# produced binary up as llama-cpp-localai-paged-. -# We patch only the *copy*, never the original under backend/cpp/llama-cpp/, so -# the stock llama-cpp build stays untouched. +# 2. clones OUR pinned upstream llama.cpp into that copy via the copy's own +# `llama.cpp` target (which applies the stock base patches/ series, normally +# empty), then applies THIS backend's paged patch series (patches/paged/) +# onto the cloned tree with strict `git apply` (apply-paged-patches); +# 3. runs the copy's `grpc-server` target and copies the produced binary up as +# llama-cpp-localai-paged-. +# We clone+patch only the *copy*, never the original under backend/cpp/llama-cpp/, +# so the stock llama-cpp build stays untouched and patch-free. define paged-build rm -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build purge $(info $(GREEN)I llama-cpp-localai-paged build info:$(1)$(RESET)) - LLAMA_VERSION=$(LLAMA_VERSION) LLAMA_PAGED=on $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build llama.cpp - CMAKE_ARGS="$(CMAKE_ARGS) $(2)" TARGET="$(3)" LLAMA_VERSION=$(LLAMA_VERSION) LLAMA_PAGED=on \ + LLAMA_VERSION=$(LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build llama.cpp + $(call apply-paged-patches,$(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build/llama.cpp) + CMAKE_ARGS="$(CMAKE_ARGS) $(2)" TARGET="$(3)" LLAMA_VERSION=$(LLAMA_VERSION) \ $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build grpc-server cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-$(1)-build/grpc-server llama-cpp-localai-paged-$(1) endef @@ -97,8 +121,9 @@ llama-cpp-localai-paged-cpu-all: cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build purge $(info $(GREEN)I llama-cpp-localai-paged build info:cpu-all-variants$(RESET)) - LLAMA_VERSION=$(LLAMA_VERSION) LLAMA_PAGED=on $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build llama.cpp - SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" LLAMA_VERSION=$(LLAMA_VERSION) LLAMA_PAGED=on \ + LLAMA_VERSION=$(LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build llama.cpp + $(call apply-paged-patches,$(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build/llama.cpp) + SHARED_LIBS=ON EXTRA_CMAKE_ARGS="-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON" TARGET="--target grpc-server --target ggml" LLAMA_VERSION=$(LLAMA_VERSION) \ $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build grpc-server cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-localai-paged-cpu-all-build/grpc-server llama-cpp-localai-paged-cpu-all rm -rf ggml-shared-libs && mkdir -p ggml-shared-libs diff --git a/backend/cpp/llama-cpp/patches/BENCHMARKS.md b/backend/cpp/llama-cpp-localai-paged/patches/BENCHMARKS.md similarity index 100% rename from backend/cpp/llama-cpp/patches/BENCHMARKS.md rename to backend/cpp/llama-cpp-localai-paged/patches/BENCHMARKS.md diff --git a/backend/cpp/llama-cpp/patches/README.md b/backend/cpp/llama-cpp-localai-paged/patches/README.md similarity index 84% rename from backend/cpp/llama-cpp/patches/README.md rename to backend/cpp/llama-cpp-localai-paged/patches/README.md index 3748e9dd2..fa777ee44 100644 --- a/backend/cpp/llama-cpp/patches/README.md +++ b/backend/cpp/llama-cpp-localai-paged/patches/README.md @@ -16,7 +16,7 @@ patch needs fixing, and the failure points at exactly which step the upstream ch | # | Patch | What | Verifies | |---|-------|------|----------| -| 0001 | `0001-vendor-paged-kv-manager.patch` | Add `src/paged-kv-manager.{h,cpp}` (vLLM-parity block manager, CPU foundation) + CMake; no behavior change | builds; unit-tested separately under `../paged/` | +| 0001 | `0001-vendor-paged-kv-manager.patch` | Add `src/paged-kv-manager.{h,cpp}` (vLLM-parity block manager, CPU foundation) + CMake; no behavior change | builds; unit-tested separately | | 0002 | `0002-paged-kv-storage.patch` | Shared block-pool KV tensor + `set_rows`-by-slot writes, behind `LLAMA_KV_PAGED` | builds; write/gather round-trip | | 0003 | `0003-paged-gather-read.patch` | `build_attn_paged` gather-read in `llama-graph.cpp` | **Gate 0**: token-identical greedy gen, single + multi-seq | | 0004 | `0004-paged-ondemand-alloc.patch` | On-demand block allocation via PagedKVManager | max concurrent seqs before OOM | @@ -35,21 +35,25 @@ git checkout git checkout -b paged # 2. apply the current series (each becomes a commit), or develop the next patch -git am /path/to/backend/cpp/llama-cpp/patches/00*.patch # or `git apply` + commit per patch +git am /path/to/backend/cpp/llama-cpp-localai-paged/patches/paged/00*.patch # or `git apply` + commit per patch # 3. iterate a phase as ONE commit, then export the whole series 1:1 -git format-patch ..paged -o /path/to/backend/cpp/llama-cpp/patches/ --zero-commit -N +git format-patch ..paged -o /path/to/backend/cpp/llama-cpp-localai-paged/patches/paged/ --zero-commit -N # 4. on a pin bump: rebase `paged` onto the new pin; only conflicting patches need edits; re-export. ``` ## Build integration -`../Makefile`'s `llama.cpp:` target runs, after `git checkout -b build $(LLAMA_VERSION)`: +The series is owned by this backend (`backend/cpp/llama-cpp-localai-paged`), not by the stock +`llama-cpp` backend, which is pure upstream. `../Makefile` (the paged wrapper) clones the pinned +`llama.cpp` via the copied stock build infra, then applies this series onto the cloned tree with the +same strict `git apply` the stock build uses for base patches: ``` -for p in $(CURRENT_MAKEFILE_DIR)/patches/0*.patch; do git apply --verbose "$p"; done +for p in $(PAGED_PATCHES_DIR)/0*.patch; do git apply --verbose "$p" || exit 1; done ``` -All variants (avx/avx2/avx512/cuda/…) copy the patched `llama.cpp/` tree, so the series ships everywhere. +All variants (avx/avx2/avx512/cuda/…) clone + apply into their own build copy, so the series ships +everywhere without ever touching the stock `llama-cpp` source tree. ## Status @@ -78,5 +82,5 @@ by itself reach vLLM throughput parity, because the measured prefill bottleneck (Lever 3: `mul_mat_q` ~22 TFLOP/s, ~27× behind vLLM) — a *per-token compute* gap that paging does not touch. Paged attention closes the **concurrency/memory** gap (more sequences, prefix reuse); the prefill/throughput gap additionally needs the tcgen05/CUTLASS grouped-GEMM (deferred, upstream-grade, no shortcut — see -`../paged/UPSTREAM_GGML_ISSUE.md` and `DGX_BLACKWELL_PLAN.md`). So full vLLM parity = this series **AND** the +`paged/README.md`). So full vLLM parity = this series **AND** the kernel; neither alone suffices. diff --git a/backend/cpp/llama-cpp/patches/kernel/0001-fp4-grouped-moe-scaffold.patch b/backend/cpp/llama-cpp-localai-paged/patches/kernel/0001-fp4-grouped-moe-scaffold.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/kernel/0001-fp4-grouped-moe-scaffold.patch rename to backend/cpp/llama-cpp-localai-paged/patches/kernel/0001-fp4-grouped-moe-scaffold.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0001-vendor-paged-kv-manager.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0001-vendor-paged-kv-manager.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0001-vendor-paged-kv-manager.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0001-vendor-paged-kv-manager.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0002-paged-kv-block-placement-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0002-paged-kv-block-placement-env-LLAMA_KV_PAGED.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0002-paged-kv-block-placement-env-LLAMA_KV_PAGED.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0002-paged-kv-block-placement-env-LLAMA_KV_PAGED.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0003-paged-gather-read-env-LLAMA_KV_PAGED.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0004-paged-on-demand-block-allocation-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0004-paged-on-demand-block-allocation-env-LLAMA_KV_PAGED.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0004-paged-on-demand-block-allocation-env-LLAMA_KV_PAGED.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0004-paged-on-demand-block-allocation-env-LLAMA_KV_PAGED.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0006-paged-cross-request-prefix-caching-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0006-paged-cross-request-prefix-caching-env-LLAMA_KV_PAGED.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0006-paged-cross-request-prefix-caching-env-LLAMA_KV_PAGED.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0006-paged-cross-request-prefix-caching-env-LLAMA_KV_PAGED.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0007-paged-engine-prefix-recompute-skip-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0007-paged-engine-prefix-recompute-skip-env-LLAMA_KV_PAGED.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0007-paged-engine-prefix-recompute-skip-env-LLAMA_KV_PAGED.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0007-paged-engine-prefix-recompute-skip-env-LLAMA_KV_PAGED.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0008-paged-server-cross-request-prefix-share-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0008-paged-server-cross-request-prefix-share-env-LLAMA_KV_PAGED.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0008-paged-server-cross-request-prefix-share-env-LLAMA_KV_PAGED.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0008-paged-server-cross-request-prefix-share-env-LLAMA_KV_PAGED.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0009-paged-in-kernel-decode-read-env-LLAMA_KV_PAGED-patch.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0009-paged-in-kernel-decode-read-env-LLAMA_KV_PAGED-patch.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0009-paged-in-kernel-decode-read-env-LLAMA_KV_PAGED-patch.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0009-paged-in-kernel-decode-read-env-LLAMA_KV_PAGED-patch.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0010-paged-tile-in-kernel-read-and-dispatch-guard-env-LLAMA_KV_PAGED.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0010-paged-tile-in-kernel-read-and-dispatch-guard-env-LLAMA_KV_PAGED.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0010-paged-tile-in-kernel-read-and-dispatch-guard-env-LLAMA_KV_PAGED.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0010-paged-tile-in-kernel-read-and-dispatch-guard-env-LLAMA_KV_PAGED.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0011-paged-decode-route-GQA-grouped-tile-kernel-by-defaul.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0011-paged-decode-route-GQA-grouped-tile-kernel-by-defaul.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0011-paged-decode-route-GQA-grouped-tile-kernel-by-defaul.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0011-paged-decode-route-GQA-grouped-tile-kernel-by-defaul.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0012-paged-mask-pad-invariant-assert.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0012-paged-mask-pad-invariant-assert.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0012-paged-mask-pad-invariant-assert.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0012-paged-mask-pad-invariant-assert.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0013-paged-decoupled-prefill-token-budget.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0013-paged-decoupled-prefill-token-budget.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0013-paged-decoupled-prefill-token-budget.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0013-paged-decoupled-prefill-token-budget.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0014-paged-expert-aware-moe-token-tile-cap.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0014-paged-expert-aware-moe-token-tile-cap.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0014-paged-expert-aware-moe-token-tile-cap.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0014-paged-expert-aware-moe-token-tile-cap.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0015-paged-expert-density-aware-moe-token-tile-auto-select.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0015-paged-expert-density-aware-moe-token-tile-auto-select.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0015-paged-expert-density-aware-moe-token-tile-auto-select.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0015-paged-expert-density-aware-moe-token-tile-auto-select.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0016-paged-dynamic-prefill-budget-continuous-batch.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0016-paged-dynamic-prefill-budget-continuous-batch.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0016-paged-dynamic-prefill-budget-continuous-batch.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0016-paged-dynamic-prefill-budget-continuous-batch.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0017-fp4-gemm-decode-tile-tune.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0017-fp4-gemm-decode-tile-tune.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0017-fp4-gemm-decode-tile-tune.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0017-fp4-gemm-decode-tile-tune.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0018-qwen35-ssm-decode-inplace-state.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0018-qwen35-ssm-decode-inplace-state.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0018-qwen35-ssm-decode-inplace-state.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0018-qwen35-ssm-decode-inplace-state.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0019-qwen35-ssm-decode-fused-gather.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0019-qwen35-ssm-decode-fused-gather.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0019-qwen35-ssm-decode-fused-gather.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0019-qwen35-ssm-decode-fused-gather.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0020-qwen35-gdn-oproj-mmq-reshape.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0020-qwen35-gdn-oproj-mmq-reshape.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0020-qwen35-gdn-oproj-mmq-reshape.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0020-qwen35-gdn-oproj-mmq-reshape.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0021-qwen35-conv-state-inplace-fusion.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0021-qwen35-conv-state-inplace-fusion.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0021-qwen35-conv-state-inplace-fusion.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0021-qwen35-conv-state-inplace-fusion.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0022-qwen35-gdn-recurrence-occupancy-retune.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0022-qwen35-gdn-recurrence-occupancy-retune.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0022-qwen35-gdn-recurrence-occupancy-retune.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0022-qwen35-gdn-recurrence-occupancy-retune.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0023-qwen35moe-nvfp4-quant-dedup.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0023-qwen35moe-nvfp4-quant-dedup.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0023-qwen35moe-nvfp4-quant-dedup.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0023-qwen35moe-nvfp4-quant-dedup.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0024-paged-pool-burst-reclaim.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0024-paged-pool-burst-reclaim.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0024-paged-pool-burst-reclaim.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0024-paged-pool-burst-reclaim.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0025-qwen35moe-nvfp4-moe-decode-regraph.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0025-qwen35moe-nvfp4-moe-decode-regraph.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0025-qwen35moe-nvfp4-moe-decode-regraph.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0025-qwen35moe-nvfp4-moe-decode-regraph.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0028-qwen35-recurrent-state-gather-fusion.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0028-qwen35-recurrent-state-gather-fusion.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0028-qwen35-recurrent-state-gather-fusion.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0028-qwen35-recurrent-state-gather-fusion.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0029-qwen35-blocktable-within-step-cache.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0029-qwen35-blocktable-within-step-cache.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0029-qwen35-blocktable-within-step-cache.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0029-qwen35-blocktable-within-step-cache.patch diff --git a/backend/cpp/llama-cpp/patches/paged/0030-fused-op-backend-gate.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0030-fused-op-backend-gate.patch similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/0030-fused-op-backend-gate.patch rename to backend/cpp/llama-cpp-localai-paged/patches/paged/0030-fused-op-backend-gate.patch diff --git a/backend/cpp/llama-cpp/patches/paged/LOCALAI_LLAMACPP_BACKEND_PLAN.md b/backend/cpp/llama-cpp-localai-paged/patches/paged/LOCALAI_LLAMACPP_BACKEND_PLAN.md similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/LOCALAI_LLAMACPP_BACKEND_PLAN.md rename to backend/cpp/llama-cpp-localai-paged/patches/paged/LOCALAI_LLAMACPP_BACKEND_PLAN.md diff --git a/backend/cpp/llama-cpp/patches/paged/PAGED_BITEXACT_NOTE.md b/backend/cpp/llama-cpp-localai-paged/patches/paged/PAGED_BITEXACT_NOTE.md similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/PAGED_BITEXACT_NOTE.md rename to backend/cpp/llama-cpp-localai-paged/patches/paged/PAGED_BITEXACT_NOTE.md diff --git a/backend/cpp/llama-cpp/patches/paged/PIN_SYNC_c299a92c.md b/backend/cpp/llama-cpp-localai-paged/patches/paged/PIN_SYNC_c299a92c.md similarity index 96% rename from backend/cpp/llama-cpp/patches/paged/PIN_SYNC_c299a92c.md rename to backend/cpp/llama-cpp-localai-paged/patches/paged/PIN_SYNC_c299a92c.md index fa74a6771..ff69560f5 100644 --- a/backend/cpp/llama-cpp/patches/paged/PIN_SYNC_c299a92c.md +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/PIN_SYNC_c299a92c.md @@ -21,7 +21,8 @@ Unlike the `9d5d882d` sync (which needed 4 patch re-exports), this bump required **zero patch changes**. The already-shipped source-only series (the result of the `7e1832b8` strip that removed all stray dev-doc hunks) applies to a fresh clean `ggml-org/llama.cpp` checkout at `c299a92c` with the build's own **strict -`git apply`** (the `llama.cpp` target in `backend/cpp/llama-cpp/Makefile`: +`git apply`** (the `apply-paged-patches` step in +`backend/cpp/llama-cpp-localai-paged/Makefile`: `git apply --verbose "$p" || exit 1`) and reaches **exit 0** - every one of the 28 patches reported "Applied patch ... cleanly", the sentinel `src/paged-kv-manager.cpp` was created, and there are **zero** stray @@ -94,7 +95,7 @@ here to keep the pin-bump diff minimal. ## Source of truth -The shipped `.patch` files under `backend/cpp/llama-cpp/patches/paged/` are the +The shipped `.patch` files under `backend/cpp/llama-cpp-localai-paged/patches/paged/` are the source of truth and are unchanged by this bump. The DGX dev tree (`~/llama-paged-dev`, branch `paged`) was advanced to `c299a92c` for consistency; the pre-bump state is retained at `paged-prebump-9d5d882d-backup`. diff --git a/backend/cpp/llama-cpp/patches/paged/README.md b/backend/cpp/llama-cpp-localai-paged/patches/paged/README.md similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/README.md rename to backend/cpp/llama-cpp-localai-paged/patches/paged/README.md diff --git a/backend/cpp/llama-cpp/patches/paged/final_benchmark.csv b/backend/cpp/llama-cpp-localai-paged/patches/paged/final_benchmark.csv similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/final_benchmark.csv rename to backend/cpp/llama-cpp-localai-paged/patches/paged/final_benchmark.csv diff --git a/backend/cpp/llama-cpp/patches/paged/paged-burst-bench.cpp b/backend/cpp/llama-cpp-localai-paged/patches/paged/paged-burst-bench.cpp similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/paged-burst-bench.cpp rename to backend/cpp/llama-cpp-localai-paged/patches/paged/paged-burst-bench.cpp diff --git a/backend/cpp/llama-cpp/patches/paged/paged-reclaim-unit.cpp b/backend/cpp/llama-cpp-localai-paged/patches/paged/paged-reclaim-unit.cpp similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/paged-reclaim-unit.cpp rename to backend/cpp/llama-cpp-localai-paged/patches/paged/paged-reclaim-unit.cpp diff --git a/backend/cpp/llama-cpp/patches/paged/qwen36_dense_decode_vs_npl.png b/backend/cpp/llama-cpp-localai-paged/patches/paged/qwen36_dense_decode_vs_npl.png similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/qwen36_dense_decode_vs_npl.png rename to backend/cpp/llama-cpp-localai-paged/patches/paged/qwen36_dense_decode_vs_npl.png diff --git a/backend/cpp/llama-cpp/patches/paged/qwen36_moe_decode_vs_npl.png b/backend/cpp/llama-cpp-localai-paged/patches/paged/qwen36_moe_decode_vs_npl.png similarity index 100% rename from backend/cpp/llama-cpp/patches/paged/qwen36_moe_decode_vs_npl.png rename to backend/cpp/llama-cpp-localai-paged/patches/paged/qwen36_moe_decode_vs_npl.png diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index bc404ee6c..3e486b922 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -6,14 +6,6 @@ # bump and is advanced only by the manual PIN_SYNC process. LLAMA_VERSION?=9d5d882d8cd0f0a9283d87ed5e6fe3ee0d925fb1 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp -# LLAMA_PAGED controls whether the vendored paged-attention patch series -# (patches/paged/) is applied on top of the pinned llama.cpp. Default on; set -# LLAMA_PAGED=off to build a clean-against-upstream backend (e.g. to unblock a -# dep-bump if an upstream change breaks a paged hook - the paged carry is then -# fixed independently). Runtime behaviour stays gated by the LLAMA_KV_PAGED env -# regardless, so an LLAMA_PAGED=on build is byte-identical to stock until that -# env is set. -LLAMA_PAGED?=on CMAKE_ARGS?= BUILD_TYPE?= @@ -187,23 +179,14 @@ llama.cpp: [ -e "$$p" ] || continue; \ echo "applying llama.cpp patch: $$p"; \ git apply --verbose "$$p" || { echo "patch failed: $$p"; exit 1; }; \ - done && \ - if [ "$(LLAMA_PAGED)" = "off" ]; then \ - echo "LLAMA_PAGED=off: skipping paged-attention patch series"; \ - else \ - for p in $(CURRENT_MAKEFILE_DIR)patches/paged/0*.patch; do \ - [ -e "$$p" ] || continue; \ - echo "applying llama.cpp PAGED patch: $$p"; \ - git apply --verbose "$$p" || { echo "paged patch failed: $$p"; exit 1; }; \ - done; \ - fi + done llama.cpp/tools/grpc-server: llama.cpp mkdir -p llama.cpp/tools/grpc-server - LLAMA_PAGED=$(LLAMA_PAGED) bash prepare.sh + bash prepare.sh rebuild: - LLAMA_PAGED=$(LLAMA_PAGED) bash prepare.sh + bash prepare.sh rm -rf grpc-server $(MAKE) grpc-server diff --git a/backend/cpp/llama-cpp/paged/.gitignore b/backend/cpp/llama-cpp/paged/.gitignore deleted file mode 100644 index a3bc88ec9..000000000 --- a/backend/cpp/llama-cpp/paged/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -tests/test_free_block_queue -tests/test_block_pool -tests/test_paged_kv_manager -tests/test_prefix_cache -tests/test_ggml_paged_rw -tests/test_ggml_paged_attn -paged-bench diff --git a/backend/cpp/llama-cpp/paged/BLACKWELL_KERNEL_GAPS.md b/backend/cpp/llama-cpp/paged/BLACKWELL_KERNEL_GAPS.md deleted file mode 100644 index 34d4d4657..000000000 --- a/backend/cpp/llama-cpp/paged/BLACKWELL_KERNEL_GAPS.md +++ /dev/null @@ -1,105 +0,0 @@ -# Blackwell (GB10 / sm_121) kernel gaps — measured + the corrected strategy - -Supersedes the "greenfield tcgen05 FP4 grouped GEMM" framing in `FP4_GROUPED_MOE_KERNEL.md`. Research + -profiling reframed the problem: the kernels we need **already exist in ggml**; they're just **untuned for -Blackwell**. And the parity target is far lower than the headline vLLM number implied. - -## 1. The parity target was wrong — it's ~3,300 t/s single-stream, not 24,444 - -vLLM's dense "24,444 t/s" is **aggregate concurrent-batch** throughput, not single-sequence. The GB10 -compute roofline caps **single-stream** Qwen3-32B prefill at **~3,300 t/s (BF16/INT8 ceiling)** / **~6,600 -(FP4 ceiling)**. So: don't chase 24,444 with one kernel. Aggregate parity = (a kernel at the ceiling) + -(batched-prefill scheduling). The *kernel* job is to reach ~3,300 (matches vLLM, which on GB10 also runs at -the BF16 ceiling) or ~6,600 (beats it, via FP4). - -## 2. GB10 per-precision DENSE peaks (measured, not spec) - -| precision | dense peak | vs BF16 | -|---|---|---| -| BF16 / FP16 | ~213 TFLOP/s | 1.0× | -| INT8 | ~215 TOPS | **1.0×** | -| FP4 (MXFP4/NVFP4) | ~427–500 TFLOP/s | **2.0×** | - -Memory: ~273 GB/s LPDDR5X (the bottleneck for *decode*; prefill is compute-bound). **Critical:** GB10 is -**1:1:2** (BF16:INT8:FP4), NOT datacenter Blackwell's 1:2:4 — **INT8 gives ZERO speedup over BF16 here.** So -int8-MMQ has no precision advantage; only FP4 does. (NVIDIA spec sheets still claim 1:2:4 — contradicted by -direct GB10 measurement; on-the-record discrepancy.) - -## 3. Measured gaps (nsys, GB10) - -| path | kernel | % of prefill | achieved | % of ceiling | -|---|---|---|---|---| -| **Dense** Q4_K_M | `mul_mat_q` (int8 MMQ) | 80% | ~46 TFLOP/s | **~21% of 215** | -| **MoE** MXFP4 | `mul_mat_q` (FP4 MMA) | 37% | ~22 TFLOP/s | **~4–5% of 500** (or ~10% of BF16) | - -Both kernels are **engaged correctly but untuned for Blackwell** — llama.cpp's MMQ was "tuned primarily for -RTX 3000/4000" (Ampere/Ada). The headroom (4–5×) is recoverable; it's not an architectural ceiling. - -## 4. ggml's current quantized-matmul paths (what exists) - -- **MMQ** (int8): quantizes activations to Q8_1, int8 `mma.sync`/`dp4a`. Prefill path. **Untuned for sm_12x.** -- **FP4 MMA** (#17906, merged): native MXFP4/NVFP4 `m16n8k64` block-scaled FP4 mma for cc≥12.0. Works on GB10 - for MoE (we measured 3441 t/s MXFP4 prefill) — but underutilized (~5% of FP4 peak). On **sm_121** it's hit - by build-flag (`120f`) + nvcc `-O3` miscompile (#18331) + capability-gating issues. -- **dequant→cuBLAS-FP16**: unfused fallback (materializes FP16 weights, round-trips memory). Not a fused - Marlin. (Our `GGML_CUDA_FORCE_CUBLAS` no-op = this didn't even engage for Q4_K.) -- **NO fused Marlin-style W4A16 kernel** (dequant 4-bit→BF16 in-shared-mem → BF16 tensor cores). Real gap. - -## 5. Strategy — match vs beat (this replaces the tcgen05-greenfield plan) - -**To MATCH vLLM (~3,300 single-stream): FP4 is NOT required.** Because INT8 == BF16 on GB10, a tuned MMQ and -a BF16 Marlin kernel share the *same* ceiling — and vLLM hits parity via W4A16 Marlin (BF16), since its FP4 -is also broken on sm_121. - -Ranked, by effort: -1. **Probe: tune the existing int8 MMQ for Blackwell** (dense). Cheapest. We're at 21% of the ceiling — - recover via tile sizes, async copy (`cp.async`), double-buffered shared-mem pipeline, occupancy. Caveat: - the `nwarps*tile_C::I==mmq_y` static_assert (found earlier) couples the constants; and the Q8_1 - activation-quant overhead caps pure-MMQ tuning. Bounded upside, but a fast experiment. -2. **Build a Marlin-style W4A16 BF16 GEMM** (dense) — the robust path to ~3,300 (4.3× over today's 765). - Dequant 4-bit→BF16 in shared memory, MMA on BF16 tensor cores, `cp.async` multi-buffer, offline weight - reshuffle. Mirrors vLLM's actual GB10 path; keeps activations BF16 (better quality than int8 MMQ); fills a - genuine ggml gap. **This is the recommended kernel to MATCH.** - -**To BEAT vLLM (~6,600, 2×): fix — don't rewrite — the FP4 path on sm_121.** -3. **Get the existing FP4 MMA (#17906/#20644) fully working + tuned on sm_121.** It already works on sm_120 - (RTX 5090: +43–68% prefill) and on GB10 for MoE. The blockers are the `120f` arch flag, the `-O3` - miscompile (#18331), capability gating — **build/compiler fixes, not a new kernel.** Then tune the FP4 MMQ - (it's at ~5% of FP4 peak). This is where upstream momentum already is, and the only route past vLLM. - -**Dropped:** the from-scratch tcgen05/CUTLASS grouped GEMM (the old scaffold). It aimed past the matchable -ceiling, duplicates work the FP4-MMA path already does, and FP4 on sm_121 is a *fix* problem not a *write* -problem. The `fp4-grouped-moe.cu` scaffold/hook stays as a useful dispatch seam, but the kernel behind it -should be one of (1)/(2)/(3), not a greenfield CUTLASS collective. - -## 6. Cheap experiment — RESULT: MXFP4 dense = free 1.44×, but not parity (kernel still untuned) - -Requantized Qwen3-32B dense → MXFP4 (forced attn+ffn to mxfp4 via `--tensor-type`, `--allow-requantize`, -speed-only test) and benched prefill: - -| quant | kernel | pp512 | pp2048 | vs Q4_K | -|---|---|---|---|---| -| Q4_K_M | int8-MMQ | 765 | 763 | 1.0× | -| **MXFP4** | **FP4-MMA** | **1099** | **1153** | **1.44×** | - -**Findings:** -- **MXFP4 dense is a real, free 1.44× over Q4_K** — just a requantize, the existing FP4-MMA path engages for - dense weights on GB10. Worth shipping as a **Blackwell dense-quant recommendation** in the gallery (no kernel). -- **But it is NOT parity.** 1153 t/s = **~17% of the FP4 ceiling (~6,600)** / ~35% of the BF16 ceiling. So the - **FP4-MMA kernel is itself untuned** (consistent with the MoE measurement, ~5% of FP4 peak). MXFP4 moves dense - from the int8 path (765) onto the FP4 path (1153), but the FP4 kernel leaves ~4–6× on the table. -- **So the kernel work is confirmed and now precise: tune the FP4-MMA kernel** (it's the highest-value, since it - serves both dense-MXFP4 and MoE, and FP4 is the only path that can *beat* vLLM). Strategy item (3) — fix + - tune the existing FP4-MMA on sm_121 — is the priority; a Marlin-style W4A16 BF16 kernel (2) is the alternative - to *match* on the BF16 ceiling if FP4 tuning stalls. - -Conclusion: the cheap test did NOT collapse the kernel problem (the kernels are untuned, not just the quant), but -it (a) gives a free 1.44× to ship now, and (b) sharpens the target to **tuning the FP4-MMA kernel**. - -## Sources -GB10 peaks (measured): forums.developer.nvidia.com/t/351993, /360142, /373618. Marlin: github.com/IST-DASLab/marlin, -arxiv 2408.11743, developers.redhat.com Marlin/Machete. MMQ untuned: llama.cpp docs/build.md, discussions/16578, -DandinPower/llama.cpp_bench. FP4 landing/sm121: llama.cpp PR #17906/#20644, issues #19662/#18331. Roofline: -vllm.ai/blog/2026-06-01-vllm-dgx-spark, lmsys.org DGX Spark. - -> **Correction (measured):** the earlier `GGML_CUDA_FORCE_CUBLAS` env test was a no-op because it's a *compile-time* `#ifdef`, not a runtime flag — cuBLAS never engaged. A real rebuild with `-DGGML_CUDA_FORCE_CUBLAS=ON` shows cuBLAS is **slower** than MMQ for dense Q4 (pp2048 690 vs 750) and runs an **Ampere `cutlass_80_tensorop` FP16 kernel** — cuBLAS-13.0 has no sm_121-tuned GEMM and falls back to sm_80. So *both* MMQ and cuBLAS sit at ~46 TFLOP/s (~21% of the 213 BF16 peak); there is **no library shortcut** to the ceiling on GB10 — a hand-tuned sm_120a kernel (Marlin-style) is required. diff --git a/backend/cpp/llama-cpp/paged/CHUNKED_PREFILL_PLAN.md b/backend/cpp/llama-cpp/paged/CHUNKED_PREFILL_PLAN.md deleted file mode 100644 index 4dc90f97b..000000000 --- a/backend/cpp/llama-cpp/paged/CHUNKED_PREFILL_PLAN.md +++ /dev/null @@ -1,334 +0,0 @@ -# Chunked prefill + n_batch/n_ubatch decouple — implementation plan - -Scope: LocalAI's llama.cpp backend (`backend/cpp/llama-cpp/`). Companion to -`PHASED_VLLM_PARITY_PLAN.md` Phase 3. This document is the concrete, file-cited -plan for what the brief called "chunked prefill". - -Line numbers below are from two trees: -- LocalAI: `backend/cpp/llama-cpp/grpc-server.cpp`, `core/backend/options.go`, - `backend/backend.proto`, `core/backend/hardware_defaults.go` — exact. -- Vendored upstream scheduler: `llama.cpp/tools/server/server-context.cpp`. The - build copies `llama.cpp/tools/server/*` into `tools/grpc-server/` (`prepare.sh` - lines 15-17) and only overrides `grpc-server.cpp` + `CMakeLists.txt`. So - `update_slots()` is **inherited upstream code, not LocalAI code**. Line numbers - cited for it are from a same-era checkout (`d12cc3d`, 2026-04-09); the pin is - `f3e1828` (Makefile line 2). The structure is identical; exact lines may drift - a few rows at the pin — match on the quoted comment strings, not the integers. - ---- - -## TL;DR — the headline finding - -**Chunked prefill with prefill/decode interleaving is ALREADY implemented** in the -llama.cpp server scheduler that LocalAI vendors. It is not a missing feature on -this version. `update_slots()` in `server-context.cpp`: - -1. **Adds ongoing decode tokens first** — "first, add sampled tokens from any - ongoing sequences" (≈ line 2088). Every `SLOT_STATE_GENERATING` slot gets its - one sampled token into the shared `llama_batch` before any prefill is added. -2. **Then fills the remaining `n_batch` budget with prompt (prefill) tokens** — - "next, batch any pending prompts without exceeding n_batch" (≈ line 2166), - gated by `params_base.cont_batching` (LocalAI sets `cont_batching = true` by - default, `grpc-server.cpp:547`). The per-slot prefill fill loop - (≈ line 2552) is `while (slot.prompt.n_tokens() < slot.task->n_tokens() && - batch.n_tokens < n_batch)` — i.e. it caps each slot's prefill contribution to - the **remaining** budget and defers the rest to the next iteration. -3. **Decodes the combined batch in one pass** (≈ line 2728-2741): decode tokens - and prefill-chunk tokens go through the **same `llama_decode`**, which then - splits internally into `n_ubatch` physical sub-batches. - -This is exactly the behavior the abandoned-looking draft **upstream PR #10718** -("server : chunked prefill support") asked for — "the first task is no longer -blocked by the second long prompt processing task." That PR is still marked OPEN -but its goal was absorbed into the natural evolution of `update_slots()`; we do -**not** need to port it. A long prefill no longer stalls the decode batch: decode -slots are serviced first every iteration, prefill consumes only the leftover -budget. - -**Therefore: do not re-implement chunked prefill.** The real LocalAI gap is -narrow and is the rest of this plan: - -- **Phase A (the actual gap): the `n_batch`/`n_ubatch` decouple.** LocalAI ties - the scheduler token budget (`n_batch`) to the physical forward width - (`n_ubatch`) at `grpc-server.cpp:515` + `:519`. This forces - `n_batch == n_ubatch`, so the logical scheduling window can never be wider than - one physical ubatch. You cannot keep `n_ubatch` at the Blackwell GEMM sweet - spot (2048) while widening `n_batch` so concurrent prefills + decodes co-batch - into a larger logical window. There is no first-class `batch:`/`ubatch:` split - on the Go side, and there is only a one-directional `ubatch` override on the C++ - side (you can shrink ubatch below the coupled value, never grow n_batch above - it). -- **Phase B (optional policy lever): a decode-headroom prefill cap.** Upstream - caps prefill at the full `n_batch` shared with decode. Under heavy mixed load - one fat prefill chunk per iteration still adds inter-token latency (ITL) jitter - to the decoders sharing that forward. vLLM exposes - `long_prefill_token_threshold` / `max_num_partial_prefills` for this. A - LocalAI-specific per-iteration prefill cap (a patch to vendored `update_slots`) - bounds that jitter. This is genuinely not in upstream and is the only place a - scheduler-policy change is warranted. - ---- - -## 1. Current behavior — precise citations - -### 1.1 The scheduler is upstream, inherited verbatim -- `prepare.sh:15-17` copies all of `llama.cpp/tools/server/*` into the - `grpc-server` build dir; `grpc-server.cpp` (LocalAI) replaces only the HTTP/gRPC - service + `params_parse` + `parse_options`. `update_slots()`, the slot state - machine, and the batch builder are **upstream `server-context.cpp`**, untouched - by LocalAI today. -- Slot states: `server-context.cpp:36-42` — - `SLOT_STATE_IDLE / WAIT_OTHER / STARTED / PROCESSING_PROMPT / DONE_PROMPT / - GENERATING`. - -### 1.2 Decode-first, then prefill-fill, one shared batch -- `common_batch_clear(batch)` (≈ 2078) — one batch per `update_slots` iteration. -- Decode phase (≈ 2088-2156): for each `SLOT_STATE_GENERATING` slot, - `common_batch_add(batch, slot.sampled, …, /*logits=*/true)` adds exactly one - token. Decode is guaranteed a seat before prefill runs. -- Budget fetch (≈ 2158-2160): `n_batch = llama_n_batch(ctx)`, - `n_ubatch = llama_n_ubatch(ctx)`. -- Prefill phase (≈ 2166): `if (params_base.cont_batching || batch.n_tokens == 0)` - → with cont_batching ON, prefill is added to the **same** batch as decode. -- Per-slot prefill fill (≈ 2552-2597): - `while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch)` - — adds prompt tokens until the slot is done **or** the shared budget is hit. - Whatever does not fit stays for the next iteration (the slot remains - `SLOT_STATE_PROCESSING_PROMPT`). -- Whole-prompt completion (≈ 2603-2615): when the slot's prompt is fully consumed - it flips to `SLOT_STATE_DONE_PROMPT`, sets `batch.logits[last] = true`, inits - the sampler. Next iteration it becomes `GENERATING`. -- Budget break (≈ 2693-2695): `if (batch.n_tokens >= n_batch) break;`. -- Decode (≈ 2728-2741): loops `batch_view` slices of `min(n_batch, remaining)` and - calls `llama_decode`; the physical `n_ubatch` split happens inside - `llama_decode`. - -### 1.3 The chunking is gated by `can_split()` -- `server-context.cpp:225-231`: `can_split()` returns true unless the task needs - embeddings with non-LAST pooling. So **completion/generation tasks always - chunk-and-interleave**; only embeddings/rerank force the whole prompt into one - ubatch (≈ 2234-2244 raises "input is too large… increase the physical batch - size" — this is exactly why LocalAI bumped `n_ubatch` for rerank, see below). - -### 1.4 LocalAI ties n_batch to n_ubatch (the gap) -- `grpc-server.cpp:515` — `params.n_batch = request->nbatch();` -- `grpc-server.cpp:519` — `params.n_ubatch = request->nbatch();` with the comment - that this fixes reranking being capped at the 512 default `n_ubatch`. -- `grpc-server.cpp:781-784` — the **only** decouple knob today: an `n_ubatch` / - `ubatch` option that overrides `n_ubatch` alone (added for embeddings/rerank). - There is **no** `batch` / `n_batch` option parse, so `n_batch` cannot be raised - above the coupled value from a model config. Confirmed: `grep '"n_batch"|"batch"'` - in `grpc-server.cpp` returns nothing. -- Options arrive via `request->options(i)` parsed as `optname:optval` - (`grpc-server.cpp:584-585`); these come from `ModelOptions.Options` ⟵ - `c.Options` (`core/backend/options.go:221`). - -### 1.5 Go side sends a single batch number -- `backend/backend.proto:341` — `int32 NBatch = 4;` is the only batch field; there - is **no** `NUBatch`. -- `core/backend/options.go:108-129` `EffectiveBatchSize`: returns `c.Batch` if set, - else context size for single-pass (score/embed/rerank), else - `hardwareDefaultBatchSize(512)`. -- `core/backend/options.go:228` — `NBatch: int32(b)` (single value to the - backend; becomes both `n_batch` and `n_ubatch` via 1.4). -- `core/backend/hardware_defaults.go:28,37-40` — `BlackwellBatchSize = 2048`; - on Blackwell an unset batch defaults to 2048, so today - `n_batch == n_ubatch == 2048` there. - ---- - -## 2. Why the decouple matters for serving (not just rerank) - -Invariant: `n_ubatch <= n_batch`. `n_ubatch` is the physical forward-pass GEMM -width (compute efficiency; GB10 sweet spot ≈ 2048). `n_batch` is the per-iteration -**scheduler token budget** — the logical window shared by decode + prefill chunks, -analogous to vLLM's `max_num_batched_tokens`. - -With `n_batch == n_ubatch` (today), the scheduling window cannot exceed one -physical ubatch. Consequences: -- Under concurrency, the combined (decode + multiple prefill chunks) logical batch - is capped at the physical ubatch, so aggregate prefill cannot grow past one - ubatch worth of tokens per iteration even when more slots have prompts queued. -- A user who shrinks `batch:` for memory also shrinks the physical ubatch, - degrading prefill GEMM efficiency — and vice versa. - -Decoupling lets us hold `n_ubatch = 2048` (efficient GEMM) while setting a larger -`n_batch` (e.g. 4096) so more concurrent prefill+decode tokens co-schedule into one -logical window, lifting aggregate prefill under mixed load — `llama_decode` still -tiles the physical work at 2048. - ---- - -## 3. Phased implementation - -### Phase 0 — Verification harness (do first; TDD red) -Bite-sized, no code change to the scheduler. -- **0.1 Token-identical greedy under mixed load.** Script: start the backend with - `n_parallel >= 4`, greedy sampling (temp 0, fixed seed). Fire (a) several short - decode streams and (b) one ~8k-token prompt concurrently (the exact repro from - PR #10718's body works). Capture each stream's full token id sequence. Re-run - with the prefill request absent. **Assert the short streams' token ids are - byte-identical** in both runs — proves interleaving does not perturb decode - numerics (KV/position correctness across chunk boundaries). Wire as a Ginkgo - spec under the backend e2e suite. -- **0.2 Mixed-workload throughput baseline.** Use `llama-batched-bench` (built from - the same tree) or a small driver hitting `/v1/chat/completions`: measure - aggregate prefill tok/s and decode tok/s, and p50/p99 ITL of the decode streams, - under the mixed workload. Record numbers for the current `n_batch==n_ubatch` - config. This is the before of Phase A/B. - -Expected result of Phase 0: 0.1 already passes (interleave is correct today); -0.2 gives the baseline the decouple must beat. - -### Phase A — Decouple n_batch from n_ubatch -Goal: let model config set the physical ubatch independently of the logical batch, -defaulting to today's behavior (no regression). - -- **A.1 C++: accept a `batch`/`n_batch` option (and keep `ubatch`).** - In `grpc-server.cpp`, after the existing `ubatch` branch (`:781-784`), add a - sibling branch: - ```cpp - } else if (!strcmp(optname, "n_batch") || !strcmp(optname, "batch")) { - if (optval != NULL) { - try { params.n_batch = std::stoi(optval_str); } catch (...) {} - } - ``` - This is the missing direction (raise `n_batch` above the coupled value). Order - matters: both `:515/:519` run first (coupling as default), then option parsing - overrides either independently. Add a clamp note: if a user sets - `n_ubatch > n_batch`, llama.cpp will clamp/upbatch; log a warning. Keep the - `:519` aliasing for backward compat (rerank still works with no options). - -- **A.2 Proto: add an explicit physical ubatch field.** - `backend/backend.proto:341` add `int32 NUBatch = ;` (do not reuse - 4). Regenerate with `make protogen-go` + the C++ proto build. - -- **A.3 C++: honor `NUBatch` when present.** - In `grpc-server.cpp` `params_parse`, after `:519`, add: - ```cpp - if (request->nubatch() > 0) { - params.n_ubatch = request->nubatch(); - } - ``` - so an explicit physical ubatch wins over the `n_batch` alias, with the `ubatch` - string-option as a third path for users who only edit `options:`. - -- **A.4 Go: config surface + plumbing.** - - Add `UBatch *int` (yaml `ubatch`) to the llama config struct alongside `Batch` - (search `core/config` for the `Batch` field; mirror it). - - In `core/backend/options.go`: add `EffectiveUBatchSize(c)` mirroring - `EffectiveBatchSize` (return `c.UBatch` if set, else - `min(EffectiveBatchSize(c), BlackwellBatchSize-or-512)` so the physical ubatch - stays at the hardware sweet spot while `n_batch` may be larger). Set - `NUBatch: int32(EffectiveUBatchSize(c))` next to `NBatch:` (`:228`). - - Keep the default such that when neither is set, `NUBatch == NBatch` ⇒ - byte-identical to today. - -- **A.5 Serving default (the lever).** - In `hardware_defaults.go`, introduce `BlackwellLogicalBatch = 4096` (or a - measured value) and let `EffectiveBatchSize` return it for **multi-slot serving** - configs (when `n_parallel > 1` and the model is a completion model), while - `EffectiveUBatchSize` stays at `BlackwellBatchSize = 2048`. Gate behind the same - Blackwell detection already used at `:37-40`. Single-stream/embedding/rerank - paths keep `n_batch == n_ubatch`. This is the only behavioral change shipped by - Phase A; Phase 0.2 must show it is net-positive before defaulting it on. - -- **A.6 Tests.** Extend `hardware_defaults_internal_test.go` with - `EffectiveUBatchSize` cases; add a `grpcModelOpts` test asserting - `NUBatch <= NBatch` and that unset config yields `NUBatch == NBatch`. Re-run - 0.1 (must still be token-identical) and 0.2 (must show aggregate-prefill gain or - neutral ITL) at `n_batch=4096, n_ubatch=2048`. - -### Phase B — Decode-headroom prefill cap (optional policy, vendored patch) -Only if Phase 0.2 / A shows decode ITL jitter from fat prefill chunks. This is the -one change that touches the inherited scheduler, so it lives as a patch in -`backend/cpp/llama-cpp/patches/` (applied by `prepare.sh:6-11` / Makefile -`:141-145`), never as an edit to a checked-in upstream file. - -Policy (pseudocode; insert into `update_slots()` prefill fill loop, the -`while (… && batch.n_tokens < n_batch)` at ≈ `server-context.cpp:2552`): - -``` -# token budget for THIS iteration, decode already seated: -n_decode_in_batch = batch.n_tokens # set after the decode phase -prefill_budget = n_batch # default == today - -if serving_mode and n_decode_in_batch > 0: - # leave room so decoders are not starved/jittered by one giant prefill chunk - # max_prefill_per_iter defaults to n_ubatch (one physical tile) when decode active - prefill_budget = min(n_batch, n_decode_in_batch + max_prefill_per_iter) - -# fill loop guard becomes: -while slot.prompt.n_tokens() < slot.task->n_tokens() - and batch.n_tokens < prefill_budget: - ... -``` - -- `max_prefill_per_iter` is a new `common_params` field surfaced as an - `options:` knob (`max_prefill_tokens` / `mpt`) parsed in `grpc-server.cpp` - exactly like A.1, default `0` = disabled = today's behavior. -- Semantics mirror vLLM `long_prefill_token_threshold`: cap the prefill share so - ongoing decodes keep a steady cadence; the remaining prompt rides the next - iteration (already supported by the state machine — slot stays - `PROCESSING_PROMPT`). -- **Correctness:** unchanged KV/position path — chunk boundaries already advance - `slot.prompt.tokens.pos_next()` per added token (≈ 2570) and the slot resumes - from `slot.prompt.n_tokens()` next iteration. Capping the budget only changes - *how many* tokens are added this iteration, not *which* positions, so 0.1 must - remain token-identical. - -### Phase C — Docs + defaults rollout -- Document `batch` / `ubatch` (and `max_prefill_tokens` if B ships) in - `docs/content/` model-config reference, with the serving recipe - (`n_parallel>1`, `n_batch=4096`, `ubatch=2048`). -- Note the orthogonality to paged KV (below) in - `PHASED_VLLM_PARITY_PLAN.md` Phase 3. - ---- - -## 4. Risk / correctness - -- **KV-cache & positions across chunks:** already handled upstream. Each prefill - token added advances `pos_next()` (≈ 2570) and is pushed to `slot.prompt.tokens` - (≈ 2573); the next iteration resumes from `slot.prompt.n_tokens()`. Chunk - boundaries are transparent to the KV cache because positions are absolute, not - per-chunk. Phase A changes only budgets, not positions; Phase B changes only the - per-iteration count. The 0.1 token-identical test is the guardrail. -- **Unified KV cache (LocalAI default, `n_parallel` slots share one cache):** - unaffected — co-batching prefill+decode across slots is what the unified cache is - for; positions are per-`seq_id` (`{ slot.id }` in `common_batch_add`). -- **`n_ubatch > n_batch`:** invalid; A.4 clamps `EffectiveUBatchSize <= - EffectiveBatchSize` and A.1 logs a warning if options violate it. -- **Embeddings / rerank:** must keep `n_ubatch >= prompt length` (single pass, - `can_split()==false`). The existing `:519` alias + `EffectiveBatchSize` - context-sizing for single-pass usecases (`options.go:119-124`) must be preserved - — do not let the serving `BlackwellLogicalBatch` default leak into single-pass - configs (A.5 gates on completion + `n_parallel>1`). -- **Turboquant fork:** the fork lacks some `common_params` fields (see - `LOCALAI_LEGACY_LLAMA_CPP_SPEC` precedent at `grpc-server.cpp:755`). `n_batch` / - `n_ubatch` are ancient fields and safe; if Phase B adds `max_prefill_per_iter`, - guard the new field behind a `#ifndef` like the checkpoint block does. - -## 5. Orthogonality to paged KV (Phase 2) - -Keep them independent. Paged KV (the `-kvp` / block-manager effort, draft #22569, -and `paged/`) changes **where** KV blocks live (allocation/utilization). Chunked -prefill / this decouple changes **how many tokens per iteration** the scheduler -batches (the `n_batch` budget and decode/prefill interleave). They compose: paged -KV raises the concurrency ceiling (more slots), the decouple widens the per-iter -scheduling window to feed those slots; neither touches the other's data structures. -The only contact point is `update_slots()` — if both ship a vendored patch to it, -land them as separate, ordered patches in `patches/` and keep the hunks disjoint -(paged touches allocation/seq_rm; chunked-prefill Phase B touches the prefill fill -budget). - ---- - -## 6. Bottom line - -- Chunked prefill + decode interleave: **already present and correct** on the - pinned llama.cpp — verify (Phase 0.1), do not rebuild. -- Real work: the **n_batch/n_ubatch decouple** (Phase A) — small, additive, - default-preserving — plus an **optional decode-headroom prefill cap** (Phase B) - if measurements show ITL jitter. Both are LocalAI-side: A in `grpc-server.cpp` - + proto + `options.go`; B as a vendored `patches/` hunk. diff --git a/backend/cpp/llama-cpp/paged/DECODE_OVERHEAD.md b/backend/cpp/llama-cpp/paged/DECODE_OVERHEAD.md deleted file mode 100644 index 06b75ffdd..000000000 --- a/backend/cpp/llama-cpp/paged/DECODE_OVERHEAD.md +++ /dev/null @@ -1,215 +0,0 @@ -# llama.cpp multi-user decode overhead on DGX Spark (GB10, sm_121) - -Investigation of the Qwen3-32B concurrent-decode throughput gap (llama.cpp ~547 t/s -vs vLLM ~667 t/s) on the GB10 box, build `~/llama.cpp-pr24423/build` (Release, -sm_121, `LLAMA_MAX_SEQ=256`, flash-attn on), model -`~/bench/q3-32b-gguf/Qwen3-32B-Q4_K_M.gguf`. - -## TL;DR (the result overturns the brief's premise) - -On **this** build the prime suspect is wrong and the host-overhead premise does not -hold: - -1. **CUDA graphs are NOT disabled at high concurrency.** At npl=128, 94 of 98 - decode `graph_compute` calls **replay a captured CUDA graph** (0 resets, stable - key, no property churn post-warmup). The keyed-warmup gate works. -2. **There is no ~170ms/step host hotspot here.** The GPU is **~96% active during - decode with graphs ON and ~96% active with graphs OFF**. Decode at npl=128 is - **GPU-compute-bound**, not host-bound. -3. The brief's "20% GPU util / 66ms GPU / 170ms host per step" was measured on a - different/earlier build (mainline without these graph fixes). It is not - reproducible on `llama.cpp-pr24423`. -4. Because the GPU is the bottleneck, re-enabling graphs cannot lift the number: - the clean A/B shows graphs ON vs OFF = **+1.5% at npl=128** (and +2.9% at - npl=32 - the benefit shrinks as concurrency rises and the GPU saturates). -5. The real gap to vLLM is the **quantized decode GEMM kernel**: `mul_mat_q` - (Q4_K + Q6_K) is ~68% of decode GPU time and runs ~2.1x above the GB10 - memory-bandwidth floor. Closing the gap requires Marlin/Machete-style int4 - GEMM kernels, not host-side work. This is a kernel project (the direction the - prior session's uncommitted `marlin-w4a16.cu` / `fp4-grouped-moe.cu` already - started, though those target w4a16/GPTQ-int4, not the K-quants this GGUF uses). - -## 1. Why CUDA graphs are (not) disabled - exact code + measurement - -### The gate (code) - -PR24423 refactored the CUDA-graph path into a keyed, warmup-based scheme in -`~/llama.cpp-pr24423/ggml/src/ggml-cuda/ggml-cuda.cu`: - -- `ggml_cuda_graph_get_key(cgraph)` (~L3343) keys the cached CUDA graph by - `cgraph->nodes[0]` (first-node pointer). -- `ggml_cuda_graph_check_compability(cgraph)` (~L3301) disables graphs only for: - - **split buffers** (`ggml_backend_buft_is_cuda_split`), and - - **`GGML_OP_MUL_MAT_ID`** when `src0` is non-quantized **or** - `ne[2] > get_mmvq_mmid_max(...)` (MoE expert routing needs a stream sync). - Qwen3-32B is **dense** -> no `MUL_MAT_ID` -> this condition never fires. -- `ggml_backend_cuda_graph_compute` (~L4514) warmup gate: a graph is used only - after **2 consecutive calls with no property change** (`warmup_complete`); any - property change resets warmup. `ggml_cuda_graph_update_required` (~L3347) - detects change by `memcmp` of the full `ggml_tensor` struct + per-src - data-ptr/ne/nb, with a fast path when `cgraph->uid` is unchanged. - -### Why it stays enabled across decode steps - -The graph stays stable because llama.cpp's host-side graph reuse holds during -decode, so node pointers/props (and `cgraph->uid`) do not churn: - -- `llama_kv_cache::get_n_kv` (`src/llama-kv-cache.cpp` L1223-1233) **pads n_kv to - a multiple of 256** ("so that the graph remains constant across batches and can - be reused"). For ntg<=256 within the first KV block, n_kv is constant. -- `can_reuse_kq_mask` (`src/llama-graph.cpp` L43) keeps the KQ-mask dims stable: - `ne=[n_kv, n_tokens/n_stream, 1, n_stream]` = `[256,1,1,128]` every decode step - at npl=128. -- `can_reuse` (`src/llama-context.cpp` L1283) therefore returns true, so the - scheduler is **not** reset/re-split. `graph->uid` is only reassigned inside - `ggml_backend_sched_split_graph` (`ggml/src/ggml-backend.cpp` L1033, L1485), - which is skipped on the reuse path -> stable uid -> CUDA graph replays. - -### Measurement (instrumented build, npl=128, ntg=96) - -Env-gated counters added to `ggml_backend_cuda_graph_compute` / -`ggml_cuda_graph_update_required` (since `GGML_LOG_DEBUG` is compiled out in -Release / NDEBUG). End-of-run summary: - -``` -[GTRACE-SUMMARY] calls=98 notenab=0 warming=3 warmdone=1 RESET=0 USED=94 incompat=0 distinct_keys=1 -``` - -94/98 decode `graph_compute` calls **replayed** a captured CUDA graph; **0** -warmup resets; a **single** distinct graph key for the whole decode; no node -property churn after warmup. Graphs are fully engaged at npl=128. - -(The instrumentation was reverted afterwards; the checkout is back to its -pre-task state and the `.so` rebuilt clean.) - -## 2. The per-step CPU "hotspot" - there isn't one on this build - -GPU utilization during npl=128 decode (ntg=256): - -- **Graphs ON** - `nvidia-smi` sampled every 0.7s through the decode phase: - steady **96% GPU util**, SM clock **2184 MHz** (not throttled), 45-47 W. -- **Graphs OFF** (`GGML_CUDA_DISABLE_GRAPHS=1`) - nsys CUDA trace, 8s window: - total GPU kernel time = `3,983,292,128 ns / 0.516` = **~7.72s of the 8s - window = ~96% GPU-active**. Even with every kernel launched individually from - the host, the GPU is still ~96% busy. There are essentially **no host gaps**. - -Per-step wall = 60.6s / 256 steps = **~237 ms/step**, and the sum of one decode -graph's kernel times (nsys, graphs-on capture) is ~244 ms -> GPU kernel time per -step ~= wall time per step. The host work between steps is in the low single-digit -ms (the ~4% idle), consistent with graphs ON giving only +1.5% at npl=128. - -This directly contradicts the brief's 66ms-GPU / 170ms-host split, which must have -come from a pre-graphs build. - -### Per-step GPU breakdown (nsys, npl=128 decode, graphs off, 8s window) - -| Kernel | % GPU time | ~ms/step | -|--------|-----------:|---------:| -| `mul_mat_q` Q4_K (type 12) | 51.6 | ~118 | -| `flash_attn_ext_f16` | 19.3 | ~44 | -| `mul_mat_q` Q6_K (type 14) | 16.2 | ~37 | -| `unary_gated` silu | 4.1 | ~9 | -| mmq stream-k fixup + quantize_q8_1 | ~5 | ~12 | -| rms_norm / rope / set_rows / add | ~4 | ~10 | - -Quantized matmul = **~68%** of decode GPU time (~155 ms/step). Attention ~19%. - -`perf` could not profile the host (kernel `perf_event_paranoid=4`), but it is moot: -the host is ~4% of the wall, so there is no ~170ms host hotspot to chase. - -## 3. Fix attempt + measured result - -### The requested fix (re-enable graphs / pad the decode batch) is a no-op here - -Graphs are already enabled and the batch is already stable (n_kv padded to 256, -kq_mask dims constant). The clean cold A/B (cooldowns between every run): - -| npl | graphs ON (t/s) | graphs OFF (t/s) | delta | -|----:|----------------:|-----------------:|------:| -| 32 | 242.60 | 235.75 | +2.9% | -| 64 | 398.59 | 389.06 | +2.5% | -| 128 | 543.95 | 535.71 | +1.5% | - -Baseline (separate cold runs, original non-instrumented build): -npl=32 243.9, npl=64 397.1, **npl=128 544.95** (matches the ~546 baseline). - -Graphs help, but the benefit **monotonically shrinks** as concurrency rises and -the GPU saturates. At npl=128 there is only ~1.5% of host launch overhead left to -remove, and GPU util is ~96% in both columns. **You cannot lift npl=128 decode -toward 667 by working on graphs/host overhead - the GPU is the bottleneck.** - -### Where the number actually is, and the real lever - -- vLLM 667 t/s at this concurrency = **192 ms/step**; llama.cpp 547 = **237 - ms/step**. The ~45 ms/step gap maps almost entirely onto the quantized matmul. -- GB10 memory-bandwidth floor for a 32B Q4_K_M (~19.8 GB of weights, read once - per step and shared across the 128 sequences) at ~273 GB/s is **~72 ms/step**. - llama.cpp's `mul_mat_q` spends ~155 ms/step on matmul = **~2.1x the bandwidth - floor**. vLLM's Marlin/Machete int4 GEMMs run much closer to the floor; that - efficiency difference is the ~547 -> 667 gap. -- The Q6_K matmul (`mul_mat_q` type 14) also shows pathological tail latency - (median 0.89 ms, max 5.5 ms) - the MMQ kernel is not well-tuned for the skinny - n=128 decode shape. - -**The lever to beat 547 is a faster quantized decode GEMM**, i.e. a Marlin-style -int4 kernel for the decode shapes. This is exactly the direction of the prior -session's uncommitted `ggml/src/ggml-cuda/marlin-w4a16.cu` and -`fp4-grouped-moe.cu` (already wired via -`if (!split && ggml_cuda_w4a16_mul_mat(...)) return;` in `ggml_cuda_mul_mat`). -Note those target **w4a16 / GPTQ-int4**, while this GGUF is **K-quant (Q4_K/Q6_K)**, -so they are inert for this model - a Marlin path for K-quants (or shipping the -model in a Marlin-friendly int4 format) would be required. That is a multi-day -kernel effort, out of scope for this session, but it is the only lever that can -move the number. - -### Why the "bump LLAMA_MAX_SEQ to 1024 -> 377" data point is consistent - -`llama_batch_allocr` keeps `seq_cpl` as an `LLAMA_MAX_SEQ x LLAMA_MAX_SEQ` table -(`src/llama-batch.cpp`), so per-batch seq bookkeeping scales ~O(MAX_SEQ^2). At -MAX_SEQ=1024 that host cost becomes large enough (~70 ms/step) to dominate and -drop decode to 377. At MAX_SEQ=256 the same term is ~4.4 ms/step (the ~1.5% that -graphs reclaim); lowering to 128 would save ~3 ms/step (~1%). So MAX_SEQ tuning -confirms the host term is real but tiny at 256 - not a path to 667. - -## How this would land in LocalAI - -- **No host/graph patch is warranted** for this build: graphs already engage and - the decode is GPU-bound. A "pad the decode batch / force graph capture" patch - would change nothing measurable at high concurrency. -- The actionable upstream/vendored work is a **Marlin-style int4 decode GEMM** - (extend the prior `marlin-w4a16.cu` to cover K-quants, or quantize the served - model into a Marlin-friendly int4 layout). That is where the ~547 -> 667+ lives. -- If a small host win is still wanted, keep `LLAMA_MAX_SEQ` no larger than the max - concurrency actually used (the per-batch `seq_cpl` table is O(MAX_SEQ^2)). - -## Reproduction - -``` -# baseline / A/B (cold, 30s cooldowns) -llama-batched-bench -m Qwen3-32B-Q4_K_M.gguf -npp 16 -ntg 128 -npl 32,64,128 \ - -ngl 99 -b 2048 -ub 2048 -fa on # graphs on -GGML_CUDA_DISABLE_GRAPHS=1 ...same... # graphs off - -# GPU util (graphs on): sample nvidia-smi during decode -> ~96%, 2184 MHz -# GPU active (graphs off): nsys profile -t cuda --delay=6 --duration=8 ... -# nsys stats --report cuda_gpu_kern_sum -> sum/0.516 ~= 7.72s of 8s = ~96% -``` - -## UPDATE: NVFP4 closes most of the decode gap (no Marlin-for-K-quants needed) - -The diagnosis above said the lever is "a more bandwidth-efficient int4 decode GEMM" -and feared a multi-day Marlin-for-K-quants kernel. But the FP4-MMA path is already -that kernel. Measured (npl=128, cold A/B, npp=16 ntg=128): - -| quant | decode S_TG (t/s) | vs Q4_K | vs vLLM 667 | -|---|---|---|---| -| Q4_K_M | 547 (548/546) | - | 82% | -| **NVFP4** | **619 (617/622)** | **+13%** | **93%** | - -NVFP4's `mul_mat_q` runs closer to the GB10 bandwidth floor at the thin n=128 -decode shape than Q4_K's int8-MMQ (which ran ~2.1x above it). So shipping the model -as NVFP4 closes the decode gap from ~22% to ~7% AND wins prefill (1209 vs Q4 767 / -vLLM 800). Net on GB10: llama.cpp+NVFP4 is ahead on prefill (1.5x) and within ~7% on -decode. The remaining ~7% would be incremental FP4-MMA decode-kernel tuning, NOT a -from-scratch Marlin kernel - a much smaller, optional effort. NVFP4 is the answer to -both the prefill and the decode gap. diff --git a/backend/cpp/llama-cpp/paged/DGX_BLACKWELL_PLAN.md b/backend/cpp/llama-cpp/paged/DGX_BLACKWELL_PLAN.md deleted file mode 100644 index 8a844b96d..000000000 --- a/backend/cpp/llama-cpp/paged/DGX_BLACKWELL_PLAN.md +++ /dev/null @@ -1,253 +0,0 @@ -# Closing the vLLM Gap on Blackwell (GB10 / DGX Spark) — Living Plan & Results - -Target hardware: NVIDIA **GB10** (Grace-Blackwell, `sm_121a`, 119 GiB unified LPDDR5X), `dgx.casa`. -Model under test: **Qwen3-Coder-30B-A3B-Instruct** (MoE, 128 experts, top-8, ~3B active). -Engines: llama.cpp (CUDA, `~/llama.cpp-pr24423`, build `7a6ddc5`, `CMAKE_CUDA_ARCHITECTURES=121`) vs vLLM 0.23.0 (`~/vllm-bench`, torch 2.11.0+cu130). - -> This is a working document. Each phase appends measured numbers, what was learned, and what's next. -> Methodology: `llama-bench` (single-stream pp/tg, built-in reps) and `llama-batched-bench` (`-npl` sweep, -> decode-phase aggregate `S_TG`, prefill aggregate `S_PP`); vLLM via `~/bench/vllm_conc.py` (decode-phase -> aggregate matched to `S_TG`). Same model/prompt/seed. Precision matched where possible. - ---- - -## Baseline results (established) - -### Single-stream (B=1), matched ~8-bit -| Engine / precision | prefill pp512 (t/s) | decode tg128 (t/s) | -|---|---|---| -| llama.cpp **Q8_0** | 2215 ± 15 | **54.8 / 62.2** * | -| llama.cpp **F16** | 700 ± 24 | 32.9 ± 0.05 | -| vLLM **FP8** | 9155 ± 308 | 52.45 ± 0.05 | - -\* two sessions; ~55 right after worker-stop (clocks settling), ~62 steady state. Both ≥ vLLM → **single-stream parity holds**. - -### Concurrency sweep (decode-phase aggregate `S_TG`, prefill aggregate) -| B | llama Q8 prefill | vLLM FP8 prefill | llama Q8 decode | vLLM FP8 decode | -|---|---|---|---|---| -| 1 | 1080 | 9644 | 60.1 | 48.0 | -| 8 | 2189 | 33373 | 160.8 | 312.4 | -| 32 | 2198 | 99398 | 357.1 | 1171 | -| 64 | 2194 | 151990 | 519.2 | 2064 | - -llama F16 prefill also flat: B=1 452 → B=8 723 → B=32 778. **Prefill flat at both precisions = kernel-throughput ceiling.** - -### Our paged patch (LLAMA_KV_PAGED) — concurrency effect: NONE -Same Q8 binary, paged branch confirmed firing (137 placements at B=8), throughput identical within noise: -| | B=1 | B=8 | B=32 | -|---|---|---|---| -| stock decode | 61.2 | 171.7 | 377.0 | -| paged decode | 62.7 | 170.8 | 376.8 | - -Patch is placement-only correctness prototype; doesn't implement concurrency mechanics. Single-stream-neutral, concurrency-neutral. - ---- - -## Root-cause diagnosis (nsys + code audit) - -- **74.5% of GPU compute = `mul_mat_q`** (Q8_0 int8 MMQ GEMM, the MoE experts). Only cutlass kernel seen is `cutlass_80_tensorop` = **Ampere (sm_80)**, not Blackwell. -- ggml-cuda has **NO FP8 path** (no e4m3/e5m2 GEMM, no cuBLASLt FP8). Q8_0 runs the **Ampere-class int8 `mma.sync s8.s8.s32`** even on GB10 (`mma.cuh:924`, dispatched unconditionally `mmq.cu:307`). -- ggml-cuda **DOES** have a **native Blackwell FP4 path** (MXFP4 + NVFP4, `mma...kind::mxf4...e2m1`, `mma.cuh:1126`, gated `BLACKWELL_MMA_AVAILABLE`). Merged via #17906/#20644/#21074. -- **No fused MoE grouped GEMM**, no tcgen05/wgmma (warp-level `mma.sync` only). -- **Small per-expert GEMMs**: 512-tok ubatch → ~32 tok/expert (128 exp, top-8) → thin GEMMs, memory-bound, can't fill tensor-core tiles. vLLM processes 8192 tok/step → ~512 tok/expert → compute-bound + FP8. -- **The 45–69× gap is partly apples-to-oranges**: we compared llama Q8 (Ampere int8) vs vLLM FP8 (Blackwell). Upstream/NVIDIA benches put the *real* FP4-vs-FP8 prefill gap at **~25–50% long-context**, not 45–69×. - -Key upstream refs: discussion #22042 (FP8 design: `ggml_mul_mat_ext` + scale tensors), #17906 (native MXFP4), #18250 (NVFP4-MoE closed not-planned). - ---- - -## The levers (cheap → expensive) — execution log - -### Lever 1 — NVFP4/MXFP4 model (use existing Blackwell FP4 path) + ubatch bump -Status: **IN PROGRESS** — single-stream done, concurrency next. -Quant: `llama-quantize F16 -> MXFP4_MOE` (type 38), 15.9 GiB / 4.47 BPW. (No NVFP4 in llama-quantize; MXFP4_MOE puts experts in MXFP4 = Blackwell FP4 MMA.) - -Single-stream (llama-bench), MXFP4 vs Q8 vs vLLM-FP8: -| metric | llama Q8 | **llama MXFP4** | vLLM FP8 | -|---|---|---|---| -| prefill pp512 (ub512) | 2215 | **3061 ± 22** | 9155 | -| prefill pp2048 (ub512) | ~2200 | 3137 ± 7 | — | -| prefill pp2048 (**ub2048**) | — | **3441 ± 14** | — | -| decode tg128 | 62.2 | **86.4 ± 0.3** | 52.45 | - -Findings: -- **MXFP4 decode 86.4 beats vLLM FP8 52.45 by 1.65×** (4-bit = less memory traffic; decode is memory-bound). llama wins decode outright. -- MXFP4 prefill +38% over Q8; **ub2048 lifts prefill +10%** (3137→3441). Single-stream prefill gap to vLLM: 4.1× (Q8) → **2.7× (MXFP4)**. -- Caveat: MXFP4 is 4-bit vs vLLM FP8 8-bit — not precision-matched. Fair match = vLLM NVFP4 (4-bit); pending. -Concurrency (decode-phase aggregate `S_TG`, ub2048), MXFP4 vs Q8 vs vLLM-FP8: -| B | Q8 dec | **MXFP4 dec** | vLLM dec | Q8 pp | **MXFP4 pp** | vLLM pp | -|---|---|---|---|---|---|---| -| 1 | 60.1 | **83.4** | 48.0 | 1080 | 1625 | 9644 | -| 8 | 160.8 | **267.4** | 312.4 | 2189 | 3634 | 33373 | -| 32 | 357.1 | **551.2** | 1171 | 2198 | 3651 | 99398 | -| 64 | 519.2 | **770.2** | 2064 | 2194 | 3648 | 151990 | - -**Lever-1 verdict:** MXFP4 is a large, free win — decode +50–66% over Q8, prefill plateau +66% (2200→3650). MXFP4 decode **wins at B=1, near-parity at B=8** vs vLLM; only falls behind at high concurrency. **Prefill still plateaus (~3650)** — the MoE prefill GEMM doesn't scale with batch (no fused grouped GEMM; ubatch-limited). That plateau is the real remaining structural gap → Levers 2–3. Quality caveat unchanged (MXFP4 4-bit vs vLLM FP8 8-bit; quality not yet evaluated). - -### Lever 2 — `n_ubatch` / `n_batch` tuning (standalone) -Status: **DONE + SHIPPED (auto-default implemented)** -MXFP4 pp4096 vs ubatch: ub512=2994, **ub2048=3316**, ub4096=2820(noisy), ub8192=3180. -**Verdict:** prefill saturates at ub=2048; larger ubatch gives nothing. The ~3300–3650 ceiling is the **MoE GEMM kernel**, not batch size. → No more free config wins; the rest is kernel work (Levers 3–5). -**Implemented:** `core/backend/hardware_defaults.go` — `EffectiveBatchSize` now defaults the physical batch -(n_batch→n_ubatch alias) to **2048 on Blackwell** (`xsysinfo.IsNVIDIABlackwell`, cc≥12 / sm_120/121) when the -config leaves `batch:` unset; explicit `batch:` always wins. Detection is a shared Go helper; placed at the -common ModelOptions builder so it covers the C++ llama.cpp backend too. Tests: `hardware_defaults_internal_test.go`. - -### Lever 1b — Standard Q4 vs MXFP4 (what's actually MXFP4-specific) -**Q4_K_M** (17.3 GiB) vs **MXFP4** (15.9 GiB), ub2048: -| metric | Q4_K_M | MXFP4 | Q8 | -|---|---|---|---| -| decode tg128 | **93.5** | 86.4 | 62.2 | -| prefill pp512 | 2164 | **3061** | 2215 | -| prefill pp2048 | 2953 | **3441** | ~2200 | -**Verdict:** the **decode win is just "4-bit"** — plain Q4_K_M matches/beats MXFP4 on decode (both memory-bound). -MXFP4's *only* real edge is **prefill (+41% over Q4_K_M)** via Blackwell FP4 tensor cores. So for shipping, -**"4-bit quant + ubatch=2048" captures most of the win portably**; MXFP4 is a Blackwell-only prefill extra. - -### Lever 3 — Fused FP4/FP8 MoE grouped GEMM (+ activation-quant fusion) -Status: **DESIGNED + PROFILED, not built** (multi-week kernel R&D). The single biggest remaining prefill win. - -**Decisive measurements:** -- Prefill does NOT scale with bigger single prompts (attention O(N²) confounds): MXFP4 pp2048=3295, pp8192=1524, - pp16384=2051. So the plateau is not a batch-size fix. -- Real gap is batched many-sequence prefill: B=32 llama 3651 vs vLLM 99398 = **27×**. llama.cpp MoE prefill runs - at only **~22 effective TFLOP/s** on the GB10 — far below the GPU. Large headroom. -- **nsys (MXFP4 pp2048):** `mul_mat_q` (MoE FP4 GEMM) = **37.2%**, `quantize_mmq_mxfp4` (act-quant) = 8.0%, - `mul_mat_q` (dense/attn, still Q8) = 10.1%, flash_attn = 8.8%. The native FP4 MMA *is* engaged — the - inefficiency is the **per-expert thin-tile MMQ scheduler** + **un-fused activation quant**. - -**Target (precise):** the ~45% in `mmq.cu`'s grouped MoE path (`ggml_cuda_mul_mat_q` + `ids`, `mmid.cu`). Replace -the per-expert thin-tile scheduler with a CUTLASS-style grouped GEMM (full tiles regardless of tokens/expert) and -fuse `quantize_mmq_mxfp4` into the permute/gather. Dense Q8 matmuls (10%) are the separate Lever-4 (FP8) target. -Problem (measured): the prefill ceiling is the MoE expert GEMM. Today `ggml_cuda_mul_mat_q` with `ids` -(`mmq.cu:127`) launches one grouped MMQ over a 3D grid (z = expert), but each expert's tile is thin -(~tokens/expert columns) so int8/FP4 tensor cores run underfilled; throughput is memory-bound on weight -streaming and flat vs batch. -Approach: -- Replace the per-expert thin-tile scheduler with a **CUTLASS-style grouped GEMM** that concatenates all - experts' token-blocks into one problem with per-group offsets, so tiles are always full (m16n8k64 FP4 / - m16n8k32 FP8) regardless of per-expert token count. Mirrors vLLM's `fused_moe` + cutlass grouped GEMM. -- **Fuse activation quantization into the permute/gather** (the `quantize_mmq_q8_1`/FP4 quantize currently a - separate 3.3% kernel) so the routed activations are quantized as they're scattered into expert order. -- Files: new kernel under `ggml/src/ggml-cuda/` (e.g. `moe-grouped-gemm.cu`) + dispatch hook in - `ggml_cuda_mul_mat_id` (`ggml-cuda.cu:2622`); reuse `mmid.cu` routing/`expert_bounds`. -- Effort: high (2–4 wks expert CUDA). Risk: numerics + sm_121 tile tuning. Expected payoff: the bulk of the - prefill gap (vLLM's MoE prefill advantage is mostly this). Upstream: #18250 (NVFP4-MoE) was closed - not-planned, so this would be a LocalAI patch or a fresh upstream proposal. - -### Lever 4 — FP8 (e4m3) GEMM for dense layers -Status: **DESIGNED, not built** (blocked on a core ggml API change). -Problem: ggml-cuda has no FP8 matmul (only int8/FP4). vLLM runs qkv/o_proj/lm_head in FP8 on Blackwell -tensor cores. Our dense layers run int8-MMQ or f16-cuBLAS. -Approach (two options): -- (a) **cuBLASLt FP8**: route dense `mul_mat` through `cublasLtMatmul` with `CUDA_R_8F_E4M3` A/B and FP32 - compute + scale pointers. Lowest kernel effort; gets library-tuned Blackwell FP8 immediately. Needs the - scale-tensor plumbing below. -- (b) **Hand-written sm_121 `mma.sync ...e4m3.e4m3.f32`** kernels in `mma.cuh`/`mmf.cu`. More control, more work. -- Prerequisite (both): the **`ggml_mul_mat_ext` / scale-tensor API** from upstream discussion #22042 — - per-tensor FP8 scales don't fit the block-scaled quant struct; `MUL_MAT`/`MUL_MAT_ID` must accept optional - scale tensors. This is a cross-cutting ggml change (graph + ops + all backends' fallbacks). -- Effort: high (API change is the hard part; cuBLASLt path is then moderate). Payoff: closes dense-layer - prefill/compute gap; complements Lever 3. Note: for *this* MoE model the experts dominate, so Lever 3 > 4. - -### Lever 5 — tcgen05 / wgmma-class kernels for large-prefill tiles -Status: **DESIGNED, not built** (very high effort; last increment). -Problem: ggml's tensor-core path is warp-level `mma.sync` only (no `wgmma`/`tcgen05`). Blackwell's -tensor-memory `tcgen05` MMA (what CUTLASS uses) extracts substantially more throughput at large prefill tiles. -Approach: introduce warpgroup/tcgen05 GEMM main-loops for the FP4/FP8 paths (effectively adopting CUTLASS -3.x collective mainloops for sm_120/121), used when tile size is large enough (prefill). Decode (thin) keeps -`mma.sync`. -- Effort: very high (CUTLASS-class engineering). Payoff: the final slice of large-prefill throughput; only - worth it after Levers 3–4 land. Realistically: depend on/upstream CUTLASS kernels rather than hand-roll. - ---- - -## Paged attention — complete implementation (after kernels are fair) -The placement prototype is insufficient (measured: zero concurrency benefit). A real implementation needs all -four gaps. CPU foundation already built & verified (`PagedKVManager` P0–P3, `README.md`); the in-model parts -are unbuilt. **Build order and concrete design:** - -1. **On-demand block allocation from a shared pool** (capacity win — more concurrent seqs before OOM). - - Replace `find_slot`'s ring-buffer (`llama-kv-cache.cpp:818`) with `PagedKVManager` block allocation; the - KV tensor becomes a shared block pool `[n_embd, block_size*num_blocks]`, sequences draw blocks on demand - (already prototyped on CPU: `paged_kv_manager.{h,cpp}`, `test_ggml_paged_rw.cpp`). - - Win measured where it counts: max concurrent sequences before OOM (not yet benchmarked — needs this). -2. **Gather-read** so each seq attends only its own blocks (`get_k`/`get_v` `:1145/1165` → `ggml_get_rows` - gather into scratch, then existing attention). Numerically proven on CPU (`test_ggml_paged_attn.cpp`, - 7.5e-08 vs reference). Needs `build_attn_paged` branch in `llama-graph.cpp` + Gate 0 in a real model. -3. **Continuous batching / scheduler** (no head-of-line blocking on mixed-length traffic). New scheduler in - the server slot path; admit/evict at block granularity; the dimension where paging beats llama.cpp's - current static batching. This is where the *real* concurrency win lives (vs our synthetic uniform test). -4. **Automatic prefix sharing** (block-hash dedup; `PagedKVManager::{compute_block_hashes,get_computed_blocks}` - already implemented & tested). Cross-tenant shared system prompts reuse physical blocks. - -Status: design in `2026-06-19-paged-attention-llamacpp-design.md`; CPU P0–P3 done; in-model #1–#4 unbuilt. -**Then** measure concurrency in paging's real scenarios — **memory-pressured (max seqs before OOM)** and -**mixed-length continuous batching** — on the MXFP4 (fair-quant) footing, not the uniform/over-provisioned -test that (correctly) showed no benefit. - -> Reality check from this session's data: paged attention is a **capacity + scheduling** win, not a per-token -> speed win. On GB10 with 119 GB unified memory and uniform requests we are not memory-bound at B≤64, so the -> placement prototype showed nothing. Paging's value appears under memory pressure (many/long sequences) and -> bursty mixed-length traffic. The per-token throughput gap is a **kernel** problem (Levers 1–3), separate -> from paging. - ---- - -## Implementation plan A — Lever 3: FP4 MoE GEMM to vLLM parity - -Goal: lift batched MoE prefill from ~3.65k t/s (B=32) toward vLLM's ~99k. Root cause (profiled): -`mul_mat_q` runs at ~22 effective TFLOP/s — warp-level `mma.sync`, not Blackwell tcgen05. -Cheap knobs are exhausted (ubatch saturates at 2048; `GGML_CUDA_FORCE_CUBLAS` is a no-op 3419↔3423; -tile width already full at mmq_x=128). So parity needs kernel work, done iteratively on the DGX -(`~/llama.cpp-pr24423`, editable + rebuildable; diffs captured as `patches/`). - -Phases (each: hypothesis → edit `ggml/src/ggml-cuda/` → `cmake --build build --target llama-bench` → -`llama-bench` MXFP4 pp/concurrency → record): -1. **Cheap kernel tweaks (low confidence, fast).** nwarps (occupancy), `mmq_y` tile, stream-k on/off, - FP4 load-tile path. Measure each. Likely small (<1.3x) — these don't change the warp-MMA ceiling. - - **Result (nwarps):** DEAD END. `nwarps` is locked by `static_assert(nwarps*tile_C::I == mmq_y)` - (mmq.cuh:3234) → nwarps=8 for mmq_y=128. Can't raise occupancy without co-scaling mmq_y to 256 - (nwarps=16), which blows Blackwell shared-memory limits. The MMQ constants are tightly coupled; - it is not freely tunable. Confirms parity needs the kernel rewrite (phase 3), not knobs. -2. **Fuse activation quant** (`quantize_mmq_mxfp4`, 8%) into the permute/gather. Removes a kernel + - a global round-trip. Tractable, ~1.1x. - - **Result:** NOT AVAILABLE as a cheap patch. `quantize_mmq_fp4_cuda` (mmq.cu:200) *already* takes - `ids_src1` — the gather is already fused into the quant. The only remaining fusion is quantize-on-load - *inside* the GEMM hot loop (intricate, ~8% ceiling, risky). ORippler's #24481 fuses the decode (MMVQ) - post-scale and intends a "BS>1" (prefill) follow-up — unwritten. Marginal; skip. - -**Upstream survey (2026-06):** there is NO tcgen05/CUTLASS grouped-GEMM MoE kernel in ggml — not merged, -not in-flight, not a draft (Discussion #18369 is talk, no PR; #18250 closed not-planned). CUTLASS is not a -dependency (the profile's `cutlass_80_tensorop` is cuBLAS-internal). No fork has a portable MoE kernel -(croll83/llama.cpp-dgx is GatedDeltaNet-focused). Maintainer signal (woachk on #17906): "the path forward -is to wait for cuTile C++." So **nothing to cherry-pick; phase 3 is genuinely from-scratch.** -3. **The real lever — tcgen05 / CUTLASS FP4 grouped GEMM.** Replace the per-expert MMQ scheduler with a - CUTLASS 3.x collective-mainloop grouped GEMM (sm_120a, `e2m1` block-scaled, tcgen05 tensor-memory MMA), - one problem over all experts with per-group offsets, fused act-quant. This is what vLLM/FlashInfer use. - Multi-week; the honest path to parity. Prefer **upstream ggml** (issue drafted) over a private patch. -4. **Full-model low precision.** Quantize dense layers (qkv/o_proj/lm_head, the 10% Q8) to FP4/FP8 too so - the whole prefill runs on FP4 tensor cores, not int8-MMQ. -Exit per phase: measured t/s recorded here; stop a phase when it's a dead end (recorded as such). -Matching vLLM realistically requires phase 3; phases 1–2 are the warm-up + de-risking. - -## Implementation plan B — Complete paged attention (the pivot) - -CPU foundation done (P0–P3, `README.md`): vLLM-parity block manager + ggml write/gather + attention -numerics + placement Gate 0 (token-identical in-model). Remaining = make it deliver the multi-tenant wins. -Phases: -1. **On-demand shared-block pool** — replace `find_slot` ring buffer (`llama-kv-cache.cpp:818`) with - `PagedKVManager` block allocation; KV tensor = `[n_embd, block_size*num_blocks]` shared pool. Win: - fit more concurrent seqs before OOM. Test: max concurrent seqs at fixed budget vs contiguous. -2. **Gather-read** (`get_k/get_v` `:1145/1165` → `ggml_get_rows` into scratch) + `build_attn_paged` branch - in `llama-graph.cpp`. Numerically proven on CPU (7.5e-08). Gate 0: token-identical multi-seq. -3. **Continuous batching / scheduler** — admit/evict at block granularity in the server slot path. The - real concurrency win on mixed-length traffic (where the placement prototype showed nothing). -4. **Automatic prefix sharing** — block-hash dedup (`PagedKVManager::{compute_block_hashes,get_computed_blocks}` - already implemented + tested). Cross-tenant shared system prompts reuse physical blocks. -Then benchmark in paging's real regimes — **memory-pressured** + **mixed-length continuous batching** — on -the MXFP4 (fair-quant) footing. Note: GB10's 119 GB unified memory means win-1 needs genuine pressure -(long/many seqs) to show; the win is capacity + scheduling, not per-token speed. - -## Honest scope note -Levers 3–5 and the complete paged implementation are each substantial (weeks of expert CUDA/systems work). This doc tracks what is **measured** vs **designed** vs **not-yet-built**, and never claims a number that wasn't run on the box. diff --git a/backend/cpp/llama-cpp/paged/FP4_GROUPED_MOE_KERNEL.md b/backend/cpp/llama-cpp/paged/FP4_GROUPED_MOE_KERNEL.md deleted file mode 100644 index 22f53e610..000000000 --- a/backend/cpp/llama-cpp/paged/FP4_GROUPED_MOE_KERNEL.md +++ /dev/null @@ -1,59 +0,0 @@ -# FP4 grouped-GEMM MoE kernel (Lever 3) — scaffold + implementation plan - -The one piece of work that actually closes the vLLM gap on Blackwell (GB10/sm_121). Both phases are -bottlenecked by the same kernel: `mul_mat_q` (warp-level `mma.sync` grouped MMQ, ~22 TFLOP/s) is -**37%** of prefill and **54.6%** of decode-at-B=64 GPU time (`BENCHMARKS.md`). Paged attention can't touch -it (proven). The fix is a CUTLASS-3.x collective-mainloop grouped GEMM with block-scaled `e2m1` operands via -tcgen05 tensor-memory MMA — what vLLM/FlashInfer/TRT-LLM use. - -## Scaffold (DONE — builds clean, default byte-identical) - -Lives in the DGX checkout `~/llama.cpp-pr24423/ggml/src/ggml-cuda/` (to be rebased onto the pin as a patch / -upstreamed). Captured diff: `patches/kernel/0001-fp4-grouped-moe-scaffold.patch`. - -- `fp4-grouped-moe.{cuh,cu}` — entry `ggml_cuda_fp4_grouped_moe(ctx, src0, src1, ids, dst) -> bool` - (true = handled, false = fall back to MMQ). Gated behind env `GGML_CUDA_FP4_GROUPED`. Currently always - returns false → **default build unchanged**. -- Hook in `ggml_cuda_mul_mat_id` (the MoE dispatch), before the `ggml_cuda_mul_mat_q(...ids...)` call: - `if (ggml_cuda_fp4_grouped_moe(...)) return;`. Builds via the `file(GLOB "*.cu")` (re-run cmake configure - after adding the file — GLOB is configure-time). - -This is the integration seam. The kernel fills the stub. - -## Implementation phases (each: build on GB10 → numerical parity vs `mul_mat_q` → bench) - -1. **Reference grouped GEMM (correctness first, slow OK).** Per-expert problem sizes + offsets from `ids`; - dequant `e2m1`+scales → BF16; loop CUTLASS (or cuBLAS) per group. Gate: output matches MMQ within fp tol - on a 2-expert toy + the real model (token-identical greedy). Establishes the harness + the data plumbing. -2. **CUTLASS GemmGrouped, sm_120a, BF16 operands.** Replace the loop with one `cutlass::gemm::device:: - GemmGrouped` launch over all experts (per-group offsets). Measures the grouping win alone. -3. **Block-scaled FP4 operands (the real lever).** `e2m1` A/B with `e8m0`(MX)/`e4m3`(NV) block scales via the - Blackwell scaled-MMA collective (tcgen05 tensor-memory). This is where the TFLOP/s jumps. Needs CUTLASS - 3.x + sm_120a; verify the block-scale layout matches ggml's MXFP4/NVFP4 packing. -4. **Fuse activation quant** (the F32→FP4 of src1) into the gather/permute prologue. -5. **Enable by default** on sm_120/121 when parity holds + faster; keep the env as an escape hatch. - -## Dependencies / decisions - -- **CUTLASS is not currently a ggml dependency** (the profile's `cutlass_80_tensorop` is cuBLAS-internal). - Adding it = submodule/fetch + include dir, gated to CUDA sm_120+. Float the approach with ggml maintainers - early (Discussion #18369 is the home; JohannesGaessler asked to discuss arch before big kernel work). -- Target sm_120a/121a (consumer Blackwell). Datacenter Blackwell (sm_100) is a separate tile config. -- Risk: needs ncu-driven iteration on the GB10; this is multi-week, expert-CUDA. No upstream base to fork - (exhaustive search confirmed). Net-new value upstream. - -## DENSE scope — RESOLVED (TODO #28, benchmarked): dense needs an FP4 GEMM too - -Benchmarked Qwen3-32B dense, vLLM W4A16 vs llama.cpp Q4_K_M (`BENCHMARKS.md`). **Dense prefill is 7.6–32× -behind** (llama int8-MMQ plateaus ~765 t/s; vLLM FP4 scales to 24.4k); decode ~parity at B=1, 2.2× at B=64. -So the kernel track is **two kernels, not one**: - -- **(a) Dense FP4 GEMM** — a plain non-grouped CUTLASS/tcgen05 block-scaled FP4 GEMM. **Simpler than grouped; - land this FIRST** — it's the easier first kernel, benefits every dense model, and de-risks the FP4 collective - before the grouped variant. Hook: the non-MoE `ggml_cuda_mul_mat_q` (no `ids`) path. -- **(b) MoE grouped FP4 GEMM** — the scaffold above (`ggml_cuda_fp4_grouped_moe`), per-expert offsets. - -Both share the same block-scaled `e2m1` collective; (a) is (b) with one group. Suggested order: build (a), -prove the FP4 collective + parity harness, then generalize to (b). (Aside: full W4A4 NVFP4 doesn't run on -GB10 today — FlashInfer ships no FP4 cubins for sm_121, so the dense `mm_fp4` kernel hangs/returns zeros; the -W4A16 Marlin path is the fast, correct one and is the fair comparison. See `BENCHMARKS.md` for the root cause.) diff --git a/backend/cpp/llama-cpp/paged/MXFP4_QUALITY.md b/backend/cpp/llama-cpp/paged/MXFP4_QUALITY.md deleted file mode 100644 index fc5b8adf6..000000000 --- a/backend/cpp/llama-cpp/paged/MXFP4_QUALITY.md +++ /dev/null @@ -1,140 +0,0 @@ -# MXFP4-dense vs Q4_K_M quality check (Qwen3, GB10 / DGX Spark) - -## Question - -MXFP4-quantized **dense** Qwen3-32B is measurably faster on GB10 (Blackwell) than -Q4_K_M: ~1.58x concurrent prefill, ~1.2x decode, for free (just a requantize that -routes onto the FP4-MMA kernel). Before LocalAI recommends MXFP4-dense as a Blackwell -default, we must confirm its **quality is acceptable versus Q4_K** (Q4_K is normally the -stronger 4-bit format). - -Critical caveat going in: the pre-existing `~/bench/q3-32b-mxfp4-dense.gguf` was built -with `--allow-requantize`, so it was suspected to be **double-quantized** (Q4_K_M -> -MXFP4), which would unfairly penalize MXFP4. The goal here was a *fair* answer. - -## Verdict - -**Do NOT recommend MXFP4-dense as a quality-equivalent replacement for Q4_K on -Blackwell.** A clean apples-to-apples test (same BF16 source, both 4-bit, no imatrix) -shows MXFP4-dense carries a **large** quality penalty that Q4_K does not: - -- Q4_K_M costs **+2.6%** perplexity vs the BF16 baseline. -- MXFP4-dense costs **+30.8%** perplexity vs the BF16 baseline (i.e. **+27.5% worse - than Q4_K**). - -The double-quant suspicion was correct but it was **not** the main culprit: even a clean -MXFP4-from-BF16 is dramatically worse than Q4_K. The ~1.58x prefill / ~1.2x decode -speedup is real, but it is not free on quality. MXFP4-dense output is still coherent (not -gibberish), so it is usable where raw throughput dominates and a quality hit is -acceptable, but it must not be presented as a drop-in, quality-neutral Q4_K replacement. - -## Evidence - -### 1. Provenance of the existing 32B MXFP4 (it is double-quant) - -`~/dense_mxfp4.sh` (mtime matches the `q3-32b-mxfp4-dense.gguf` mtime, Jun 20 09:47) -created it: - -``` -SRC=$HOME/bench/q3-32b-gguf/Qwen3-32B-Q4_K_M.gguf # <-- source is Q4_K_M, not F16/BF16 -OUT=$HOME/bench/q3-32b-mxfp4-dense.gguf -$QB --allow-requantize --tensor-type "attn=mxfp4" --tensor-type "ffn=mxfp4" \ - "$SRC" "$OUT" MXFP4_MOE -``` - -Confirmed **double-quantized** (Q4_K_M -> MXFP4). Any PPL measured on this file -overstates MXFP4's true penalty, so the 32B number below is a loose upper bound, not the -fair answer. - -### 2. 32B quick read (wikitext-2-raw test, 50 chunks, ctx 512, ngl 99) - -`llama-perplexity`, PR build `~/llama.cpp-pr24423/build` (sm_121): - -| 32B model | PPL | vs Q4_K | -|---|---|---| -| Qwen3-32B-Q4_K_M | **7.3865** +/- 0.177 | - | -| q3-32b-mxfp4-dense (double-quant) | **8.4638** +/- 0.206 | +14.6% | - -MXFP4 is much worse than Q4_K here, **and** it is double-quant, so the quick read is -unfair -> escalated to a clean small-model comparison. - -### 3. Fair comparison: clean small dense model (Qwen3-4B BF16) - -The MXFP4-vs-Q4_K delta is a *format* property and roughly model-size-independent, so a -small model gives a fast, clean answer. Downloaded `Qwen3-4B-BF16.gguf` (unsloth, ~7.7 -GiB) and quantized it **from that same BF16 source** to both formats with the identical -recipe used for the 32B (no `--allow-requantize` needed, no imatrix on either side): - -``` -llama-quantize q3-4b-bf16.gguf q3-4b-q4km.gguf Q4_K_M -llama-quantize --tensor-type attn=mxfp4 --tensor-type ffn=mxfp4 \ - q3-4b-bf16.gguf q3-4b-mxfp4.gguf MXFP4_MOE -``` - -Perplexity (wikitext-2-raw test, 50 chunks, ctx 512, ngl 99): - -| Qwen3-4B | size | PPL | vs BF16 | vs Q4_K | -|---|---|---|---|---| -| BF16 (baseline) | 7672 MiB | **13.3188** +/- 0.416 | - | - | -| Q4_K_M | 2497 MiB | **13.6605** +/- 0.426 | **+2.57%** | - | -| MXFP4 (clean) | 2236 MiB (4.66 BPW) | **17.4183** +/- 0.561 | **+30.78%** | **+27.5%** | - -This is the apples-to-apples quality answer: **clean MXFP4-from-BF16 is ~12x more lossy -than Q4_K relative to the BF16 baseline** (30.8% vs 2.6%). Notably the clean-4B MXFP4-vs- -Q4_K gap (+27.5%) is *wider* than the 32B double-quant gap (+14.6%), consistent with -smaller models being more quantization-sensitive - the double-quant did not invent the -problem, it is intrinsic to the format as quantized by `llama-quantize`. - -### 4. Coherence spot-check (32B, llama-simple, n=60) - -MXFP4-dense 32B is fully coherent, not degraded gibberish: - -- "The capital of France is" -> MXFP4: "...Paris, is located near the Seine River..." - (correct); Q4_K similar. -- "Q: What is 17 multiplied by 23? A:" -> MXFP4 reasons via the distributive property - (sound); Q4_K answers 391 directly (correct). -- "def fibonacci(n):" -> both emit valid Python. - -So the quality cost shows up as measurably higher perplexity (and would surface on harder -/ longer tasks), not as obviously broken text at short generation lengths. - -## Why - -`MXFP4_MOE` is a 4-bit float format (E2M1 values, shared E8M0 scale per block of 32, -round-to-nearest) designed for MoE expert tensors (gpt-oss et al.) with a coarse -per-block scale. Q4_K uses 6-bit superblock scales plus per-sub-block mins - materially -better for dense attention/FFN weights. Forcing MXFP4 onto dense layers to reach the FP4 -kernel trades ~1.58x prefill for a large accuracy loss. The FP4-MMA speed path is real, -but the weights it accepts (MXFP4 here) are lossy for dense. - -## Caveat, stated precisely - -This measures **llama.cpp's `llama-quantize` MXFP4** (OCP MX FP4, RTN, **no imatrix**) -against **llama.cpp's Q4_K_M** (k-quant superblocks, also no imatrix here). It is a fair -format-vs-format comparison of exactly what LocalAI would ship if it routed a requantize -through this path. It does **not** claim FP4 is fundamentally unviable on Blackwell: - -- An imatrix-aware MXFP4, or a better FP4 format with two-level scaling - (**NVFP4** - there are already `q3-32b-nvfp4` / `q3-32b-nvfp4a16` dirs on the box), - may close much of this gap and is the more promising Blackwell FP4 path to evaluate. -- The result is for Qwen3 dense; other families may differ in magnitude but the - format-level disadvantage of plain MXFP4 RTN vs Q4_K is expected to hold. - -## Recommendation - -- **Do not** ship a blanket "use MXFP4-dense on Blackwell" recommendation as a Q4_K - quality equivalent. The ~1.58x prefill / ~1.2x decode win comes with a real ~30% PPL - inflation (vs ~2.6% for Q4_K). Q4_K_M stays the right dense default on Blackwell. -- If exposing MXFP4-dense at all, gate it as an explicit **throughput-over-quality** - option with the perplexity caveat surfaced, not a default. -- MXFP4/FP4 remains correct where the model is trained for it (MoE / gpt-oss-style). - Pursue **NVFP4** (and/or imatrix-aware FP4) as the quality-competitive Blackwell FP4 - format before making any FP4-dense recommendation. - -## Reproduction (DGX Spark, GB10, build `~/llama.cpp-pr24423/build`, sm_121) - -- Dataset: `~/wikitext-2-raw/wiki.test.raw` (wikitext-2-raw-v1 test). -- 32B: `~/ppl32b.sh` -> `~/ppl32b.out`; coherence `~/coh32b.sh` -> `~/coh32b.out`. -- Clean 4B: `~/fair4b.sh` -> `~/fair4b.out` (quantize + 3x perplexity). -- All runs `-ngl 99`, `--chunks 50`, `-c 512`. GB10 thermal-throttles but PPL is a - correctness metric, so thermal state does not affect these numbers. diff --git a/backend/cpp/llama-cpp/paged/Makefile b/backend/cpp/llama-cpp/paged/Makefile deleted file mode 100644 index 20f830b73..000000000 --- a/backend/cpp/llama-cpp/paged/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -CXX ?= g++ -CXXFLAGS ?= -std=c++17 -O2 -Wall -Wextra -I. - -TESTS = test_free_block_queue test_block_pool test_paged_kv_manager test_prefix_cache -BINS = $(addprefix tests/,$(TESTS)) - -all: $(BINS) - -tests/%: tests/%.cpp paged_kv_manager.cpp paged_kv_manager.h - $(CXX) $(CXXFLAGS) -o $@ $< paged_kv_manager.cpp - -check: all - @for t in $(BINS); do echo "== $$t =="; ./$$t || exit 1; done - -paged-bench: paged-bench.cpp paged_kv_manager.cpp paged_kv_manager.h - $(CXX) $(CXXFLAGS) -o $@ paged-bench.cpp paged_kv_manager.cpp - -bench: paged-bench - ./paged-bench - -# --- Optional ggml integration test (Phase 1: paged write/gather mechanism) --- -# Requires a built ggml. Override these to point at your checkout / build: -# make ggml-check GGML_SRC=/ggml GGML_BUILD= -GGML_SRC ?= ../../llama-cpp-fallback-build/llama.cpp/ggml -GGML_BUILD ?= /tmp/ggml-build -GGML_LIBDIR = $(GGML_BUILD)/src - -GGML_TESTS = test_ggml_paged_rw test_ggml_paged_attn -GGML_BINS = $(addprefix tests/,$(GGML_TESTS)) - -tests/test_ggml_%: tests/test_ggml_%.cpp paged_kv_manager.cpp paged_kv_manager.h - $(CXX) $(CXXFLAGS) -I$(GGML_SRC)/include -o $@ $< paged_kv_manager.cpp \ - -L$(GGML_LIBDIR) -lggml -lggml-base -lggml-cpu -Wl,-rpath,$(GGML_LIBDIR) - -ggml-check: $(GGML_BINS) - @for t in $(GGML_BINS); do echo "== $$t =="; ./$$t || exit 1; done - -clean: - rm -f $(BINS) $(GGML_BINS) paged-bench - -.PHONY: all check ggml-check clean diff --git a/backend/cpp/llama-cpp/paged/NVFP4_TEST.md b/backend/cpp/llama-cpp/paged/NVFP4_TEST.md deleted file mode 100644 index 37817617b..000000000 --- a/backend/cpp/llama-cpp/paged/NVFP4_TEST.md +++ /dev/null @@ -1,114 +0,0 @@ -# NVFP4-dense on DGX Spark (GB10, sm_121): is it the quality-preserving FP4 win MXFP4 wasn't? - -Test rig: DGX Spark GB10 (sm_121), `~/llama.cpp-pr24423/build` (PR #24423, FP4 MMA + NVFP4 -kernel), wikitext-2-raw, clean BF16 source `q3-4b-bf16.gguf` (the same source used for the -established MXFP4 / Q4_K fair test). NVFP4 and all comparison quants were produced clean from -BF16, no imatrix. - -## Verdict (short) - -YES on all the load-bearing questions, with one honest caveat: - -1. llama.cpp CAN produce an NVFP4 GGUF. -2. NVFP4 quality is Q4_K-class, NOT MXFP4-class: +7.4% PPL vs BF16 (MXFP4 was +30.8%). It is - slightly behind Q4_K (+4.8% relative) but in the same ballpark, not on the quality cliff. -3. NVFP4 routes onto the FP4 MMA kernel and gets the FP4 prefill speedup: ~1.29x Q4_K on the - 4B, tracking MXFP4 to within 5% (MXFP4 hit 1.58x on the 32B; NVFP4 should track it there too). -4. Output is coherent. - -Bottom line: NVFP4-dense IS the quality-preserving FP4 win MXFP4 wasn't. It delivers -essentially the full FP4 prefill speedup at roughly Q4_K quality, where MXFP4 paid a 27% quality -tax for the same speed. LocalAI can support/recommend NVFP4-dense on Blackwell for prefill-bound -workloads, with the caveat that it is marginally (~5%) behind Q4_K on perplexity; an imatrix-guided -NVFP4 quant would likely close most of that remaining gap. - -## 1. Feasibility: can llama-quantize produce an NVFP4 GGUF? YES - -- The type exists with a full quantize path, not just a kernel: - - `GGML_TYPE_NVFP4 = 40` (`ggml.h`), `GGML_FTYPE_MOSTLY_NVFP4 = 26` - - `quantize_nvfp4` / `quantize_row_nvfp4_ref` / `dequantize_row_nvfp4` registered in `ggml.c` - - type_name is `"nvfp4"`, block `QK_NVFP4` (per-16 FP8/E4M3 block scale + global scale) -- NVFP4 is NOT a top-level `llama-quantize` ftype (no `NVFP4` entry in the allowed-types list, - no reference in `tools/quantize/quantize.cpp` or `src/llama-quant.cpp`), BUT - `--tensor-type name=nvfp4` resolves it: `parse_ggml_type` matches the arg against - `ggml_type_name(...)`, which returns `"nvfp4"`. This is the exact same mechanism that produced - MXFP4-dense. -- Recipe used (mirrors the MXFP4-dense GGUF byte-for-byte in structure: token_embd Q8_0, all - norms F32, all 2D attn+ffn weights to FP4): - - ``` - llama-quantize --tensor-type "attn=nvfp4" --tensor-type "ffn=nvfp4" \ - q3-4b-bf16.gguf q3-4b-nvfp4.gguf Q8_0 - ``` - - Result: `q3-4b-nvfp4.gguf`, 2343.93 MiB, 4.89 BPW, ~5 s. (MXFP4-dense was 2350 MiB; same shape.) - Every `blk.N.attn_*` and `blk.N.ffn_*` reported `converting to nvfp4`; token_embd Q8_0; norms F32. - -The on-box `~/bench/q3-32b-nvfp4*` dirs are vLLM HF safetensors (already 4-bit), not GGUF, and -do not feed llama.cpp - confirmed and irrelevant. - -## 2. Quality (decisive): NVFP4 is Q4_K-class, not MXFP4-class - -`llama-perplexity -f wiki.test.raw --chunks 50 -c 512 -ngl 99`, all clean from the same BF16 4B: - -| Quant | PPL | vs BF16 | vs Q4_K | -|---------|--------|----------|----------| -| BF16 | 13.32 | - | - | -| Q4_K_M | 13.66 | +2.6% | - | -| NVFP4 | 14.31 | +7.4% | +4.8% | -| MXFP4 | 17.42 | +30.8% | +27.6% | - -(NVFP4 measured this run: Final PPL = 14.3097 +/- 0.4457.) - -NVFP4 lands much closer to Q4_K (gap 0.65 PPL) than to MXFP4 (gap 3.11 PPL). MXFP4's finer -sibling delivers: the two-level scaling (per-16 FP8 block scale + global scale) recovers almost -all of the quality MXFP4's coarse per-32 E8M0 scale threw away. It is not quite Q4_K, but it is -firmly in the "acceptable 4-bit" regime, not the lossy one. - -## 3. Speed: NVFP4 routes onto the FP4 MMA kernel - -No clean BF16 32B was on the box (only the vLLM NVFP4 safetensors and the Q4_K/MXFP4 32B GGUFs), -so per the brief this is the 4B speed signal - a 3-way cold A/B on the SAME 4B model, 45 s -cooldowns between runs (`-npp 512 -ntg 128 -npl 8,32,64 -b 2048 -ub 2048 -ngl 99`): - -Prefill S_PP (t/s): - -| B | Q4_K | NVFP4 | MXFP4 | NVFP4 / Q4_K | NVFP4 / MXFP4 | -|-----|--------|--------|--------|--------------|---------------| -| 8 | 4862 | 6313 | 6602 | 1.30x | 0.96x | -| 32 | 5020 | 6497 | 6836 | 1.29x | 0.95x | -| 64 | 5031 | 6490 | 6831 | 1.29x | 0.95x | - -- NVFP4 prefill is within ~5% of MXFP4 at every batch size -> both land on the same FP4 MMA - kernel. NVFP4 does NOT fall back to a slow path. -- NVFP4 beats Q4_K's int8-MMQ prefill by ~1.29x on the 4B. The established 32B figures were - Q4_K S_PP ~767 and MXFP4 ~1209 (1.58x); since NVFP4 tracks MXFP4 to within 5%, NVFP4 on the - 32B should likewise approach ~1.5x. (The 4B shows a smaller multiplier than the 32B because a - smaller model spends proportionally less time in the matmul the FP4 kernel accelerates.) -- Token-gen (S_TG) is comparable across all three (memory-bound), as expected. - -## 4. Coherence - -`llama-simple` (llama-cli hangs - avoided), NVFP4 4B: -- "The capital of France is" -> "...Paris. ...Germany is in Berlin. ...Italy is in Rome. - ...Spain is in Madrid. ...Netherlands is in Amsterdam." (all correct) -- "Q: What is 17 plus 25? A:" -> "42." (correct) - -Coherent and factually accurate. - -## Recommendation for LocalAI on Blackwell - -Support and recommend NVFP4-dense as the FP4 prefill option on Blackwell (sm_120/121), produced -via `--tensor-type attn=nvfp4 --tensor-type ffn=nvfp4` over a BF16 source (token_embd Q8_0, -norms F32). It gives ~the full FP4 prefill speedup (FP4 MMA kernel, ~1.3x Q4_K on 4B and -expected ~1.5x on larger models) at roughly Q4_K quality (+7.4% PPL vs BF16). This is the win -MXFP4 failed to deliver: MXFP4 paid a +30.8% quality tax for the same speed and was rejected. - -Caveats / follow-ups: -- NVFP4 is still ~4.8% behind Q4_K on PPL. For quality-first deployments where the prefill win - does not matter, Q4_K_M remains the better pick. -- These NVFP4/Q4_K numbers are clean (no imatrix). An imatrix-guided NVFP4 quant is the obvious - next step and would likely close most of the remaining gap to Q4_K - worth measuring before a - blanket recommendation. -- A direct 32B NVFP4-vs-Q4_K speed run (needs a clean BF16 32B GGUF, not on the box) would - confirm the projected ~1.5x; the 4B signal plus the MXFP4-tracking already make this very likely. diff --git a/backend/cpp/llama-cpp/paged/PAGED_KV_HIGH_CONCURRENCY.md b/backend/cpp/llama-cpp/paged/PAGED_KV_HIGH_CONCURRENCY.md deleted file mode 100644 index cb14f8221..000000000 --- a/backend/cpp/llama-cpp/paged/PAGED_KV_HIGH_CONCURRENCY.md +++ /dev/null @@ -1,115 +0,0 @@ -# Paged KV at high concurrency on a single GB10 - the datacenter-scale test - -Closes the open question left by `PR22569_EVAL.md`: that eval could not test the -"paged KV unlocks thousands of sequences" thesis because **both** KV paths hit the -`LLAMA_MAX_SEQ=256` compile cap, and the 32B-dense model it used is compute-bound -(plateaus by npl=128 for an unrelated reason). This run removes both confounders: -**recompiled `LLAMA_MAX_SEQ=2048`** and used a **bandwidth-bound model (Qwen3-1.7B-Q8_0)** -where decode aggregate is free to keep climbing with concurrency. - -Hardware: NVIDIA GB10 (sm_121, 119 GiB unified LPDDR5X, ~273 GB/s). Build: -`~/llama.cpp-pr22569` (PR #22569 paged path + the reshape fix), `LLAMA_MAX_SEQ=2048`, -sm_121 Release. Contiguous = `llama-batched-bench` (unified KV) `S_TG`. Paged = -`llama-paged -kvp --fit off` `aggregate tps`. `npp=16, ntg/n_predict=128, b=ub=2048, --ngl 99`. Cold runs, 12 s cooldowns. - -## TL;DR for the decision - -**On a single GB10, paged KV does NOT deliver a throughput or concurrency win - the -aggregate-decode ceiling is set by the hardware, not the KV layout, and contiguous KV -already reaches it.** Measured across two model regimes and concurrency up to 2048 -sequences: - -- Aggregate decode **plateaus** once the GPU saturates - for both KV layouts: - - 32B-dense (compute-bound): ~540 t/s, flat from npl=128 (prior eval). - - 1.7B (bandwidth-bound): ~3,200-3,700 t/s, flat from npl=512 (this run). -- Paged and contiguous land at the **same ceiling**; PR #22569's paged op was 12-13% - *slower* than the mature contiguous flash-attention path at equal concurrency on 32B. -- Pushing concurrency past the plateau is **actively harmful to UX**: per-sequence - throughput collapses (23 -> 1.9 tok/s) and TTFT explodes (0.6 s -> 4.3 s avg, **64 s - max**) while aggregate stays flat. - -**vLLM's ~24k aggregate headline is unreachable on a single GB10 with these models -regardless of KV layout** - it needs aggregate memory bandwidth / compute that one GB10 -does not have (i.e. many GPUs). Paged KV is a **memory-capacity / anti-fragmentation / -prefix-sharing** feature, not a single-node throughput-ceiling feature. The static -single-model benchmark deliberately does not create the memory-pressure regime where -paging pays off, which is exactly why no win appears. - -## The numbers - -### Aggregate decode vs concurrency, Qwen3-1.7B-Q8_0 (bandwidth-bound), `LLAMA_MAX_SEQ=2048` - -| npl | contiguous `S_TG` (t/s) | paged `aggregate tps` (t/s) | paged per-seq tps | paged TTFT avg / max | -|----:|------------------------:|----------------------------:|------------------:|---------------------:| -| 128 | 2,643 | 2,887 | 23-25 | - | -| 256 | 2,925 | - | - | - | -| 512 | 3,215 | 3,637 | 7.2-7.8 | 0.57 s / 0.90 s | -| 1024 | 3,118 | 3,695 | 3.7-4.2 | 1.17 s / 2.37 s | -| 2048 | (not run) | 3,608 | 1.9-14.6 | 4.28 s / **63.8 s** | - -Both paths flatten by npl~512. 8x more concurrency (128->1024) buys contiguous only -**+18%** and paged **+28%**, then both stop. (The two tools meter slightly differently - -`llama-paged` aggregate vs `batched-bench` decode-only `S_TG` - so the small paged-vs- -contiguous offset is not a real paged advantage; the prior apples-to-apples 32B eval had -paged 12-13% *behind*.) - -### Why it plateaus (the hardware ceiling, not the KV layout) - -Decode is memory-bandwidth-bound: each step reads the model weights once and shares that -read across the whole batch. Once concurrency is high enough that the shared weight-read -is amortized, the per-step cost is dominated by KV reads + attention + host work, none of -which paging makes cheaper. The GB10's ~273 GB/s sets the floor; at the plateau the GPU -is ~saturated. Adding sequences past that point cannot raise aggregate - it only divides -the same throughput across more users (per-seq tps falls, TTFT rises). The 32B-dense case -plateaus even earlier (npl=128) because it saturates on **compute** (weight matmuls), not -bandwidth - the kernel decomposition is in `VLLM_DECOMPOSITION.md`. - -## What paged KV is actually for (the honest, deliverable value) - -Paging never helps a static, uniform-length, single-model benchmark on a GPU with memory -to spare - there is no fragmentation and no over-reservation to reclaim. Its real wins, -which require the regime this hardware+benchmark does not exercise, are: - -1. **Concurrent-tenant capacity under memory pressure.** Block KV fits more *diverse* - in-flight sequences (variable, dynamically arriving/leaving contexts) without the - contiguous path's per-slot reservation/fragmentation. Pays off when KV memory, not - compute/bandwidth, is the binding constraint - i.e. at multi-GPU datacenter scale or - with very long/variable contexts. -2. **Cross-request prefix sharing.** A chained-hash block cache shares identical system - prompts / RAG preambles across requests (vLLM's `block_pool.py` + block-hash map). A - real token-budget win for shared-prefix workloads; PR #22569 defers this to a - non-existent Phase 2 (our from-scratch P0 has the machinery). - -These are measured as **max concurrent distinct tenants** and **KV memory saved**, not as -aggregate tok/s on one model. They do not move the single-GB10 throughput ceiling. - -## Recommendation - -- **Do not pitch paged KV as a single-GB10 throughput lever** - it is measured flat to - the contiguous ceiling (and PR #22569 is slower). Doing so would not survive a - benchmark. -- **The single-GB10 throughput story is already strong without paging:** llama.cpp is - ahead of vLLM single-stream (MXFP4 1153 > 800) and at ~70-81% of vLLM aggregate at - npl<=128 with a near-identical batching multiplier (`VLLM_DECOMPOSITION.md`). Ship the - MXFP4/NVFP4-dense prefill win (`NVFP4_TEST.md`) - that is the cheap, real, defensible - Blackwell number. -- **If datacenter-scale (thousands of concurrent tenants) is the genuine target,** the - lever is **multiple GPUs** plus paged KV's **capacity + prefix-sharing** features - - framed and measured as concurrent-tenant capacity and KV memory saved, on a - variable-context / shared-prefix workload. A single GB10 cannot produce the ~24k - aggregate regardless of KV layout; that is a fleet-level result. - -## Reproduction (DGX, `~/llama.cpp-pr22569`, `LLAMA_MAX_SEQ=2048`) - -```sh -M=~/bench/draft17/Qwen3-1.7B-Q8_0.gguf -# contiguous -for NPL in 128 256 512 1024; do - ./build/bin/llama-batched-bench -m $M -npp 16 -ntg 128 -npl $NPL -ngl 99 \ - -b 2048 -ub 2048 -fa on -c $((NPL*160)); done -# paged -for NPL in 512 1024 2048; do - ./build/bin/llama-paged -m $M -kvp --fit off -ngpub 32768 -ncpub 128 \ - -np $NPL -ns $NPL -n 128 -b 2048 -ub 2048 -ngl 99; done -``` diff --git a/backend/cpp/llama-cpp/paged/PAGED_KV_TARGET_READINESS.md b/backend/cpp/llama-cpp/paged/PAGED_KV_TARGET_READINESS.md deleted file mode 100644 index 3733bb300..000000000 --- a/backend/cpp/llama-cpp/paged/PAGED_KV_TARGET_READINESS.md +++ /dev/null @@ -1,170 +0,0 @@ -# Paged KV: target-readiness (correctness, dynamic benchmark, 2xH200 projection) - -Target hardware: **~2x H200** (281 GB HBM3e total, ~4.8 TB/s per GPU). The GB10 box is -the *test* rig, not the target - and several earlier "no win" findings are GB10-specific -artifacts (low bandwidth caps throughput before KV memory ever binds). This document -delivers the three things needed to push paged KV toward the real target: - -1. **Correctness** of the paged path - verified (and a blocking bug found + fixed). -2. **A dynamic-load benchmark** that actually exercises where paging wins (`paged-loadgen.cpp`). -3. **A projection** of the paged-KV payoff on 2x H200, grounded in measured GB10 numbers. - ---- - -## 1. Correctness: PASS (after fixing the auto-fit OOM) - -`test-paged-kv-e2e` checks the paged decode path against the contiguous reference -(greedy argmax + top-5 set overlap >= 4). On the box it was previously **unverified** - -it aborted at context creation. Root cause found: - -- `common_fit_paged_kv_blocks` (`common/common.cpp:1144`) **unconditionally overrides** - `n_gpu_blocks` from `ggml_backend_dev_memory`, which **over-reports free VRAM on the - GB10 integrated/unified device** (it sized **~245 GB of KV on a 119 GB box** -> - `cudaMalloc` OOM -> `GGML_ASSERT` abort in `llama-kv-cache-paged.cpp:74`). The test's - explicit `n_gpu_blocks=64` was being clobbered because `params.fit_params` defaults on. - -**Fix (item-1 patch, applied on the box):** - -```diff ---- a/tests/test-paged-kv-e2e.cpp -+++ b/tests/test-paged-kv-e2e.cpp -@@ run_paged() - params.kv_paged = true; -+ params.fit_params = false; // honor explicit n_gpu_blocks; GB10 dev_memory over-reports free VRAM - params.n_gpu_blocks = 64; -``` - -**Result (Qwen3-0.6B-Q8_0, GB10):** - -``` -test-paged-kv-e2e: top-5 argmax match: ref=3743 paged=3743 -test-paged-kv-e2e: top-5 set overlap: 5/5 (require >= 4) -test-paged-kv-e2e: PASSED -``` - -The paged op is **numerically greedy-equivalent to the contiguous path**. The reshape -bug from `PR22569_EVAL.md` (decoupled head_dim) is already applied in the checkout. - -**Target-readiness caveat (the durable fix, not just the test):** the auto-fit itself is -brittle and must be hardened before it runs on a real serving box - even though -`ggml_backend_dev_memory` reports correctly on a discrete H200, the function should still -(a) early-return when `!params.fit_params`, (b) **clamp** the computed `n_gpu_blocks` so -`n_gpu_blocks * block_bytes <= free_vram - margin` using the *actual* KV element size, and -(c) not override an explicitly-set value. One-screen change in `common_fit_paged_kv_blocks`. - ---- - -## 2. Dynamic-load benchmark - `paged-loadgen.cpp` - -**Why the existing tools show no paged win:** `llama-batched-bench` and the stock -`examples/paged/paged.cpp` both run **fixed-length, all-arrive-at-once, single-prompt** -load. That has no over-reservation and no fragmentation, so contiguous KV is already -memory-optimal and paging has nothing to reclaim (`PAGED_KV_HIGH_CONCURRENCY.md`). The -paged win only exists under **variable lengths + continuous arrival + shared prefixes** - -the real serving regime. No tool in the tree creates it. - -`paged-loadgen.cpp` (committed here) does, via the confirmed `llama_paged_scheduler_*` -API: - -- **shared system prefix** (`LG_PREFIX` tokens) prepended to every request -> exercises - cross-request prefix sharing, -- **variable prompt length** (`LG_SUFMIN..LG_SUFMAX` unique suffix), -- **bimodal generation length** (`LG_GENLONG` for `LG_LONGPCT`% of requests, else - `LG_GENSHORT`) - the over-reservation driver, -- **continuous arrival**: keeps `LG_INFLIGHT` requests live, admitting a new one each time - one finishes. - -It reports the load-bearing number for the buy decision - the **capacity ratio**: - -``` -paged peak KV = sum over live seqs of ceil(used/block)*block * kv_bytes_per_token -contiguous reserve = peak_inflight * max_ctx * kv_bytes_per_token (worst-case per slot) -CAPACITY RATIO = contiguous_reserve / paged_peak (+ prefix sharing on top) -``` - -`kv_bytes_per_token = 2 * n_layer * n_head_kv * head_dim * sizeof(f16)` - confirmed against -`llama-kv-cache-paged.cpp` (e.g. Qwen3-32B: 2*64*8*128*2 = **256 KiB/token**). - -**How to run (on the target):** drop into PR #22569's `examples/paged/`, add to its -CMakeLists next to `llama-paged`, build, then e.g. -`LG_INFLIGHT=2048 LG_LONGPCT=15 paged-loadgen -m -kvp --fit off -ngpub -ncpub -ngl 99`. -Sweep `LG_INFLIGHT` to the throughput plateau and read the capacity ratio at that point. -It is written to run on the target (2x H200) where the regime exists; on GB10 it runs but -the ratio is uninteresting because throughput plateaus before memory binds (see below). - ---- - -## 3. Projection to 2x H200 (grounded in measured GB10 numbers) - -### Measured on GB10 (this work) - -| model | decode plateau (aggregate) | plateau concurrency | bound by | -|---|---|---|---| -| Qwen3-32B-Q4_K_M (dense) | ~540 t/s | npl ~128 | compute | -| Qwen3-1.7B-Q8_0 | ~3,200 t/s | npl ~512 | bandwidth | - -### Hardware ratios (per GPU, then 2x TP at ~85% scaling) - -| | GB10 | H200 | per-GPU x | 2x H200 (TP) x | -|---|---|---|---|---| -| mem bandwidth | 273 GB/s | ~4.8 TB/s | 17.6 | ~30 | -| BF16 compute | ~213 TFLOP | ~989 TFLOP | 4.6 | ~8 | -| HBM | 119 GB | 141 GB | 1.18 | 2.4 (281 GB) | - -Decode is bandwidth-bound, so **both the aggregate ceiling and the concurrency at which it -is reached scale with bandwidth (~30x on 2x H200)**: - -- **32B-dense aggregate decode ceiling:** 540 x 30 ~= **16,000 t/s**, reached at - ~128 x 30 ~= **3,800 concurrent sequences**. - -### Why paged KV becomes the binding lever on 2x H200 (and didn't on GB10) - -To reach that ~16k t/s ceiling you must hold **~3,800 sequences** of KV. The memory math: - -- 32B weights (FP8) ~= 32 GB, sharded over 2 GPUs -> ~250 GB HBM free for KV. -- 32B KV = 256 KiB/token. At an avg held context of 2,000 tokens, **per seq = 512 MiB**. -- Contiguous unified KV (reserve for the live set) fits ~250 GB / 512 MiB ~= **~490 - sequences** - **8x short of the 3,800 needed to reach the throughput ceiling.** - -So on 2x H200 **KV memory is the binding constraint at the throughput-optimal concurrency**, -and contiguous KV strands most of the bandwidth (you'd run at a fraction of 16k t/s). This -is the gap paged KV closes. On GB10 it never appeared because GB10's 30x-lower bandwidth -caps decode at npl ~128, whose KV fits in memory trivially - the constraint order is -inverted on the real target. - -### Magnitude of the paged win - -Paging recovers concurrency two ways, both multiplicative on achievable throughput: - -1. **No over-reservation.** Contiguous must back `max_ctx` per slot; paging uses - `ceil(actual/block)`. For a realistic bimodal workload (most generations short, ~15% - long, prompts ~512) the average held context is several-fold below `max_ctx` -> - `paged-loadgen` capacity ratio typically **~4-10x** (it measures the exact number for - your workload's length distribution). -2. **Cross-request prefix sharing** of shared system prompts / RAG preambles - additional, - workload-dependent (chained-hash block cache; vLLM's `block_pool.py`). - -Net: on 2x H200, paged KV is plausibly the difference between serving **~500 and ~3,800** -concurrent 32B sequences in HBM, i.e. between a fraction of and ~all of the **~16k t/s** -decode ceiling. **That is the datacenter payoff, and it is real on the target even though -GB10 cannot exhibit it.** - -### Honest caveats for the buy case - -- These are **projections** from GB10 + spec ratios; the capacity multiplier depends on the - workload's context-length distribution (more variable -> bigger paged win) and TP - efficiency. `paged-loadgen` measures it directly once you have target-GPU time. -- The **paged op itself still needs work**: PR #22569's `ggml_paged_attn` was 12-13% - *slower* than the mature contiguous flash-attention path at equal concurrency - (`PR22569_EVAL.md`), lacks prefix sharing (deferred to a non-existent Phase 2), and has - the fit-robustness bug above. Adopting paged KV for the target means either hardening - #22569 or finishing the from-scratch P4 - the capacity win above assumes a *correct, - competitive* op, which is the remaining engineering. -- Prefill on either KV layout is compute-capped, not a paged concern. - -**Bottom line for the decision:** paged KV **is** the right lever for the 2x H200 target - -the GB10 "no win" result is a bandwidth artifact, not a verdict. The paged path is now -**correctness-verified**, the **benchmark to size the win exists**, and the projection -says the payoff is **~5-10x concurrent-tenant capacity -> several-fold higher aggregate -decode** on the target. The remaining work is hardening/finishing the paged op, not -proving the thesis. diff --git a/backend/cpp/llama-cpp/paged/PHASED_VLLM_PARITY_PLAN.md b/backend/cpp/llama-cpp/paged/PHASED_VLLM_PARITY_PLAN.md deleted file mode 100644 index df1b79131..000000000 --- a/backend/cpp/llama-cpp/paged/PHASED_VLLM_PARITY_PLAN.md +++ /dev/null @@ -1,55 +0,0 @@ -# Making llama.cpp/LocalAI a viable vLLM alternative — phased plan - -Goal: close the practical gap to vLLM for both single-user *speed* and multi-user *throughput*, while keeping -quality (no lossy quant). Grounded in measured benchmarks + research (`BENCHMARKS.md`, `BLACKWELL_KERNEL_GAPS.md`, -`VLLM_THROUGHPUT_GAP.md`). The gap is NOT one thing — each phase targets a distinct, independent lever. - -## Where vLLM actually leads (measured, GB10 / Qwen3-32B) - -- **Single-user decode:** ~parity (10.2 vs 11.7) — bandwidth-bound. vLLM's edge is **spec-dec** (lossless). -- **Multi-user decode:** gap grows to ~2.2× at B=64 (kernel + scheduler). -- **Prefill aggregate:** llama plateaus ~765, vLLM scales to 24k — **paged KV + chunked prefill + kernel**. -- Note: on GB10 vLLM's FP4 trump card is *broken* (falls back to Marlin); llama.cpp runs reliably — a real - viability point. vLLM is structurally ahead mainly via **paged KV, chunked prefill, cross-request prefix cache**. - -## Phases - -### Phase 1 — Hardware-tuned config (PR #10411) — DONE -Folded into the hardware-defaults path (`core/config/hardware_defaults.go`): -- Blackwell physical batch (n_ubatch) = 2048. -- **VRAM-scaled `n_parallel` default** (>=32GiB→8, >=8→4, >=4→2): turns on concurrency + continuous batching, - which the backend leaves OFF at its `n_parallel=1` default. Unified KV → slots share the budget (no extra - KV memory). Single-host (local GPU) + distributed router (per node). Already-good defaults confirmed: - flash-attn=auto, context=4096. - -### Phase 2 — Paged / block KV cache ← biggest structural multi-user lever -vLLM's PagedAttention lifts KV utilization ~20-38% → ~96%. llama.cpp's own A10G data (draft PR #22569): -contiguous OOMs at 26 seqs / 496 t/s → paged 247 seqs / 1256 t/s (**~9.5× concurrency, 2.5× aggregate**). -- Build on / complete **upstream draft PR #22569** (`-kvp`, block manager + paged-attn ggml op, FCFS scheduler) - rather than the from-scratch series we prototyped (`paged/`). Our CPU-verified block manager + gather-read - design informs the review/port; the upstream momentum is the place to land it. -- Phase 2b: cross-request prefix sharing (block-hash dedup) — our `PagedKVManager` already implements it. - -### Phase 3 — Prefill amortization (chunked prefill + n_batch/n_ubatch split) -llama aggregate prefill plateaus because (a) one prompt saturates compute, (b) the per-forward GEMM M-dim is -capped at `n_ubatch`=512, (c) no scheduler chunked prefill (draft #10718 abandoned). -- Split logical `n_batch` from physical `n_ubatch` (LocalAI ties them today) so concurrent prefills batch into - a larger logical batch while keeping ubatch at the Blackwell sweet spot (2048). -- Chunked prefill + prefill/decode co-batching in the server slot scheduler. - -### Phase 4 — Batched-GEMM kernel tuning (the decode 2.2× + prefill height) -Per `BLACKWELL_KERNEL_GAPS.md`: dense int8-MMQ at ~21% of ceiling, MoE FP4-MMA at ~5%. Both untuned for -Blackwell. To MATCH: tune MMQ or a Marlin-style W4A16 BF16 GEMM (FP4 not required — GB10 is INT8==BF16). To -BEAT (2×): fix+tune the existing FP4-MMA on sm_121 (build-flag/`-O3`-miscompile, not greenfield). - -### Phase 5 — Backend GPU sampling -CPU per-sequence sampling caps GPU util ~60% beyond n_parallel ~8-16 (upstream PR #17004). Track/adopt. - -### Cross-cutting — Speculative decoding (single-user speed, quality-preserving) -Dense ≥14B: lossless ~1.8-3×. llama.cpp has `-md`/`--spec-draft-*`. Wire a draft-model field in the model -config + ship Qwen3 target+draft (1.7B) pairs in the gallery. NOT for MoE-A3B (nothing to amortize). - -## Sequencing rationale -Phase 1 (config) ships now — biggest immediate multi-user win for zero kernel work (concurrency was OFF). -Phase 2 (paged KV) is the highest-leverage structural build and has upstream momentum. Phases 3-4 are deeper -(scheduler + kernel). Spec-dec is independent and can land any time for single-user speed. diff --git a/backend/cpp/llama-cpp/paged/PR17004_EVAL.md b/backend/cpp/llama-cpp/paged/PR17004_EVAL.md deleted file mode 100644 index 7ca9f0bb9..000000000 --- a/backend/cpp/llama-cpp/paged/PR17004_EVAL.md +++ /dev/null @@ -1,90 +0,0 @@ -# PR #17004 (backend / GPU sampling) evaluation on DGX Spark (GB10, sm_121) - -Date: 2026-06-21. Hardware: NVIDIA GB10 (GB10, sm_121), CUDA 13.0, cmake 3.28. -Model: `Qwen3-32B-Q4_K_M.gguf`. LocalAI pin: `LLAMA_VERSION=f3e182816421c648188b5eab269853bf1531d950` (2026-06-17). - -## TL;DR (clean negative) - -1. **PR #17004 is MERGED and is ALREADY present in our pinned llama.cpp `f3e1828`.** There is nothing to apply / cherry-pick / patch. The `-bs/--backend-sampling` CLI arg, the `llama_set_sampler` / `llama_get_sampled_*` API, and the GPU argsort/top-k/cumsum/softmax kernels are all in the pin. -2. **The prescribed benchmark cannot test the fix.** `llama-batched-bench` does ZERO sampling - it feeds random tokens (`std::rand() % n_vocab`). Its ~540 t/s plateau is therefore **not** sampling-bound, and enabling backend sampling does nothing to it. The valid tool is `llama-batched` (examples/batched), which the PR updated to drive per-sequence sampler chains and which actually exercises `-bs`. -3. **In a controlled real-sampling A/B (same `llama-batched` harness, CPU vs GPU sampler), GPU sampling gave only +25% at np=32, +3% at np=64, and CRASHED (`GGML_ASSERT(obj_new)`, graph-context alloc) at np=128 and np=256** - exactly the multi-user regime the investigation cares about. -4. **nsys at np=64: GPU kernel profile and GPU-busy time are essentially identical with and without the fix** (CPU 392.5 t/s / GPU 404.2 t/s; total GPU kernel+memop time ~4.05 s in both). Sampling kernels do not even appear among the top GPU contributors. GPU utilization did **not** rise. -5. **Conclusion: PR #17004, in the state shipped by our pin, does NOT break the ~540 plateau and does not move decode aggregate toward the ~2700 GPU-bound ceiling or past vLLM's 667.** It is modest at low parallelism and unusable (crash) at the high parallelism in question. The PR's own guidance ("recommended `--parallel 1`", "will take time to mature") matches what we measured. - -## 1. What PR #17004 does + state - -- Title: "sampling : add support for backend sampling". **State: MERGED** into `master` (PR head branch `gpu-sampling`). 44 files, +4133/-296. -- `libllama`: new `llama_context_params.samplers` / `n_samplers`, `llama_set_sampler`, `llama_get_sampled_*`, `llama_sampler_seq_config`, updated `llama_sampler_i`. Sampler chain can now run inside the compute graph on the backend (GPU) instead of on the CPU after `llama_decode`. -- CUDA: optimized/new `argsort`, `top-k`, `cumsum`, `softmax` kernels; CMake option `-DGGML_CUDA_CUB_3DOT2=ON` (builds a CCCL v3.2 prerelease for faster top-k). -- Tools: new `-bs, --backend-sampling` arg in `common/arg.cpp` (line 1921); server (`server-context.cpp`) per-slot wiring; `examples/batched/batched.cpp` updated. -- Supported backend samplers: `top-k`, `top-p`, `min-p`, `temp` (+ dist). **Limitations (from the PR): not compatible with grammar sampling; single output per sequence per batch; no save/load of sampling state; recommended only with `--parallel 1` and CUB_3DOT2.** Open follow-ups: #18547 (avoid graph reallocations), #18550 (skip inactive samplers in parallel decode). -- It DOES target the CPU-side per-sequence sampling stall we hypothesised - the mechanism is correct. Maturity is the problem. - -Note: the GitHub API reports `mergedAt: 2026-01-04`, but the PR contains June 2026 upstream-merge commits and the feature is verified present in our 2026-06-17 pin, so treat the date field as a metadata quirk. What matters: the code is in `f3e1828`. - -## 2/3. Apply + build - -No apply needed (already in pin). Built from a clean `git worktree` at `f3e1828` (`~/llama-pr17004`), to avoid disturbing the existing diffusion build: - -``` -cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \ - -DCMAKE_CUDA_ARCHITECTURES=121 -DLLAMA_MAX_SEQ=256 \ - -DGGML_CUDA_CUB_3DOT2=ON -DLLAMA_CURL=OFF -cmake --build build --target llama-batched llama-batched-bench -j20 -``` - -**Build: SUCCESS** (CUB_3DOT2=ON FetchContent fetched and compiled despite flaky net; sm_121; LLAMA_MAX_SEQ=256). `-bs/--backend-sampling` confirmed present in `llama-batched --help`. - -## 4. Decode aggregate: fix vs baseline vs vLLM - -### 4a. `llama-batched-bench` (NO sampling - reconfirms the plateau, unaffected by the fix) -`-npp 16 -ntg 128 -npl 32,64,128,256 -c 40960 -b 2048 -ub 2048` - -| npl | S_TG t/s | -|-----|----------| -| 32 | 241.8 | -| 64 | 395.1 | -| 128 | 542.6 | -| 256 | 567.2 | - -Reproduces the ~540 plateau. Because this tool never samples, `-bs` is irrelevant here - the plateau is decode/host-overhead-bound, not sampling-bound. - -### 4b. `llama-batched` real-sampling A/B (CPU sampler vs `-bs` GPU sampler, identical harness) -`-kvu -n 128 -np {32,64,128,256} -c 40960 --seed 1` (samplers: top-k 40 / top-p 0.95 / temp 0.8) - -| np | CPU sampling t/s | GPU `-bs` sampling t/s | delta | -|-----|------------------|------------------------|-------| -| 32 | 174.1 | 217.5 | +25% | -| 64 | 390.5 | 403.4 | +3.3% | -| 128 | 497.9 | **CRASH** `GGML_ASSERT(obj_new) ggml.c:1768` | - | -| 256 | 396.7 | **CRASH** `GGML_ASSERT(obj_new) ggml.c:1768` | - | - -(`llama-batched` absolute t/s is lower than `batched-bench` because it does real sampling plus per-token detokenize/string/stream work; the A/B *within* this harness isolates the sampler cost.) - -**Does the fix break the plateau? No.** GPU sampling helps only at low parallelism and the gain shrinks as np rises (+25% -> +3%), then the path crashes at np>=128 - i.e. it fails in exactly the multi-user regime where the plateau matters. It does not approach the ~2700 ceiling and does not pass vLLM's 667. The CPU-sampling curve itself peaks at np=128 (498) and *drops* at np=256 (397), confirming CPU sampling is a scaling wall - but PR #17004 as shipped does not lift it because the GPU path is unstable there. - -## 5. GPU-utilization mechanism (nsys, np=64, the highest np where `-bs` survives) - -`nsys profile -t cuda ... -n 96 -np 64` - -| mode | decode t/s | total GPU kernel+memop time | top GPU contributors | -|------|-----------|------------------------------|----------------------| -| CPU sampling | 392.5 | ~4.07 s | mul_mat_q (55%+17%), flash_attn (5.7%), mul_mat_vec (2%) | -| GPU `-bs` | 404.2 | ~4.04 s | identical set; sampling kernels not in top contributors | - -GPU-busy time and the kernel mix are **essentially unchanged** between modes. The argsort/top-k/cumsum/softmax sampling kernels are negligible in the timeline; the only visible difference is H2D memcpy *instances* rising 1,495 -> 7,076 (pinned-memory sampler transfers) at ~unchanged total memcpy time. **GPU utilization did not rise.** This directly refutes the idea that, at this workload, the GPU idle is dominated by CPU sampler arithmetic - moving the sampler onto the GPU barely changed throughput (+3%) and did not raise GPU occupancy. The ~80% idle measured elsewhere is dominated by something other than the sampler math (host-side batch construction / synchronization / detokenize), which PR #17004 does not address. - -(np=256 nsys "with fix" could not be captured: `-bs` aborts there. Fixing the crash needs the unmerged follow-ups #18547/#18550, not in our pin.) - -## LocalAI adoption path - -**The code arrives transparently with a version bump; enabling it is not transparent.** - -- `backend/cpp/llama-cpp/prepare.sh` copies all of upstream `llama.cpp/tools/server/*` (including the #17004-modified `server-context.cpp` / `server-task.cpp` / `server-common.cpp`) into `tools/grpc-server/`, and `grpc-server.cpp` `#include`s them. So once `LLAMA_VERSION` points at a commit containing #17004 (our pin `f3e1828` already does), the backend-sampling machinery compiles into `grpc-server` automatically. **No vendored patch in `patches/` is required for the code.** -- The vendored `server-context.cpp` already does the per-slot wiring (around line 1615): `backend_sampling &= task.params.sampling.backend_sampling`, also disabled for speculative decode and for pre-sampling logits (`n_probs>0`), then `llama_set_sampler(ctx_tgt, slot.id, common_sampler_get(slot.smpl))`. -- **But it is OFF unless `task.params.sampling.backend_sampling == true`.** LocalAI's `grpc-server` builds `params` itself from the gRPC request and never sets this flag (and does not pass the upstream `--backend-sampling` CLI arg). So as-is, LocalAI compiles the feature but never uses it. **A small grpc-server change is needed**: read a LocalAI model option / env and set `params.sampling.backend_sampling = true` (global or per-request). -- For performant CUDA top-k, add `-DGGML_CUDA_CUB_3DOT2=ON` to the llama-cpp CUDA `CMAKE_ARGS` in the Makefile (optional; a non-CUB fallback exists). -- **Caveats that blunt the benefit for LocalAI specifically:** grammar-constrained requests (JSON-schema / tool calls - a large share of LocalAI traffic), `logprobs`/`n_probs>0`, and speculative decoding all fall back to CPU sampling by the gating above; and the GPU path crashes at np>=128 in this pin. So even after wiring the flag, the multi-user throughput case would not benefit (and would crash) until the follow-up PRs (#18547/#18550) land and stabilise high-parallelism backend sampling. - -### Recommendation -Do **not** adopt PR #17004 as the multi-user throughput fix yet. It is already in the tree but is immature at the parallelism that matters (crashes at np>=128, modest gains below). The measured bottleneck at this workload is not the sampler arithmetic (nsys shows GPU-busy unchanged when sampling moves to GPU). Re-evaluate after #18547/#18550 merge into a future pin; revisit the host-side decode/batch-construction overhead as the more likely real lever. diff --git a/backend/cpp/llama-cpp/paged/PR22569_EVAL.md b/backend/cpp/llama-cpp/paged/PR22569_EVAL.md deleted file mode 100644 index 32fbbe266..000000000 --- a/backend/cpp/llama-cpp/paged/PR22569_EVAL.md +++ /dev/null @@ -1,136 +0,0 @@ -# Evaluation: llama.cpp PR #22569 (paged KV cache, `-kvp`) on DGX Spark (GB10, sm_121) - -Question: is upstream draft PR #22569 the right base to give LocalAI vLLM-class -high-concurrency GPU throughput, or should we finish our own from-scratch P4 -(`backend/cpp/llama-cpp/paged/`)? - -Date: 2026-06-21. Hardware: NVIDIA GB10 (compute 12.1 / sm_121), 122502 MiB unified -memory, CUDA 13.0, gcc 13.3. Models: `Qwen3-32B-Q4_K_M.gguf` (18.4 GB, 64 layers, -n_head 64 / n_head_kv 8 / head_dim 128 / n_embd 5120) and `Qwen3-0.6B-Q8_0.gguf` for -the correctness gate. - -## TL;DR verdict: DO NOT adopt #22569. Finish our own P4. - -On GB10 with a 32B dense model, PR #22569 delivers **no throughput win and no concurrency -win** - it is ~12% *slower* than the existing contiguous path and hits the *same* -256-sequence ceiling. The "scale to thousands of sequences like vLLM" premise does not -hold for this PR or this hardware/model. On top of that it is broken out of the box, -wired to the wrong integration surface, and a contested draft. - -## 1. Builds? Correct? - -- **Builds: YES.** Cloned `matiaslin/llama.cpp@paged_attention` (PR #22569, single commit - `0b0f7bd...`, base = current master). Clean CUDA build for sm_121 - (`-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=121 -DCMAKE_BUILD_TYPE=Release`). - `llama-paged`, `llama-batched-bench`, `test-paged-kv`, `test-paged-kv-e2e` all link. - It is self-contained (ships its own CPU+CUDA `ggml_paged_attn` op) and does **not** - depend on the competing CUDA PR #17579 (ericcurtin, `--pagedattention`). - -- **Runs out of the box: NO.** `llama-paged -kvp` on Qwen3-32B *and* Qwen3-0.6B crashes - at context creation: - `build_attn(llm_graph_input_attn_kv_paged*) -> ggml_reshape_2d ->` - `GGML_ASSERT(ggml_nelements(a) == ne0*ne1)` (src/llama-graph.cpp:2556). Same crash with - `--fit off` (so it is the real graph, not just the memory probe). - **Root cause:** the paged path hardcodes `ggml_reshape_2d(cur, hparams.n_embd, ...)`, - wrong for any model where `n_head*head_dim != n_embd`. Qwen3 decouples head_dim: - 32B = 64*128 = **8192** vs n_embd 5120; 0.6B = 16*128 = **2048** vs 1024. The PR's - "qwen3 verified" claim does **not** hold against current Qwen3 GGUFs. Fix is ~1 line - (use the real attention width `cur->ne[0]*cur->ne[1]`); applied for the rest of the eval. - -- **`fit_params` (`-ngpub` auto-sizing) also crashed on GB10** in the same reshape path - during the device-memory probe (before the fix). After the reshape fix, paged - auto-fit works (sized 96624 GPU blocks on the 0.6B from 85 GiB free). - -- **Correctness after the reshape fix:** paged decode runs and produces **coherent** - output on Qwen3-32B (sensible mercury / miso-soup / Starry-Night answers across 128 and - 256 concurrent sequences), indicating the `ggml_paged_attn` op is functionally roughly - correct. PR's own greedy/top-K equivalence test (`test-paged-kv-e2e`, top-K argmax + - top-5 overlap >= 4 + first-4-token greedy match vs non-paged) on Qwen3-0.6B did - **not** reach a PASS/FAIL verdict on GB10: its paged auto-fit grabs ~88 GiB - (96531 blocks) and the run then stalls at cache init (a third GB10 fit-robustness - issue, distinct from the reshape bug). So the formal greedy-equivalence gate is - **unverified on this box**, but the qualitative evidence (coherent multi-sequence 32B - output with explicit small `-ngpub`) indicates the fixed op is roughly correct. This - does not change the verdict, which is decided by throughput below. - -## 2. Throughput: paged vs contiguous on GB10 (Qwen3-32B-Q4_K_M) - -Contiguous = `llama-batched-bench` (unified KV, continuous batching), S_TG decode tok/s. -Paged = `llama-paged -kvp --fit off` (its scheduler-driven continuous-batching loop), -`aggregate tps`. Both `npp~16, ntg/n_predict=128, n_batch=n_ubatch=2048, -ngl 99`. - -| npl | contiguous (S_TG t/s) | paged `-kvp` (agg t/s) | outcome | -|------|----------------------|------------------------|---------| -| 128 | **537** (S 553) | **477** | both run; paged ~12% slower | -| 256 | **541** (S 550) | **471** | both run; paged ~13% slower; neither gains over 128 | -| 512 | FAIL | FAIL | **both** die: `n_seq_max must be <= 256` | -| 1024 | FAIL | FAIL | **both** die: `n_seq_max must be <= 256` | - -### The decisive facts - -1. **PR #22569 does NOT lift the 256-sequence ceiling.** Both contiguous and paged fail - identically at npl 512/1024 with `n_seq_max must be <= 256` (llama.cpp's compile-time - `LLAMA_MAX_SEQ`). It is **not** an OOM - GB10 has 119 GiB and at npl=256 contiguous KV - is only 16 GiB. Paging gives **zero** concurrency headroom over contiguous here. The - "paged unlocks thousands of seqs" premise is false for this PR. - -2. **Paged is slower, not faster.** The fresh `ggml_paged_attn` op (477/471 t/s) loses to - the mature CUDA flash-attention contiguous path (537/541 t/s) by ~12-13% at equal - concurrency. The PR's A10G "2.5x" came entirely from contiguous OOMing at 26 seqs on a - 24 GiB card; that lever does not exist on GB10's 119 GiB. - -3. **The 32B dense model is compute-bound and plateaus by npl=128 on GB10.** Aggregate is - flat from 128->256 (contiguous 537->541; paged 477->471). Doubling concurrency buys - nothing because the GPU is already saturated on the 32B weight matmuls. Even if we - recompiled with a larger `LLAMA_MAX_SEQ`, aggregate would not climb - so vLLM-class - ~24k aggregate is **unreachable for 32B-dense on a single GB10 regardless of KV - layout**. The throughput gap to vLLM at this model/hardware is a compute/bandwidth - problem, not a KV-fragmentation problem. - -## 3. Verdict and reasoning: finish our own P4 - -**Do not adopt #22569 as the base.** Reasons: - -- **No win on target hardware.** Even fully completed, on GB10 + 32B it is slower than - what we already have and capped at the same 256 seqs. There is no throughput or - concurrency dividend to harvest here. -- **Wrong integration surface.** Paged is driven only by a brand-new parallel C API - (`llama_paged_scheduler_init/add_request/prepare_batch/get_batch_info/update/...`) and a - bespoke `examples/paged` loop. `-kvp`/`--kv-paged` is gated to `LLAMA_EXAMPLE_PAGED` - only - it is NOT wired into `llama-server`/`batched-bench`/`parallel`, i.e. NOT the path - LocalAI's grpc-server derives from. Adopting it means rewriting LocalAI's serving loop - around the new scheduler API. -- **Broken / restricted.** Crashes out of the box on all current Qwen3 (and any - decoupled-head-dim model); fit_params crashed; Phase-1 restrictions enforced at context - creation: single CUDA device, full offload only, `n_batch == n_ubatch`, no SWA - (gemma3/llama4/etc. unsupported), no CoW / prefix-caching, no - `seq_cp`/`seq_keep`/`seq_div`/`seq_add`, no state save/load. -- **Contested draft.** Unmerged; the author is openly asking maintainers whether the C - API is even the right design; maintainers are skeptical of paged for single-node use. - -**What P4 should actually target (re-scoped by this data).** The aggregate-throughput -gap to vLLM on a compute-bound dense model on one GB10 is not addressable by paged KV. -The durable, real LocalAI wins from paging are the ones our from-scratch P0 already -implements the machinery for and that #22569 explicitly omits: -- **on-demand KV sizing** (fit more *diverse* concurrent tenants without per-seq - over-reservation), and -- **automatic cross-tenant prefix sharing** (chained-hash block cache - shared system - prompts / RAG preambles), which #22569 defers to a non-existent Phase 2. - -Finish our own P4 (CPU gather-read + a CUDA gather-read) against these capacity/ -prefix-sharing objectives - measured as max concurrent *distinct* tenants and KV memory -saved, not single-model aggregate tok/s. To chase raw aggregate, the levers are lifting -`LLAMA_MAX_SEQ` and smaller/MoE models in memory-bandwidth-bound regimes - orthogonal to -paged attention. The ~1-line reshape fix found here (and the GB10 fit_params crash) are -worth upstreaming to #22569 regardless, but the PR is not our base. - -### Reproduction (DGX, `~/llama.cpp-pr22569`) -```sh -export PATH=/usr/local/cuda/bin:$PATH -# contiguous -./build/bin/llama-batched-bench -m Qwen3-32B-Q4_K_M.gguf -ngl 99 -npp 16 -ntg 128 \ - -npl 128 -c 20480 -b 2048 -ub 2048 # 256/512/1024 -> n_seq_max must be <= 256 -# paged (needs the src/llama-graph.cpp:2556 reshape fix: hparams.n_embd -> cur->ne[0]*cur->ne[1]) -./build/bin/llama-paged -m Qwen3-32B-Q4_K_M.gguf -kvp --fit off -ngpub 2048 -ncpub 128 \ - -np 128 -ns 128 -n 128 -b 2048 -ub 2048 -ngl 99 # 512/1024 -> n_seq_max must be <= 256 -``` diff --git a/backend/cpp/llama-cpp/paged/README.md b/backend/cpp/llama-cpp/paged/README.md deleted file mode 100644 index 77a600443..000000000 --- a/backend/cpp/llama-cpp/paged/README.md +++ /dev/null @@ -1,95 +0,0 @@ -# Paged Attention for llama.cpp (vLLM-parity), CPU-first - -A from-scratch port of vLLM V1's paged KV-cache model into the llama.cpp / ggml -world, built CPU-first and verified incrementally. The host-side block manager is -a faithful port of vLLM; the compute stays in ggml (no new op — the read path -gathers blocks with `ggml_get_rows` and feeds the existing attention ops). - -Design: `docs/superpowers/specs/2026-06-19-paged-attention-llamacpp-design.md` -Plan: `docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md` - -## Status - -| Phase | What | State | -|------|------|-------| -| P0 | vLLM-parity host block manager (`FreeBlockQueue`, `BlockPool`, `PagedKVManager`, chained-hash prefix cache) | ✅ verified — `make check`, 4/4 suites | -| P1 | ggml paged write/gather mechanism (`set_rows` by slot_mapping → `get_rows` gather) | ✅ verified — `make ggml-check`, non-contiguous blocks `[2,1,5]` round-trip + isolation | -| P2 (core) | attention over gathered paged KV matches independent host reference | ✅ verified — max abs err **7.5e-08** | -| P3 (partial) | capacity & prefix-sharing wins | ✅ measured — `make bench`: **9.2×** more concurrent seqs, **11.3×** less KV memory | -| **P3 (in-model placement)** | **paged, non-contiguous block KV placement in the real model** | ✅ **Gate 0 PASSED** — Qwen3-0.6B token-identical (`patches/0001-paged-kv-block-placement.patch`) | -| P4 (in-model compute) | gather-read (`build_attn_paged`, read only a seq's blocks) + win-2 throughput + multi-seq | ⛔ remaining | - -The design's central risk — *does paged (non-contiguous) KV produce correct attention?* — -is **retired at two levels**: (1) at the ggml-op level (P2, 7.5e-08 vs reference) and -(2) **in a real model** (P3): with KV physically scattered across permuted, non-contiguous -blocks (cells `0-15, 144-159, 32-47, …`), Qwen3-0.6B greedy generation is **token-for-token -identical** to the contiguous cache. Reproduce: - -```sh -# from backend/cpp/llama-cpp-fallback-build/llama.cpp (patch applied, CPU build) -B=build-cpu/bin/llama-simple; M=; P="...long prompt..." -"$B" -m "$M" -n 40 "$P" > base.txt -LLAMA_KV_PAGED=1 "$B" -m "$M" -n 40 "$P" > paged.txt -diff base.txt paged.txt && echo TOKEN-IDENTICAL -# LLAMA_KV_PAGED_DEBUG=1 prints the permuted physical cells per step -``` - -This proves the **storage/placement** layer of paged attention in-model. What remains (P4) -is the **compute** optimization that yields the throughput win: a gather-read that attends -only a sequence's own blocks (instead of scanning `[0,n_kv)` with a mask), plus the -multi-sequence driver to measure tok/s vs concurrency. The patch is single-sequence scope. - -## Build & test - -```sh -make check # P0 host-manager unit suites (pure C++, no deps) -make ggml-check GGML_SRC=/ggml GGML_BUILD= # P1/P2 ggml tests -make bench # P3 capacity + prefix-sharing numbers -``` - -`ggml-check` needs a built ggml. To build one CPU-only from a llama.cpp checkout: -`cmake -S /ggml -B /tmp/ggml-build -DGGML_CUDA=OFF -DCMAKE_BUILD_TYPE=Release && cmake --build /tmp/ggml-build -j` -(if it complains about a missing `ggml.pc.in`, add a minimal pkg-config stub). - -## Files - -- `paged_kv_manager.{h,cpp}` — the vLLM-parity block manager (no ggml/llama dep). -- `tests/test_free_block_queue.cpp` — intrusive LRU free list. -- `tests/test_block_pool.cpp` — alloc/touch/free/evict/cache. -- `tests/test_paged_kv_manager.cpp` — allocate/block_table/slot_mapping/free. -- `tests/test_prefix_cache.cpp` — chained block hashing + first-miss cache hit. -- `tests/test_ggml_paged_rw.cpp` — paged write/gather through real ggml ops. -- `tests/test_ggml_paged_attn.cpp` — attention over paged KV vs host reference. -- `paged-bench.cpp` — capacity (win 1) + prefix-sharing (win 3) measurements. - -## Remaining work — integration map (for the next session) - -Target: a paged read path active behind a flag, producing **token-identical** greedy -output vs the contiguous cache on a real model (Gate 0), then `paged-bench` win 2. - -Exact seams in the vendored llama.cpp (`backend/cpp/llama-cpp-fallback-build/llama.cpp`, -the pinned build fetches `LLAMA_VERSION=f3e182816421…`): - -1. **Memory type** — `src/llama-model.cpp:2070` `create_memory()` constructs `llama_kv_cache`. - Add a paged variant (or a flag on the existing cache) implementing `llama_memory_i` - (`src/llama-memory.h`), backed by `PagedKVManager`. -2. **Allocation** — `src/llama-kv-cache.cpp:818` `find_slot()` produces `slot_info.idxs`. - Replace the ring-buffer scan with block-aligned allocation from `PagedKVManager`. -3. **Read path** — `src/llama-kv-cache.cpp:1145/1165` `get_k`/`get_v` return a contiguous - `[0,n_kv)` view. For paged, gather the sequence's blocks (`ggml_get_rows`) into scratch. - The new branch lives alongside `build_attn` in `src/llama-graph.cpp` (`build_attn_mha`). -4. **Mask** — `src/llama-graph.cpp` `build_attn_inp_kq_mask` sizes the mask to the gathered - length per sequence. -5. **Gate 0 driver** — `build-cpu/bin/llama-simple` (greedy argmax) on - `Qwen3-0.6B.Q4_K_M.gguf`; assert paged output == contiguous output token-for-token. - -### Honest caveats (from the maintainer discussion + reading `find_slot`) - -- llama.cpp's **unified cache already shares one KV pool** across sequences and already - tolerates non-contiguous slots. So win-1 vs *unified* is smaller than vs per-seq - reservation (stream mode). The durable LocalAI wins are **on-demand sizing** and - **automatic cross-tenant prefix sharing** (P0 implements the block-hash machinery). -- vLLM's classic `paged_attention_v1/v2` CUDA kernel is **deprecated**; the live path is - FlashAttention/FlashInfer over a block table. The port targets that pattern, not the - old kernel. Upstream draft PRs #22569 (new `ggml_paged_attn` op) and #17579 (CUDA) are - unmerged; maintainers are skeptical for single-user use. diff --git a/backend/cpp/llama-cpp/paged/UPSTREAM_GGML_ISSUE.md b/backend/cpp/llama-cpp/paged/UPSTREAM_GGML_ISSUE.md deleted file mode 100644 index 9705865ea..000000000 --- a/backend/cpp/llama-cpp/paged/UPSTREAM_GGML_ISSUE.md +++ /dev/null @@ -1,78 +0,0 @@ -# Upstream ggml issue draft: MXFP4 MoE prefill underutilizes Blackwell (GB10) — ~22 TFLOP/s, ~27× behind vLLM - -**Title:** CUDA: MXFP4 MoE prefill runs the Ampere-class warp `mma.sync`, far below Blackwell FP4 peak (GB10 / sm_121) - -## Summary - -On a GB10 (DGX Spark, sm_121), MXFP4 MoE prefill for Qwen3-Coder-30B-A3B is bottlenecked by -`mul_mat_q` (the per-expert grouped MMQ), which runs at only **~22 effective TFLOP/s** — a small -fraction of the GPU's FP4 capability. Batched prefill plateaus at ~3.65k tok/s (B=32) vs vLLM FP8 ~99k -on the same box (~27×). The native FP4 block-scaled `mma.sync` path (PR #17906 et al.) *is* engaged — the -limit is that it's a warp-level MMA kernel, not a tcgen05/CUTLASS-class grouped GEMM. - -## Hardware / build - -- NVIDIA GB10, compute capability 12.1, 119 GiB unified LPDDR5X. -- llama.cpp built `-DCMAKE_CUDA_ARCHITECTURES=121` (sm_121a/compute_121a confirmed in cubins). -- Model: Qwen3-Coder-30B-A3B-Instruct, `MXFP4_MOE` (15.9 GiB, 4.47 BPW). - -## Measurements - -Single-stream (`llama-bench`, ub2048): - -| metric | Q8_0 | MXFP4 | vLLM FP8 | -|---|---|---|---| -| prefill pp2048 | ~2200 | 3441 | — | -| decode tg128 | 62 | 86 | 52 | - -Batched (decode-phase aggregate `S_TG`; prefill aggregate `S_PP`): - -| B | llama MXFP4 prefill | vLLM FP8 prefill | llama MXFP4 decode | vLLM FP8 decode | -|---|---|---|---|---| -| 1 | 1625 | 9644 | 83 | 48 | -| 8 | 3634 | 33373 | 267 | 312 | -| 32 | 3651 | 99398 | 551 | 1171 | -| 64 | 3648 | 151990 | 770 | 2064 | - -Decode is competitive (we win at B=1). **Prefill plateaus and is the gap.** - -## Profiling (nsys, MXFP4 pp2048 kernel time) - -| kernel | % | -|---|---| -| `mul_mat_q<(ggml_type)39>` (MXFP4 MoE GEMM) | **37.2** | -| `mul_mat_q<(ggml_type)8>` (dense/attn, still Q8) | 10.1 | -| `flash_attn_ext_f16` | 8.8 | -| `quantize_mmq_mxfp4` (activation quant) | 8.0 | - -Only cutlass kernel present is `cutlass_80_tensorop` (Ampere). No tcgen05 / wgmma anywhere. - -## What we ruled out (so it's the kernel, not config) - -- **ubatch**: saturates at 2048 (pp4096: ub512 2994 → ub2048 3316 → ub8192 3180). -- **tile width**: `mmq_x` already selects the full 128-wide tile at ub2048 (~128 tokens/expert). -- **cuBLAS fallback**: `GGML_CUDA_FORCE_CUBLAS` is a no-op (3419 ↔ 3423 t/s) — dequant→cuBLAS-FP16 neither - helps nor hurts, i.e. the FP4 MMQ kernel isn't worse than FP16 cuBLAS, both hit a common ceiling. -- prefill does **not** scale with bigger single prompts (attention O(N²) confounds): pp2048 3295, pp8192 - 1524, pp16384 2051 — so it's the many-sequence batched MoE GEMM, not batch size. - -## Proposal - -A tcgen05 / CUTLASS-3.x grouped-GEMM path for FP4 (MXFP4 + NVFP4) MoE on sm_120/121: -- One grouped GEMM over all experts with per-group token offsets (full tiles regardless of tokens/expert), - vs today's per-expert MMQ scheduler. -- Block-scaled `e2m1` operands via tcgen05 tensor-memory MMA (`mma.sync.aligned.kind::mxf4…` is the - warp-level form; the collective-mainloop/tcgen05 form is what extracts Blackwell throughput at prefill - tile sizes). -- Fuse activation quantization (`quantize_mmq_mxfp4`, ~8%) into the permute/gather. -- Optionally extend to dense layers (qkv/o_proj/lm_head) so full-model prefill is FP4/FP8. - -This mirrors what vLLM/FlashInfer/TensorRT-LLM do for Blackwell MoE. Happy to test iterations on the GB10. - -## Repro - -```sh -llama-quantize qwen3coder-f16.gguf qwen3coder-mxfp4.gguf MXFP4_MOE -llama-bench -m qwen3coder-mxfp4.gguf -ngl 99 -p 2048 -n 0 -ub 2048 -llama-batched-bench -m qwen3coder-mxfp4.gguf -ngl 99 -c 45056 -b 2048 -ub 2048 -npp 512 -ntg 128 -npl 1,8,32,64 -``` diff --git a/backend/cpp/llama-cpp/paged/VLLM_DECOMPOSITION.md b/backend/cpp/llama-cpp/paged/VLLM_DECOMPOSITION.md deleted file mode 100644 index 181bffd3b..000000000 --- a/backend/cpp/llama-cpp/paged/VLLM_DECOMPOSITION.md +++ /dev/null @@ -1,83 +0,0 @@ -# What makes vLLM fast on GB10 — kernel vs scheduler (code-grounded, measured) - -Decisive analysis (vLLM v0.23.0, torch 2.11+cu130, sm_121, model `RedHatAI/Qwen3-32B-NVFP4A16`, source at tag -`v0.23.0`). **Answer: it's the scheduler, not the kernel.** This closes the kernel track and opens the -scheduler track. - -## The decomposition (measured on the DGX, prefix-cache OFF, unique prompts) - -| | vLLM W4A16 Marlin | llama.cpp | verdict | -|---|---|---|---| -| **single-stream prefill** | ~800 t/s (~52 TFLOPS) | 718 MMQ / **1153 MXFP4** | **tied; llama.cpp MXFP4 wins** | -| decode batch-1 | 11.8 t/s | ~similar | bandwidth-bound (≈190/273 GB/s); no kernel helps | -| **aggregate decode** | 328 (N32) / 569 (N64) / **667 (N128)** | the gap | **~56× multiplier = scheduler** | - -vLLM's single-stream Marlin is **not** at the roofline — it's in the same ~4×-under regime as MMQ. The 24k -headline is entirely the aggregate decode multiplier. - -## The kernel vLLM actually runs on sm_121 (W4A16, forced) - -Dispatch (vLLM v0.23.0): `compressed_tensors.py:704` (NVFP4 + no input-quant → `W4A4Fp4(use_a16=True)`) → -`compressed_tensors_w4a4_nvfp4.py:28` → `kernels/linear/__init__.py:894` (`if use_a16: force_kernel = -MarlinNvFp4LinearKernel`, **unconditional, no cc gate**) → `nvfp4/marlin.py` → `marlin_utils_fp4.py:182` -`ops.marlin_gemm(b_q_type=float4_e2m1f)`, activations FP16/BF16. csrc: `csrc/quantization/marlin/marlin.cu` -+ `marlin_template.h` + `marlin.cuh`. - -Techniques = **exactly the playbook we proved loses on GB10**: XOR shared swizzle (`marlin_template.h:722 -^ (row%8)`), 4-stage cp.async pipeline (`marlin.cu:396 stages=4`, `cp_async_wait`), ldmatrix+mma, -FP16/BF16 acts. Native FP4 (`FlashInferB12xNvFp4LinearKernel`) needs `Sm120BlockScaledDenseGemm` cubins absent -on GB10 → W4A4 hangs → forced W4A16 Marlin fallback. **Nothing to port; vLLM's kernel is occupancy-blocked too.** - -## The scheduler (the real multiplier) — what llama.cpp lacks - -- **Paged KV cache** (`vllm/v1/core/kv_cache_manager.py`, `block_pool.py`): block KV, no fragmentation → very - high concurrent batch. **llama.cpp: NO** (contiguous per-slot KV → fragmentation caps real concurrency). -- **Chunked prefill** (`config/scheduler.py:84 enable_chunked_prefill=True`, default ON): interleaves prefill - chunks with decode so decode batches stay full. **llama.cpp: NO** (a long prefill stalls the decode batch). -- **Continuous batching** (`v1/core/sched/scheduler.py`): per-step admit/evict. **llama.cpp: YES** (`n_parallel`, - rudimentary — we enabled VRAM-scaled slots in #10411). - -## Sizing the scheduler gap — MEASURED (llama.cpp aggregate, the surprise) - -`llama-batched-bench` Qwen3-32B-Q4_K_M, npp=128 ntg=128, npl scaling (DGX): - -| npl | S_PP (agg prefill) | **S_TG (agg decode)** | vLLM decode | llama % of vLLM | -|---|---|---|---|---| -| 1 | 628 | 10.2 | 11.8 | 86% | -| 8 | 773 | 59.8 | - | - | -| 32 | 763 | **235** | **328** | **72%** | -| 64 | 761 | **391** | **569** | **69%** | -| 128 | 762 | **540** | **667** | **81%** | - -**The "30x gap" headline is wrong for realistic concurrency.** llama.cpp's continuous batching already -captures **~70-81% of vLLM's aggregate decode** at npl<=128, with a near-identical multiplier (10.2 -> 540 = -**53x**, vs vLLM's 56x). And it is still climbing linearly at 128 (not plateaued). Combined with llama.cpp being -*ahead* single-stream (MXFP4 1153 > vLLM 800), **llama.cpp is already broadly competitive with vLLM on GB10 at -self-hosted concurrency.** - -Two real findings remain: -1. **Aggregate prefill is flat ~760** regardless of npl - but that is the **GB10 compute roofline** (vLLM single- - stream is ~800; neither can prefill faster aggregate, it is compute-bound). So prefill is **not a throughput - gap**; chunked prefill is a **latency/TTFT** win (stop a long prefill stalling the decode batch), not a - throughput one. -2. **vLLM's ~24k headline lives at thousands-of-sequences concurrency**, which **paged KV** unlocks (block KV, - no fragmentation). llama.cpp's contiguous KV caps how far npl can scale before memory/fragmentation bite. So - paged KV is the **high-concurrency (datacenter) lever**, not a moderate-concurrency one. - -## Recommendation - -**Pivot to the scheduler; treat the GEMM kernel as good-enough / roofline-blocked on GB10.** -Now that the gap is measured, ROI-ordered: -1. **Ship the MXFP4-dense win** — 1153 t/s single-stream beats vLLM's 800; a Blackwell dense-quant - recommendation (requantize, no kernel work). Already documented in `BLACKWELL_KERNEL_GAPS.md` §6. Cheapest. -2. **Chunked prefill** — the tractable scheduler win: interleave prefill chunks with decode so a long prompt - doesn't stall the decode batch. Payoff is **latency/TTFT under mixed load** (and steadier decode batches), - not aggregate prefill throughput (that's GB10-compute-capped at ~760-800 for both engines). A grpc-server - scheduler change; no KV-layout rewrite. -3. **Paged KV** — the **high-concurrency (thousands-of-seqs) lever** that unlocks vLLM's 24k regime. Heavy - (block KV manager; contested upstream PR #22569 / vendored `patches/`). Worth it only if datacenter-scale - concurrency is a target; at self-hosted concurrency (npl<=128) llama.cpp is already ~75-80% of vLLM. - -**Reframed expectation:** llama.cpp on GB10 is NOT 30x behind vLLM. It is ahead single-stream (MXFP4) and -~70-81% of vLLM aggregate at npl<=128. The genuine differentiator vLLM still has is **scaling to very high -concurrency via paged KV**. Kernel tracks (W4A16 178 t/s; FP4-MMA) stay **banked** - not the lever. diff --git a/backend/cpp/llama-cpp/paged/VLLM_THROUGHPUT_GAP.md b/backend/cpp/llama-cpp/paged/VLLM_THROUGHPUT_GAP.md deleted file mode 100644 index e8b5b6771..000000000 --- a/backend/cpp/llama-cpp/paged/VLLM_THROUGHPUT_GAP.md +++ /dev/null @@ -1,59 +0,0 @@ -# Where vLLM beats llama.cpp on a DGX Spark (GB10), and how to close it — keeping quality - -The question: "vLLM is faster at the end — what do we improve, while keeping good quality?" Answer: the -gap is **three independent things**, and the biggest *per-user, quality-preserving* one is **speculative -decoding**, which llama.cpp already supports. - -## Decomposition (measured + researched) - -| vLLM advantage | helps single user? | llama.cpp answer | quality cost | status | -|---|---|---|---|---| -| **Per-user decode speed** | **yes** | **speculative decoding** (Qwen3 draft / EAGLE3) | **none** (target-verified, lossless) | mature in llama.cpp; **the main lever** | -| Prefill / TTFT | no (it's first-token latency) | tune FP4-MMA / Marlin W4A16 kernel | none | hard; `BLACKWELL_KERNEL_GAPS.md` | -| Aggregate throughput @ concurrency | no (per-user = 0) | continuous batching (paged engine) | none | also kernel-bound | - -Key measured fact: **single-user decode is already at parity** (Qwen3-32B: llama 10.2 vs vLLM 11.7 t/s) — -both hit GB10's ~273 GB/s bandwidth wall (~15 t/s ceiling) **without** spec-dec. So vLLM's real per-user -speed edge is spec-dec, not architecture. - -## Why spec-dec is THE lever here (and quality-safe) - -- **Lossless:** the 32B target verifies every drafted token (accept/reject) — output distribution is - identical to no-drafting. So you keep **Q4_K_M quality** (no lossy MXFP4 needed) *and* get speed. -- **GB10 is best-case for it:** decode is bandwidth-bound (one ~17 GB weight-read per token) with huge idle - compute. Spec-dec verifies K drafted tokens in **one** weight-read → converts the loop to compute-bound, - where GB10 has headroom. Realized speedup ≈ mean accepted length. -- **Measured (others, same model class):** llama.cpp Qwen2.5-32B dense + 0.5B draft = **2.9×** (13→38 t/s); - vLLM EAGLE3 on Qwen3-32B = ~1.8–2.5× general, up to ~3× code/structured. **Competitive.** -- **Regime caveat:** spec-dec gives **~nothing for MoE-A3B** models (only ~3B active → not bandwidth-bound, - nothing to amortize). It shines for **dense** 27–32B — the opposite regime. So this lever is *dense-model* - specific. - -## Qwen3-32B specifics - -- **No native MTP head** (MTP is a Qwen3-*Next*/MoE feature). Options: a **same-family draft** - (Qwen3-0.6B or **1.7B** — same tokenizer, llama.cpp vocab check passes) or an external **EAGLE3 head** - (RedHatAI/AngelSlim Qwen3-32B-eagle3, accept length 2.15–2.49). -- Draft pick: **lean Qwen3-1.7B** (0.6B had ~60% lower acceptance in AWS's test; on a bandwidth-bound box the - 32B weight-read dwarfs the draft cost, so maximize acceptance). `--spec-draft-n-max 5–8`. - -## Recommended LocalAI actions (quality-preserving, ranked) - -1. **Make speculative decoding easy/recommended for dense ≥14B models on Blackwell** — a draft-model field in - the model config (`-md` / `--spec-draft-*`), with a suggested Qwen3-1.7B draft for the Qwen3 family. This - is the biggest per-user speed win, lossless, available **now** (no kernel). Gallery: ship target+draft pairs. -2. Kernel work (FP4-MMA tuning / Marlin W4A16) — improves **prefill/TTFT**, separate metric. -3. Continuous batching (paged engine) — **aggregate** concurrency only; per-user = 0. - -## Honesty / status - -The research conclusion is solid (sources below). **Our own empirical spec-dec run on the DGX is pending** — -the box rebooted mid-session and `llama-cli` now hangs at 0% GPU (while `llama-bench` works), plus the network -is dropping ssh mid-command. Drafts (Qwen3-0.6B/1.7B Q8) are downloaded and the spec-dec flags are confirmed; -re-run `llama-cli -m Qwen3-32B-Q4_K_M -md Qwen3-1.7B-Q8_0 -ngl 99 -ngld 99 --spec-draft-n-max 8` when the box -is stable to confirm the ~2× locally. The conclusion does not depend on it (it's measured-reproducible by -others on this exact model class), but we should bank our own number. - -Sources: llama.cpp Discussion #10466 (Qwen2.5-32B+0.5B = 2.9×), #16578 (DGX Spark), DandinPower/llama.cpp_bench -(32B = 10.7 t/s, bandwidth-bound); vLLM MTP docs + Red Hat EAGLE3 article (lossless, up to 2.5×); AWS spec-dec -blog (Qwen3-32B+1.7B up to 3×, 0.6B ~60% lower accept); RedHatAI/AngelSlim Qwen3-32B-eagle3 heads. diff --git a/backend/cpp/llama-cpp/paged/W4A16_MARLIN_KERNEL_PLAN.md b/backend/cpp/llama-cpp/paged/W4A16_MARLIN_KERNEL_PLAN.md deleted file mode 100644 index 3ae2ae30b..000000000 --- a/backend/cpp/llama-cpp/paged/W4A16_MARLIN_KERNEL_PLAN.md +++ /dev/null @@ -1,176 +0,0 @@ -# W4A16 Marlin-style GEMM for ggml-cuda on Blackwell (sm_120/121) — implementation plan - -> **STOPPED (2026-06-21): the kernel is NOT the lever — validated by a code-grounded vLLM analysis.** -> Measured on the DGX: vLLM's single-stream W4A16 prefill on GB10 = **~800 t/s (~52 TFLOPS), statistically TIED -> with llama.cpp MMQ (718/47)** — and vLLM uses the *exact* XOR-swizzle + 4-stage cp.async Marlin we proved -> collapses GB10 occupancy (vLLM even warns at load that Marlin "may degrade performance for compute-heavy -> workloads"). There is no kernel trick to port. Moreover llama.cpp's **MXFP4 path (1153 t/s) already BEATS -> vLLM single-stream (800)** — vLLM has no FP4 cubins on sm_121 and falls back to slower W4A16 Marlin, so -> llama.cpp is *ahead* on the kernel. **vLLM's entire 24k headline is the aggregate decode multiplier (~56×) -> from paged KV + chunked prefill + continuous batching — a SCHEDULER win.** llama.cpp lacks paged KV + -> chunked prefill. **Effort pivots to the scheduler** (see the paged-attention work). This kernel work is -> banked + resumable (178 t/s, P0/P1/P2/P3/P3b committed) but is not the throughput lever on GB10. Detail: -> `VLLM_DECOMPOSITION.md`. - -The committed multi-week kernel. Goal: get 4-bit-weight dense matmul to the GB10 **BF16 ceiling (~213 -TFLOP/s ≈ ~3,300 t/s prefill on Qwen3-32B)**, ~4.3× over today's 765. This is the *match-vLLM* path; vLLM's -own GB10 dense throughput runs on W4A16 Marlin (its FP4 path is broken on sm_121). - -## Why a custom kernel (validated, not assumed) - -On GB10 (sm_121), measured: **both** llama-MMQ (int8, Ampere-tuned) **and** cuBLAS-FP16 sit at ~46 TFLOP/s -(~21% of peak). cuBLAS falls back to an Ampere `cutlass_80_tensorop` kernel (CUDA-13 has no sm_121 GEMM for -these shapes); rebuilt with `-DGGML_CUDA_FORCE_CUBLAS=ON` it's *slower* than MMQ (690 vs 750). **No library -path reaches the ceiling on consumer Blackwell** — a hand-tuned sm_120a kernel is required. `mmapeak` measures -the 213 BF16 peak as reachable, and vLLM's Marlin hits it, so the ceiling is real; the work is reaching it. - -## What Marlin does (the design we mirror) - -Weights stored 4-bit, **dequantized in-register/shared-mem** in-flight; GEMM math on **FP16/BF16 tensor -cores** (`mma.sync m16n8k16`). Speed comes from: `cp.async` global→shared with a **multi-stage double-buffered -pipeline**, **offline weight reshuffle** into the MMA-friendly layout, activations kept resident in registers, -and **Stream-K** partitioning. Sources: IST-DASLab/marlin, arXiv 2408.11743, vLLM machete (Hopper successor). - -## Phases (each ends with: numerical parity vs MMQ + a prefill benchmark) - -### P0 — Harness + baseline — DONE -- **Correctness gate (GREEN):** `test-backend-ops test -o MUL_MAT -b CUDA0` → **1103/1103 passed** (CUDA vs CPU - reference, covers Q4_0/Q4_K at the real FFN shapes m=4096,k=14336,n=1..512). This is *the* parity check the - W4A16 kernel must keep green at every phase — it tests the CUDA MUL_MAT path the kernel will hook. The - `not supported` lines are `type_b=f16` combos (irrelevant; prefill uses f32 activations). -- **Perf baseline:** `llama-bench` dense Q4_K prefill = **~750 t/s (pp512 718 / pp2048 750) ≈ 46 TFLOP/s ≈ 21% - of the 213 BF16 ceiling**. The kernel must beat this toward ~3,300. (`test-backend-ops perf -o MUL_MAT` gives - per-shape GFLOPS too; build it once with the harness.) -- **Op-level baseline (the canonical kernel target), `test-backend-ops perf -o MUL_MAT`, m=4096 k=14336 (FFN):** - | n (tokens) | q4_0 | q4_K | regime | - |---|---|---|---| - | 1 | 817 GFLOPS | 761 GFLOPS | decode / mat-vec (memory-bound) | - | 8 | 5.77 TFLOPS | 4.11 TFLOPS | small-batch | - | **512** | **49.5 TFLOPS** | **47.1 TFLOPS** | **prefill GEMM — ~22% of the 213 ceiling** | - - So the prefill GEMM target: lift q4_K n=512 from **47 → toward ~213 TFLOPS** (~4.5×). This per-shape number - is cleaner than end-to-end for kernel iteration. -- **Harness script:** `~/p0harness.sh` on the DGX (build test-backend-ops + correctness + perf). Reusable each - phase: `test-backend-ops test -o MUL_MAT -b CUDA0` must stay 1103/1103; the q4_K n=512 perf must climb from 47. -- test-backend-ops needed `-DLLAMA_BUILD_TESTS=ON`; now built in `~/llama.cpp-pr24423/build`. - -### P1 — Dispatch seam (no behavior change) — DONE -- `marlin-w4a16.{cuh,cu}` + a gated hook in `ggml_cuda_mul_mat` (dense, non-ids path), behind - `GGML_CUDA_W4A16` + sm_120/121 (`cc >= GGML_CUDA_CC_BLACKWELL`) + type∈{Q4_0,Q4_K} + f32 activations. - Returns false → falls back to MMQ. Source + apply instructions: `kernel/w4a16/` (`HOOK.md`). -- **Verified on GB10:** clean build; `test-backend-ops MUL_MAT` = **1103/1103** (byte-identical default); - `llama-bench` dense Q4 pp512 unchanged (717.77 default / 718.26 with flag); `GGML_CUDA_W4A16=1` reaches the - seam (stderr `[w4a16] ... P1 seam - using MMQ`) and falls back. The empty frame P2/P3 fills. - -### P2 — Correctness-first kernel (slow OK) — DONE -- **Kernel:** `marlin-w4a16.cu` replaces the P1 TODO with a real W4A16 GEMM. In-kernel dequant Q4→BF16 into - shared mem, `mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32` via ggml's `mma.cuh` tile abstractions - (`tile<16,8,nv_bfloat162>` A, `tile<8,8,nv_bfloat162>` B, `tile<16,8,float>` C), F32 accumulate, F32 write. - One warp per 16(M)x8(N) output tile, K looped in steps of 16. Both src0 (weights, row m) and src1 (acts, - row n) are row-major `[row][k]`, so A and B load symmetrically via `load_generic`; the mma does the dot over k. -- **Types handled:** Q4_0 and Q4_K. Q4_0 dequant `w=d*(q-8)` inline; Q4_K via the superblock decode mirrored - from `convert.cu` (`get_scale_min_k4`, 8x32 sub-blocks, `d*q-m`). -- **Shape classes handled:** contiguous 2D GEMM (the prefill path), `ne2==ne3==1`, f32 activations, K%16==0 - (always true: Q4_0 K%32, Q4_K K%256). **Falls back to MMQ (returns false)** for batched (bs!=[1,1]), - broadcast (nr!=[1,1]), permuted / non-contiguous (per!=[0,1,2,3]), and any non-f32 activation (e.g. f16) - - keeps the gate green. M / N boundaries are zero-padded in-kernel (handles M not %16, N not %8). -- **Parity (the gate):** `GGML_CUDA_W4A16=1 test-backend-ops test -o MUL_MAT -b CUDA0` = **1103/1103 passed** - (the Q4_0/Q4_K f32 contiguous shapes run the kernel and match the CPU reference; batched/permuted/f16 fall - back). Default (flag-unset) build still **1103/1103** (byte-identical, seam returns false). -- **Model sanity / P2 perf:** `GGML_CUDA_W4A16=1 llama-bench -m Qwen3-32B-Q4_K_M.gguf -ngl 99 -p 512 -n 16 - -ub 2048` runs clean: **pp512 = 31.75 t/s**, tg16 = 6.28 t/s. Slow as expected (naive 1-warp/tile, weights - re-dequantized per n-tile, no pipeline) - this is the correctness checkpoint; P3 brings the speedup. The real - Q4_K model matmul path engages the kernel without error. - -### P3 — The Marlin pipeline (the speedup) — STEP 1 + SKEW-PAD/TILING LANDED; PREPACK + PIPELINE + STREAM-K DEFERRED -Goal: `cp.async` double/triple-buffered global->shared; offline weight reshuffle (a one-time repack of the Q4 -tensor into the mma+pipeline layout); register-resident activation tiles; Stream-K split for the prefill M. -Target: >=150 TFLOP/s (>=~2,300 t/s), then ~213. **MMQ baseline to beat: 47.1 TFLOPS (q4_K n=512) / pp512 718.** - -**Kernel structure now (committed P3b):** block-tiled multi-warp GEMM with a CONFLICT-FREE shared feed via skew -padding. `blockDim=(32, WM*WN)` so `threadIdx.x` is the warp lane (required by `mma.cuh` get_i/get_j) and -`threadIdx.y` is the warp index; the original 1-warp P2 launch put 128 threads on `threadIdx.x` and exploded -`get_j` into an out-of-bounds shared read (found via compute-sanitizer). `WM*WN` warps compute a -`BM(=WM*FM*16) x BN(=WN*FN*8)` output tile; each warp owns an `FM x FN` grid of m16n8k16 mma fragments -accumulated in F32. Per k-step (16-deep): all warps cooperatively dequant the `BM x 16` Q4 weight strip + load -the `BN x 16` f32->bf16 activation strip into shared, one `__syncthreads`, then `ldmatrix.x4` (A) / `ldmatrix.x2` -(B) fragments + `FM*FN` mmas. The shared rows hold 8 bf162 of data but are stored at a PADDED stride of 12 bf162 -(`W4A16_SPAD`): ldmatrix's per-lane address is `row*stride`, and the natural stride 8 (a divisor of the -32-bank / 128-byte cycle) collides rows 0,4,8,12 into a 2-way bank conflict; skewing to 12 (4-byte aligned, so -ldmatrix's 16-byte alignment holds) makes `{r*12 mod 32}` hit 8 distinct bank-quads for r in 0..7, so both -halves of ldmatrix are conflict-free at only +50% on the small staged tile (~12 KB at the shipping tile). -Shipping config `WM=4,WN=4,FM=2,FN=4` -> `BM=128, BN=128`, 16 warps, 8 m16n8 C-tiles per warp (keeping -register pressure low is what lets BN grow without an occupancy cliff). M/N tails zero-padded in-kernel; still -gated to contiguous 2D Q4_0/Q4_K f32 prefill, else falls back to MMQ. - -**Per-step results (q4_K n=512 via `test-backend-ops perf`; pp512/pp2048 via llama-bench Qwen3-32B-Q4_K_M):** - -| step | q4_K n=512 | q4_0 n=512 | pp512 | pp2048 | vs MMQ 47 / 718 | notes | -|---|---|---|---|---|---|---| -| P2 (1 warp/tile) | ~2 TFLOPS | - | 31.75 | - | 0.04x | correctness checkpoint | -| Step 1: block tiling (load_generic, BM64/4w) | 6.63 (cold) | 7.53 | 119 | 123 | 0.14x | original committed kernel | -| P3b-1: skew-pad ldmatrix + BM128/8w | 8.50 (cold) | 10.56 | 148.5 | 153.9 | 0.18x | +28% q4_K, +40% q4_0 over step 1 | -| **P3b-2: + BN128/16w (current)** | **9.92 (cold)** | **11.68** | **177.6** | **185.0** | **0.21x** | +17% q4_K, +20% pp512 over P3b-1 (+49% pp512 over step 1) | - -Parity gate **1103/1103** at every step, flag set and unset (byte-identical when unset). All P3b numbers above -are from thermally-bracketed cold A/B sessions (committed measured immediately before AND after each candidate, -identical both times -> the deltas are real, not thermal). P3b-1 cold A/B: 6.63/7.53 vs 8.52/10.49. P3b-2 cold -A/B: BN64/8w 10.56/8.50 then 10.51/8.45 (bracket) vs BN128/16w 11.68/9.92. - -**What landed / what was tried (honest):** -- **P3b - LANDED (committed).** Two combined changes lift the prior committed kernel: (1) **skew-pad - conflict-free ldmatrix** (shared row stride 8->12 bf162; makes `ldmatrix.x4`/`.x2` bank-conflict-free at near - zero occupancy cost) and (2) **bigger tile / more warps** (`BM=128, BN=64`, 8 warps). Cold A/B: q4_K - 6.63->8.52 (+28%), q4_0 7.53->10.49 (+40%), pp512 119->148.5 (+25%). **Still ~5.5x under MMQ (47) per-op and - ~4.8x under pp512 718 - does NOT beat MMQ.** This is forward progress, not the finish line. -- **The XOR-swizzle-FIRST plan was tested and is WRONG for this GPU - documented so it is not re-tried.** A - wide-row (BK=64, 128-byte rows) XOR swizzle `seg ^ (row&7)` IS conflict-free, but the 16 KB shared it needs - collapsed occupancy and dropped q4_K n=512 to **2.84 TFLOPS** (worse than the unswizzled 6.63) - the same - occupancy cliff P3 hit with a 32 KB pipeline. The conflict-free feed must be bought WITHOUT widening shared: - skew padding (above) does exactly that (6 KB), which is why it is the committed form. Lesson: on GB10 occupancy - dominates bank-conflict latency; never trade occupancy for a conflict-free layout. -- **Conflict-free feed alone did NOT beat the unswizzled kernel - the limiter moved.** At the SAME BM64/4w tile, - skew-pad ldmatrix (6.70) ~= load_generic (6.63): removing bank conflicts bought ~nothing. The win came only - when the tile grew (BM128/8w). A 5-config tile sweep then split the two quant types: - - **q4_0 SCALES with warps/tiles** (7.7 -> 10.5 -> **15.8 TFLOPS at BM128/16w**): feed/global-traffic bound, - helped by cutting redundant activation re-reads (more BM = fewer M-blocks each re-reading the act column). - - **q4_K is largely DEQUANT-COMPUTE bound** (the BM64/16w tile gives q4_0=15.8 but q4_K=6.8 - they diverge - hard). This **refines P3's "within 12%" finding**: that held only in the low-throughput memory-bound regime; - once the feed is unblocked, q4_K's per-element 6-bit superblock decode (`get_scale_min_k4` + superblock - indexing, redone every k-step AND re-done by every N-block) becomes the wall. BM256 regressed both (too few - blocks / register pressure). -- **Growing BN partly relieves the q4_K dequant wall (P3b-2).** Because every N-block re-decodes the same - weight strip, halving the N-block count (BN 64->128) halves that redundant q4_K decode - but only when BN is - spread across MORE WARPS (16w, 8 C-tiles/warp), not more fragments-per-warp: the FN=8 / FM=4 variants (16 - C-tiles/warp) regressed to ~6.6 on register pressure, while WM=4,WN=4,FM=2,FN=4 (16w, 8 tiles/warp) lifted - q4_K 8.5->9.9 and q4_0 10.6->11.7 cold. BN=256 was no better and costs more shared. **BN128/16w is the - shipping tile.** -- **Next blocker (the remaining q4_K unlock) = offline prepack.** BN growth only divides the redundant decode by - the N-block count; it cannot remove the per-k-step decode itself. The full fix is the **one-time offline - repack** - decode the Q4 tensor ONCE into a cached device buffer keyed off the tensor data pointer, in a layout - with the scale/min pre-applied (store reshuffled 4-bit + per-subblock bf16 d,m, ~1.25x the q4 size, NOT a full - bf16 blow-up which would be ~4x), so the in-kernel path becomes a cheap `q*d - m` with coalesced loads. Then - `cp.async` multi-stage (sized to NOT widen shared past the occupancy cliff) and **Stream-K** over M. These - remain the multi-week core; **prepack is the highest-value next step for q4_K specifically** (it should let - q4_K join q4_0 on the feed-bound scaling curve instead of plateauing at ~10). -- **Methodology note (unchanged):** the box thermally throttles under sustained perf+bench runs (identical code - ~8.8 cold vs ~6.6 hot earlier), so only same-session A/Bs are trustworthy. The P3b deltas above were taken in - one bracketed cold session for exactly this reason. - -### P4 — Tune -- Tile (mmq_x/y analogues), warps, pipeline depth, occupancy. We have nsys (throughput) but **not ncu** on the - DGX — tuning is empirical (sweep configs, measure t/s). Note ncu would need sudo/driver perms we lack. - -### P5 — Enable -- Default on for sm_120/121 + Q4_0/Q4_K dense when parity holds + faster; keep the flag as an escape hatch. - Ship as a LocalAI llama.cpp patch (the patches/ series) and/or upstream (ggml has no Marlin-equivalent — - issue #1519 — so it's net-new upstream value; float it with maintainers first). - -## Risks / notes -- **Multi-week, expert-CUDA, DGX-only** (GB10 is the only sm_121). The session's network flakiness + - `llama-cli` hang make `llama-bench`/`test-backend-ops` the reliable verification tools (both work). -- Quantization correctness: Q4_K's superblock structure (256-elem, 6-bit scales) is more complex to dequant - in-kernel than Q4_0; consider landing Q4_0 first, then Q4_K. -- **Beat-path follow-on:** the FP4-MMA path (`mul_mat_q`, ~5% of FP4 peak) tuned/fixed on sm_121 reaches - ~6,600 (2× BF16). Separate track; this W4A16 kernel is the match-path foundation. -- Reuse ggml's `mma.cuh` tile abstractions (MMQ already uses them) rather than raw PTX where possible. diff --git a/backend/cpp/llama-cpp/paged/kernel/w4a16/HOOK.md b/backend/cpp/llama-cpp/paged/kernel/w4a16/HOOK.md deleted file mode 100644 index a701f1496..000000000 --- a/backend/cpp/llama-cpp/paged/kernel/w4a16/HOOK.md +++ /dev/null @@ -1,31 +0,0 @@ -# W4A16 seam — how to apply to a llama.cpp / ggml-cuda checkout - -Two source files + two one-line edits to `ggml/src/ggml-cuda/ggml-cuda.cu`. The build picks up the -new `.cu` via the existing `file(GLOB)` after a `cmake -S . -B build` reconfigure (no CMakeLists edit). - -## Files (copy into `ggml/src/ggml-cuda/`) -- `marlin-w4a16.cuh` -- `marlin-w4a16.cu` - -## Edit `ggml/src/ggml-cuda/ggml-cuda.cu` - -1. **Include** — after the existing `#include "ggml-cuda/fp4-grouped-moe.cuh"` (sibling-header style): - ```cpp - #include "ggml-cuda/marlin-w4a16.cuh" - ``` - -2. **Dispatch hook** — immediately before the dense dispatch chain, i.e. before - `if (!split && use_mul_mat_vec_f) {` in `ggml_cuda_mul_mat(...)` (after `const int cc = ...`): - ```cpp - if (!split && ggml_cuda_w4a16_mul_mat(ctx, src0, src1, dst)) { return; } - ``` - -## Verify (P1 acceptance — met) -- `cmake --build build --target test-backend-ops llama-bench` → builds clean. -- `test-backend-ops test -o MUL_MAT -b CUDA0` → **1103/1103** (byte-identical default). -- `llama-bench` dense Q4 pp512 → unchanged (~718, MMQ). -- `GGML_CUDA_W4A16=1 llama-bench` → unchanged + stderr `[w4a16] ... P1 seam - using MMQ` (seam reached, - gating passes on sm_121, falls back). - -The kernel body (P2 correctness → P3 Marlin pipeline) replaces the `TODO(P2/P3)` block in `marlin-w4a16.cu` -and returns `true` once parity holds. diff --git a/backend/cpp/llama-cpp/paged/kernel/w4a16/SUBAGENT_BRIEFS.md b/backend/cpp/llama-cpp/paged/kernel/w4a16/SUBAGENT_BRIEFS.md deleted file mode 100644 index 4130ff5ac..000000000 --- a/backend/cpp/llama-cpp/paged/kernel/w4a16/SUBAGENT_BRIEFS.md +++ /dev/null @@ -1,66 +0,0 @@ -# W4A16 kernel - subagent dispatch briefs (P3, P4, P5) - -**Dispatch strategy.** Each phase = one fresh **Opus-4.8** subagent handed a complete zero-context brief. -Phases are **sequential** (P3 needs P2's correct kernel; P4 needs P3's pipeline; P5 needs P4's tuned kernel), -so dispatch phase N+1 only after phase N's commit lands, and before dispatching, splice phase N's *actual* -deliverable (final kernel shape, configs, fallback set) into the next brief. P2's brief (already dispatched) -is the template; reuse the COMMON section below verbatim in every dispatch. - ---- - -## COMMON (paste into every phase brief) - -- **Kernel dev is on the remote DGX** (GB10, sm_121): `ssh -o ConnectTimeout=25 -o ServerAliveInterval=10 -o ServerAliveCountMax=10 dgx.casa ''`. Network is FLAKY (re-poll on drop; nohup jobs survive). `llama-cli` HANGS - never use it. Only `llama-bench` + `test-backend-ops` work. -- Checkout `~/llama.cpp-pr24423`, build `~/llama.cpp-pr24423/build` (sm_121, `-DLLAMA_BUILD_TESTS=ON`). Kernel file `ggml/src/ggml-cuda/marlin-w4a16.cu`. Build auto-GLOBs it; no CMakeLists edits. Hook already in `ggml-cuda.cu`, gated behind env `GGML_CUDA_W4A16`. -- Dense test model: `~/bench/q3-32b-gguf/Qwen3-32B-Q4_K_M.gguf`. -- **Builds run detached + poll** (never blocking foreground): write a `~/pN.sh` that builds `--target test-backend-ops llama-bench`, echoes `RC=$?`, runs the gate, echoes `PN_DONE`; `nohup` it; poll `for i in $(seq 1 90); do grep -q PN_DONE ~/pN.out && break; sleep 20; done; tail ~/pN.out`. -- **GPU hygiene:** check `docker ps | grep local-ai` + `nvidia-smi`; `docker stop` a running localai worker if present (authorized); never pkill native procs; never start model servers. -- **Parity gate (must stay green every step):** `GGML_CUDA_W4A16=1 CUDA_VISIBLE_DEVICES=0 ./build/bin/test-backend-ops test -o MUL_MAT -b CUDA0` = **1103/1103**; and flag-unset stays 1103/1103 (byte-identical). A wrong result is worse than a fallback - return false for any shape you can't do correctly. -- **Perf measurement:** `test-backend-ops perf -o MUL_MAT -b CUDA0` (per-shape GFLOPS; the canonical target is q4_K m=4096 k=14336 **n=512**, baseline **47.1 TFLOPS**, ceiling ~213) + `llama-bench -m -ngl 99 -p 512,2048 -n 0 -ub 2048` (baseline pp512 ~718). -- **LocalAI repo (commit here; you do NOT inherit cwd - `cd` explicitly):** `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention`. Plan: `backend/cpp/llama-cpp/paged/W4A16_MARLIN_KERNEL_PLAN.md`. Source mirror: `backend/cpp/llama-cpp/paged/kernel/w4a16/`. After a phase passes: fetch the final `marlin-w4a16.cu` from the DGX (`ssh ... 'cat ...'`), overwrite the mirror, update the plan (mark the phase DONE with numbers), `git commit -s` (DCO sign-off; user is Ettore Di Giacinto ). **No `Co-Authored-By`. No em-dashes anywhere. Trailer `Assisted-by: Claude:opus-4.8 [Claude Code]`. Do NOT push.** -- Final message = the result (gate ?/1103, the perf delta, blockers + resolutions, commit hash). A precise partial result beats a vague success claim. - ---- - -## P3 brief - the Marlin pipeline (the speedup) - -**Goal.** Take P2's correct-but-slow kernel from ~47 toward ~150+ TFLOPS (then ~213) on the q4_K n=512 prefill GEMM, **without ever breaking parity**. This is the Marlin design: the math is the same BF16 mma; the speed comes from feeding the tensor cores without stalling. - -**Implement, incrementally (re-run the parity gate after each):** -1. **`cp.async` multi-stage pipeline** - double/triple-buffer global->shared loads of both the Q4 weight tiles and the activation tiles so dequant+mma on stage k overlaps the load of stage k+1. (Study `mma.cuh` + how `mmq.cu`/`mmf.cu` stage shared memory; ggml already uses `cp.async`/`__pipeline_*`.) -2. **Offline weight reshuffle** - repack the Q4 weights once into the mma+pipeline-friendly layout (Marlin's interleave) so loads are coalesced and the mma fragment maps directly. Do this as a load-time transform of src0 (a new prepacked buffer keyed off the tensor) - NOT per-call. Document where the repack lives + its memory cost. -3. **Register-resident activation tiles + Stream-K** split of the M dimension across blocks for the prefill (large-M) case so all SMs stay busy. - -**Acceptance.** Parity gate stays **1103/1103** at every commit; `test-backend-ops perf` q4_K n=512 climbs materially above 47 TFLOPS (target >=150) and `llama-bench` pp512 climbs above ~718. Report the TFLOPS + t/s after each of the 3 steps so the contribution of each is visible. If a step regresses parity, revert it and report why. - -**Reference.** IST-DASLab/marlin (github), arXiv 2408.11743, vLLM machete. Mirror `mmf.cu`'s BF16 GEMM structure; Marlin = that + Q4 dequant-on-load + the pipeline/reshuffle. - -**Splice before dispatch:** P2's final kernel structure (tile sizes, which types/shapes it handles vs falls back, helper functions it defined). - ---- - -## P4 brief - tune to the ceiling - -**Goal.** Drive the P3 kernel as close to the ~213 TFLOPS ceiling as empirical tuning allows. **No `ncu` on this box** (no driver perms) - tune by throughput: `test-backend-ops perf` + `llama-bench` + `nsys` (throughput only). - -**Do.** Parametrize the kernel (template params / constants) over: tile M/N/K, warps per block, pipeline depth (stages), and occupancy (regs, shared-mem budget). Sweep systematically (a script that rebuilds + benches each config, logs q4_K n=512 TFLOPS + pp512/pp2048 t/s), pick the best, hard-set it (with a short comment on the sweep). Check both prefill shapes (n=512 and n=2048) and confirm decode (n=1) didn't regress (it should still route to mat-vec, not this kernel - verify the gating). - -**Acceptance.** Best config maximizes q4_K n=512 TFLOPS (stretch ~150-213) with parity **1103/1103** intact; the sweep table (config -> TFLOPS/t-s) is recorded in the plan's P4 section. Report the chosen config + the final pp512/pp2048 t/s vs the 718/750 baseline and vs vLLM's ~3300 single-stream target. - -**Splice before dispatch:** P3's pipeline structure + the perf it reached + which knobs are already fixed vs free. - ---- - -## P5 brief - enable + package + (maybe) upstream - -**Goal.** Make W4A16 the default dense-Q4 path on Blackwell and ship it through LocalAI. - -**Do.** -1. **Flip the gate:** default-ON for sm_120/121 + Q4_0/Q4_K dense when faster, keep an opt-out env (e.g. `GGML_CUDA_W4A16=0`) as an escape hatch. The existing return-false-on-unhandled-shape path is the correctness safety net; keep it. Verify the default (no env) build now runs W4A16 for dense Q4, gate green, faster than the old MMQ baseline. -2. **Package as a LocalAI llama.cpp patch:** produce `backend/cpp/llama-cpp/paged/patches/kernel/0002-w4a16-marlin.patch` (the new files + the `ggml-cuda.cu` hook + the gate flip) that applies cleanly to the pinned llama.cpp, mirroring the existing `patches/kernel/0001-fp4-grouped-moe-scaffold.patch`. Confirm LocalAI's `make backends/llama-cpp` build path can consume it (read `.agents/llama-cpp-backend.md` + the build memory: `make -C backend/cpp/llama-cpp clean` before rebuilds). -3. **Docs:** update `BLACKWELL_KERNEL_GAPS.md` + the plan with the shipped result; add a short note to the LocalAI docs if there's a Blackwell/performance page. -4. **Upstream decision (do NOT open without surfacing first):** ggml has no Marlin-equivalent (issue #1519) so this is net-new upstream value. Draft (do not submit) an upstream PR description + note the sm_121 build-flag caveats; report it for the user to decide. - -**Acceptance.** Default Blackwell build uses W4A16 for dense Q4, parity 1103/1103, measurably faster than MMQ; the patch applies + the LocalAI llama-cpp backend builds with it (verify or, if the full backend build is too heavy, document the exact build command + that the patch applies cleanly). Report the end-to-end LocalAI dense-Q4 prefill number vs the start-of-project 765 t/s. - -**Splice before dispatch:** P4's final kernel + config + the measured ceiling reached; the exact enable condition decided. diff --git a/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cu b/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cu deleted file mode 100644 index 57064ee42..000000000 --- a/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cu +++ /dev/null @@ -1,258 +0,0 @@ -#include "marlin-w4a16.cuh" -#include "mma.cuh" - -#include -#include -#include - -// W4A16 Marlin-style GEMM. -// -// In-kernel dequantize Q4 weights -> BF16, multiply against BF16-converted F32 -// activations using mma.sync m16n8k16 BF16 tensor-core ops, accumulate in F32, -// write F32 output. Handles only the contiguous 2D GEMM (prefill) case for -// Q4_0 / Q4_K; everything else returns false and falls back to MMQ. -// -// ggml MUL_MAT convention: dst[m,n] = sum_k src0[k,m] * src1[k,n]. -// src0 (weights): ne0=K (contiguous), ne1=M -> row m is K contiguous quants. -// src1 (acts,f32): ne0=K (contiguous), ne1=N -> row n is K contiguous floats. -// dst (f32): ne0=M (contiguous), ne1=N -> element (m,n) at m + n*M. -// Both operands are row-major [row][k]; m16n8k16 computes C[m,n] += sum_k A[m,k]*B[n,k]. -// -// Thread layout: blockDim = (32, WM*WN). threadIdx.x is the warp lane (0..31, -// required by mma.cuh get_i/get_j), threadIdx.y is the warp index. -// -// P3b step 1 - conflict-free shared layout via SKEW PADDING: -// - WM*WN warps compute a BM(=WM*FM*16) x BN(=WN*FN*8) output tile; each warp -// owns an FM x FN grid of m16n8k16 mma fragments accumulated in F32. -// - Per 16-deep k-step the warps cooperatively dequant the BM x 16 Q4 weight -// strip + load the BN x 16 f32->bf16 activation strip into shared, then feed -// the tensor cores with ldmatrix.x4 (A) / ldmatrix.x2 (B). -// - The shared rows are PADDED to SPAD(=12) bf162 instead of the natural 8. -// ldmatrix's per-lane address is row*stride; with the natural stride 8 (a -// divisor of the 32-bank / 128-byte cycle) rows 0,4,8,12 collide -> 2-way -// bank conflict on every fragment load (this is why P3 measured a plain -// ldmatrix swap as neutral). Skewing the stride to 12 (4-byte aligned, so -// ldmatrix's 16-byte alignment holds) makes {r*12 mod 32} hit 8 distinct -// bank-quads for r in 0..7, so both halves of ldmatrix.x4 and ldmatrix.x2 are -// conflict-free. The pad costs only +50% on the small (~4 KB) staged tile, so -// unlike a 128-byte-row XOR swizzle it does NOT collapse occupancy on GB10 -// (a wide-row swizzle pushed shared to 16 KB and dropped this to ~2.8 TFLOPS). -// -// Dead-ends already proven (do not re-try): a double-buffered KSTAGE=64 cp.async -// pipeline collapsed occupancy (32 KB shared -> 2.7 TFLOPS); a plain ldmatrix on -// the UNpadded layout was neutral (bank conflicts); a wide-row (BK=64) XOR swizzle -// was conflict-free but occupancy-starved (16 KB shared -> 2.8 TFLOPS). Skew -// padding gets the conflict-free feed at near-zero occupancy cost. - -using namespace ggml_cuda_mma; - -typedef tile<16, 8, nv_bfloat162> tile_A; // 16(M) x 16(K) -typedef tile< 8, 8, nv_bfloat162> tile_B; // 8(N) x 16(K) -typedef tile<16, 8, float> tile_C; // 16(M) x 8(N) - -// bf162 columns actually live per shared row (16 k-values = 8 bf162) ... -#define W4A16_KP 8 -// ... padded to this stride to bank-skew the ldmatrix row addresses. -#define W4A16_SPAD 12 - -static bool w4a16_enabled() { - static const bool en = (std::getenv("GGML_CUDA_W4A16") != nullptr); - return en; -} - -// 6-bit packed scale/min decode for Q4_K (mirrors convert.cu get_scale_min_k4). -static __device__ __forceinline__ void w4a16_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { - if (j < 4) { - d = q[j] & 63; m = q[j + 4] & 63; - } else { - d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - } -} - -// Dequantize a single Q4_0 weight at column k of a row. -static __device__ __forceinline__ float w4a16_dq_q4_0(const char * row, int k) { - const block_q4_0 * blk = (const block_q4_0 *) row + (k / QK4_0); - const int j = k % QK4_0; - const float d = __half2float(blk->d); - const int q = (j < QK4_0/2) ? (blk->qs[j] & 0xF) : (blk->qs[j - QK4_0/2] >> 4); - return (q - 8) * d; -} - -// Dequantize a single Q4_K weight at column k of a row. -static __device__ __forceinline__ float w4a16_dq_q4_K(const char * row, int k) { - const block_q4_K * blk = (const block_q4_K *) row + (k / QK_K); - const int e = k % QK_K; - const int il = e / 64; // 0..3 - const int within = e % 64; - const int half = within / 32; // 0..1 - const int pos = within % 32; - const int ir = pos / 4; // 0..7 - const int l = pos % 4; // 0..3 - const int is = 2*il + half; - const float dall = __low2half (blk->dm); - const float dmin = __high2half(blk->dm); - uint8_t sc, mn; - w4a16_scale_min_k4(is, blk->scales, sc, mn); - const float d = dall * sc; - const float m = dmin * mn; - const uint8_t qb = blk->qs[32*il + 4*ir + l]; - const int q = (half == 0) ? (qb & 0xF) : (qb >> 4); - return d * q - m; -} - -template -static __global__ void __launch_bounds__(WM*WN*32, 1) -w4a16_gemm_kernel( - const char * __restrict__ src0, - const char * __restrict__ src1, - float * __restrict__ dst, - const int M, const int N, const int K, - const int64_t nb01, const int64_t nb11, const int64_t dst_ne0) { - constexpr int KP = W4A16_KP; // 8 bf162 = 16 k per row - constexpr int SPAD = W4A16_SPAD; // padded row stride (bank skew) - constexpr int BM = WM*FM*16; - constexpr int BN = WN*FN*8; - constexpr int NTH = WM*WN*32; - - const int m0 = blockIdx.x * BM; - const int n0 = blockIdx.y * BN; - - const int warp_id = threadIdx.y; // 0 .. WM*WN-1 - const int warp_n = warp_id % WN; - const int warp_m = warp_id / WN; - const int tid = threadIdx.y*32 + threadIdx.x; - - __shared__ nv_bfloat162 sW[BM*SPAD]; // [m][kpair], padded row stride SPAD - __shared__ nv_bfloat162 sB[BN*SPAD]; // [n][kpair], padded row stride SPAD - - tile_C C[FM][FN]; // zero-initialized accumulators - - for (int k0 = 0; k0 < K; k0 += 16) { - // Dequantize the BM x 16 weight strip once; reused across the block's BN span. - #pragma unroll - for (int idx = tid; idx < BM*KP; idx += NTH) { - const int m = idx / KP; - const int kk = idx % KP; - const int k = k0 + 2*kk; - float w0 = 0.0f, w1 = 0.0f; - if (m0 + m < M) { - const char * row = src0 + (int64_t)(m0 + m) * nb01; - if (IS_Q4_K) { w0 = w4a16_dq_q4_K(row, k); w1 = w4a16_dq_q4_K(row, k + 1); } - else { w0 = w4a16_dq_q4_0(row, k); w1 = w4a16_dq_q4_0(row, k + 1); } - } - sW[m*SPAD + kk] = __floats2bfloat162_rn(w0, w1); - } - // Load the BN x 16 activation strip (f32 -> bf16). - #pragma unroll - for (int idx = tid; idx < BN*KP; idx += NTH) { - const int n = idx / KP; - const int kk = idx % KP; - const int k = k0 + 2*kk; - float a0 = 0.0f, a1 = 0.0f; - if (n0 + n < N) { - const float * arow = (const float *)(src1 + (int64_t)(n0 + n) * nb11); - a0 = arow[k]; a1 = arow[k + 1]; - } - sB[n*SPAD + kk] = __floats2bfloat162_rn(a0, a1); - } - __syncthreads(); - - tile_A Af[FM]; - tile_B Bf[FN]; - #pragma unroll - for (int fm = 0; fm < FM; ++fm) { - const int mrow = (warp_m*FM + fm) * 16; - load_ldmatrix(Af[fm], sW + mrow*SPAD, SPAD); - } - #pragma unroll - for (int fn = 0; fn < FN; ++fn) { - const int ncol = (warp_n*FN + fn) * 8; - load_ldmatrix(Bf[fn], sB + ncol*SPAD, SPAD); - } - #pragma unroll - for (int fm = 0; fm < FM; ++fm) { - #pragma unroll - for (int fn = 0; fn < FN; ++fn) { - mma(C[fm][fn], Af[fm], Bf[fn]); - } - } - __syncthreads(); - } - - #pragma unroll - for (int fm = 0; fm < FM; ++fm) { - #pragma unroll - for (int fn = 0; fn < FN; ++fn) { - const int mbase = m0 + (warp_m*FM + fm) * 16; - const int nbase = n0 + (warp_n*FN + fn) * 8; - #pragma unroll - for (int l = 0; l < tile_C::ne; ++l) { - const int m = mbase + tile_C::get_i(l); - const int n = nbase + tile_C::get_j(l); - if (m < M && n < N) { - dst[(int64_t)n * dst_ne0 + m] = C[fm][fn].x[l]; - } - } - } - } -} - -bool ggml_cuda_w4a16_mul_mat( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, - const ggml_tensor * src1, - ggml_tensor * dst) { - if (!w4a16_enabled()) { - return false; - } - if (src0->type != GGML_TYPE_Q4_0 && src0->type != GGML_TYPE_Q4_K) { - return false; - } - if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) { - return false; - } - const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - if (!GGML_CUDA_CC_IS_NVIDIA(cc) || cc < GGML_CUDA_CC_BLACKWELL) { - return false; // consumer Blackwell (sm_120/121) only - } - - if (src0->ne[2] != 1 || src0->ne[3] != 1 || - src1->ne[2] != 1 || src1->ne[3] != 1 || - dst->ne[2] != 1 || dst->ne[3] != 1) { - return false; - } - if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) { - return false; - } - - const int64_t K = src0->ne[0]; - const int64_t M = src0->ne[1]; - const int64_t N = src1->ne[1]; - if (src1->ne[0] != K || dst->ne[0] != M || dst->ne[1] != N) { - return false; - } - if (K % 16 != 0) { - return false; - } - - cudaStream_t stream = ctx.stream(); - - // Block tile config: WM*WN warps compute BM(=WM*FM*16) x BN(=WN*FN*8). - constexpr int WM = 4, WN = 4, FM = 2, FN = 4; // BM=128, BN=128, 16 warps - constexpr int BM = WM*FM*16; - constexpr int BN = WN*FN*8; - const dim3 grid((unsigned)((M + BM - 1) / BM), (unsigned)((N + BN - 1) / BN), 1); - const dim3 block(32, WM*WN, 1); - - if (src0->type == GGML_TYPE_Q4_K) { - w4a16_gemm_kernel<<>>( - (const char *) src0->data, (const char *) src1->data, (float *) dst->data, - (int) M, (int) N, (int) K, src0->nb[1], src1->nb[1], dst->ne[0]); - } else { - w4a16_gemm_kernel<<>>( - (const char *) src0->data, (const char *) src1->data, (float *) dst->data, - (int) M, (int) N, (int) K, src0->nb[1], src1->nb[1], dst->ne[0]); - } - return true; -} diff --git a/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cuh b/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cuh deleted file mode 100644 index 253149d67..000000000 --- a/backend/cpp/llama-cpp/paged/kernel/w4a16/marlin-w4a16.cuh +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include "common.cuh" - -// W4A16 Marlin-style BF16 GEMM for NVIDIA Blackwell consumer GPUs (sm_120/121). -// Dense (non-MoE) 4-bit-weight matmul run on BF16 tensor cores, the path that -// reaches the GB10 BF16 ceiling where MMQ (int8, Ampere-tuned) and cuBLAS (sm_80 -// fallback) both plateau at ~22% of it. Returns true if it handled the op; false -// to fall back to MMQ. Gated behind GGML_CUDA_W4A16 until correct + faster. -bool ggml_cuda_w4a16_mul_mat( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, // 4-bit weights (Q4_0/Q4_K) - const ggml_tensor * src1, // F32 activations - ggml_tensor * dst); // F32 output diff --git a/backend/cpp/llama-cpp/paged/paged-bench.cpp b/backend/cpp/llama-cpp/paged/paged-bench.cpp deleted file mode 100644 index fd365975b..000000000 --- a/backend/cpp/llama-cpp/paged/paged-bench.cpp +++ /dev/null @@ -1,129 +0,0 @@ -// paged-bench: quantify the multi-tenant wins of paged KV allocation that are -// properties of the host-side block model (vLLM-parity), independent of the -// in-model compute path. -// -// Win 1 (capacity): on-demand block allocation vs contiguous per-seq -// reservation, under a fixed KV block budget. -// Win 3 (prefix sharing): automatic cross-tenant prefix dedup via block -// hashing. -// -// Win 2 (throughput) is intentionally NOT here: it requires the paged read -// path wired into llama-graph.cpp (Gate 0). Measuring it at this layer would -// be dishonest, so it is reported as pending. - -#include "paged_kv_manager.h" - -#include -#include -#include - -using namespace paged; - -// A deterministic LCG so sequence lengths vary without Math.random-style nondeterminism. -struct Lcg { - uint64_t s; - explicit Lcg(uint64_t seed) : s(seed) {} - uint32_t next() { s = s * 6364136223846793005ULL + 1442695040888963407ULL; return (uint32_t)(s >> 33); } - int range(int lo, int hi) { return lo + (int)(next() % (uint32_t)(hi - lo + 1)); } -}; - -static size_t cdiv(size_t a, size_t b) { return (a + b - 1) / b; } - -int main() { - const int block_size = 16; - const int n_ctx = 2048; // max context a sequence could use - const int num_blocks = 512; // fixed KV budget: 512 blocks * 16 = 8192 cells - - printf("paged-bench (block_size=%d, n_ctx=%d, budget=%d blocks = %d cells)\n\n", - block_size, n_ctx, num_blocks, num_blocks * block_size); - - // --------------------------------------------------------------------- - // WIN 1: concurrency capacity. Sequences have realistic, VARYING lengths - // (most short, a few long) - the regime where reserving n_ctx per seq - // wastes the most. Count how many fit under the same block budget. - // --------------------------------------------------------------------- - { - Lcg rng(12345); - const int blocks_per_ctx = (int) cdiv(n_ctx, block_size); // contiguous reserves this per seq - - // Contiguous (stream-style) reservation: every seq reserves n_ctx worth. - int contiguous_fit = num_blocks / blocks_per_ctx; - - // Paged on-demand: draw real lengths until the pool is exhausted. - PagedKVManager m(num_blocks, block_size, /*enable_caching=*/false); - int paged_fit = 0; - long total_tokens = 0; - for (int seq = 0; ; ++seq) { - // 80% short (8-128 tok), 20% long (up to n_ctx) - int len = (rng.range(0, 99) < 80) ? rng.range(8, 128) : rng.range(128, n_ctx); - if (!m.allocate(seq, (size_t) len)) break; - paged_fit++; - total_tokens += len; - } - - printf("WIN 1 concurrency capacity @ %d-block budget\n", num_blocks); - printf(" contiguous (reserve n_ctx/seq): %d sequences\n", contiguous_fit); - printf(" paged (on-demand blocks): %d sequences (avg %ld tok/seq)\n", - paged_fit, paged_fit ? total_tokens / paged_fit : 0); - printf(" --> paged fits %.1fx more concurrent sequences\n\n", - contiguous_fit ? (double) paged_fit / contiguous_fit : 0.0); - } - - // --------------------------------------------------------------------- - // WIN 3: cross-tenant prefix sharing. N tenants share a long system - // prompt / RAG context, then diverge. Compare physical blocks consumed - // with prefix caching on vs off. - // --------------------------------------------------------------------- - { - const int n_tenants = 32; - const int shared_len = 1024; // shared system prompt (64 blocks) - const int distinct_len = 64; // per-tenant suffix (4 blocks) - - // Shared prefix token ids (identical across tenants -> identical block hashes). - std::vector shared(shared_len); - for (int i = 0; i < shared_len; ++i) shared[i] = 1000 + i; - - // --- prefix caching OFF: every tenant pays for the whole prefix --- - long blocks_off = 0; - { - PagedKVManager m(num_blocks * 8, block_size, /*enable_caching=*/false); - for (int t = 0; t < n_tenants; ++t) { - m.allocate(t, (size_t) (shared_len + distinct_len)); - blocks_off += m.block_table(t).size(); - } - } - - // --- prefix caching ON: shared blocks are deduped to one physical copy --- - long blocks_on = 0; - { - PagedKVManager m(num_blocks * 8, block_size, /*enable_caching=*/true); - // tenant 0 fills + caches the shared prefix - auto h = m.compute_block_hashes(shared); - m.allocate(0, (size_t) (shared_len + distinct_len)); - m.cache_blocks(0, h, (size_t) shared_len); - long physical = m.block_table(0).size(); - // tenants 1..N-1 hit the cached prefix; only their distinct suffix is new - for (int t = 1; t < n_tenants; ++t) { - size_t cached_tokens = m.get_computed_blocks(h); // shared blocks reused - size_t new_tokens = (shared_len - cached_tokens) + distinct_len; - m.allocate(t, (size_t) (shared_len + distinct_len)); - // physically new blocks = only what wasn't already resident - physical += (long) cdiv(new_tokens, block_size); - } - blocks_on = physical; - } - - printf("WIN 3 cross-tenant prefix sharing (%d tenants, %d-tok shared prefix)\n", - n_tenants, shared_len); - printf(" prefix-cache OFF: %ld physical blocks\n", blocks_off); - printf(" prefix-cache ON: %ld physical blocks\n", blocks_on); - printf(" --> %.1fx less KV memory for the shared workload\n\n", - blocks_on ? (double) blocks_off / blocks_on : 0.0); - } - - printf("WIN 2 aggregate throughput under load: PENDING\n"); - printf(" Requires the paged gather-read path wired into llama-graph.cpp\n"); - printf(" (Gate 0) to measure tok/s vs concurrency. Not measurable at the\n"); - printf(" allocation layer; not reported here to avoid overclaiming.\n"); - return 0; -} diff --git a/backend/cpp/llama-cpp/paged/paged-loadgen.cpp b/backend/cpp/llama-cpp/paged/paged-loadgen.cpp deleted file mode 100644 index 1491bcd7c..000000000 --- a/backend/cpp/llama-cpp/paged/paged-loadgen.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// paged-loadgen: a dynamic-load benchmark for paged KV that actually exercises the -// regime where paging wins - variable prompt lengths, variable generation lengths, -// staggered (continuous) arrival, and a shared system prefix. The stock -// examples/paged/paged.cpp adds all requests up front with a fixed n_predict from a -// 20-prompt pool, so it never creates KV-memory pressure or fragmentation and -// therefore never shows a paged advantage (see PAGED_KV_HIGH_CONCURRENCY.md). -// -// Build: drop into PR #22569's examples/paged/ and add to its CMakeLists.txt next to -// llama-paged (it uses the same llama_paged_scheduler_* API). Run on the TARGET GPU -// (e.g. 2xH200) where bandwidth lets decode scale to thousands of sequences and KV -// memory becomes the binding constraint - that is where paged KV pays off and where -// this harness produces a meaningful number. On a low-bandwidth box (GB10) throughput -// plateaus long before memory binds, so the win is not observable there regardless. -// -// Metrics reported: -// - goodput (decode tokens/s aggregate) under the dynamic load -// - peak concurrent in-flight sequences actually sustained -// - paged peak KV bytes used vs the contiguous reservation a unified cache needs -// (n_seq_peak * max_ctx), i.e. the capacity ratio = the headroom paging unlocks -// -// The capacity ratio is the load-bearing number for the buy decision: it is how many -// more concurrent tenants a fixed HBM budget serves with paging than without. - -#include "common.h" -#include "llama.h" - -#include -#include -#include -#include -#include -#include - -// ---- workload knobs (env-overridable so the harness is sweepable without rebuilds) ---- -static int env_int(const char * k, int dflt) { const char * v = getenv(k); return v ? atoi(v) : dflt; } - -struct workload_cfg { - int total_requests = env_int("LG_TOTAL", 2000); // total requests to serve - int target_inflight = env_int("LG_INFLIGHT", 256); // continuous-batching concurrency target - int prefix_tokens = env_int("LG_PREFIX", 512); // shared system-prompt prefix (prefix-cache target) - int suffix_min = env_int("LG_SUFMIN", 16); // per-request unique prompt suffix range - int suffix_max = env_int("LG_SUFMAX", 768); - int gen_short = env_int("LG_GENSHORT", 32); // bimodal generation: most short... - int gen_long = env_int("LG_GENLONG", 1024); // ...some long (the over-reservation driver) - int gen_long_pct = env_int("LG_LONGPCT", 15); // % of requests that are long - int block_size = env_int("LG_BLOCK", 16); // must match -kvbls - unsigned seed = (unsigned) env_int("LG_SEED", 1234); -}; - -// Per-request plan drawn from the workload distribution. -struct req_plan { int prompt_len; int gen_len; }; - -int main(int argc, char ** argv) { - common_params params; - params.n_predict = -1; // per-request, controlled by the plan below - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PAGED)) { - fprintf(stderr, "usage: %s -m -kvp --fit off -ngpub N -ncpub M -ngl 99\n", argv[0]); - return 1; - } - params.kv_paged = true; - - common_init_result init = common_init_from_params(params); - llama_model * model = init.model.get(); - llama_context * ctx = init.context.get(); - if (!model || !ctx) { fprintf(stderr, "load failed\n"); return 1; } - const llama_vocab * vocab = llama_model_get_vocab(model); - - workload_cfg cfg; - std::mt19937 rng(cfg.seed); - std::uniform_int_distribution suf(cfg.suffix_min, cfg.suffix_max); - std::uniform_int_distribution pct(1, 100); - - // KV bytes/token = 2(K,V) * n_layers * n_head_kv * head_dim * sizeof(f16). Confirmed - // against llama-kv-cache-paged.cpp (block_bytes formula). Used for the capacity ratio. - const int n_layers = llama_model_n_layer(model); - const int n_head_kv = llama_model_n_head_kv(model); - const int head_dim = llama_model_n_embd(model) / llama_model_n_head(model); - const size_t kv_bytes_per_token = (size_t)2 * n_layers * n_head_kv * head_dim * sizeof(uint16_t); - - // A long shared system prefix that every request reuses (the prefix-cache target). - std::vector prefix = common_tokenize(ctx, std::string(cfg.prefix_tokens, 'x'), true); - - // Pre-draw all request plans so paged peak usage and the contiguous reservation are - // computed from the SAME workload. - std::vector plans(cfg.total_requests); - int max_ctx = 0; - for (auto & p : plans) { - p.prompt_len = cfg.prefix_tokens + suf(rng); - p.gen_len = (pct(rng) <= cfg.gen_long_pct) ? cfg.gen_long : cfg.gen_short; - max_ctx = std::max(max_ctx, p.prompt_len + p.gen_len); - } - - llama_paged_scheduler * sched = llama_paged_scheduler_init(ctx); - if (!sched) { fprintf(stderr, "scheduler init failed\n"); return 1; } - - // ---- continuous-arrival loop: keep ~target_inflight requests live at all times ---- - int next_req = 0, done = 0, inflight = 0, peak_inflight = 0; - long total_decoded = 0; - size_t peak_kv_bytes_paged = 0; // sum over live seqs of ceil(used/block)*block*kv_bytes - size_t live_used_tokens = 0; // running sum of actual KV tokens held by live seqs - - auto admit = [&](int rid) { - const req_plan & p = plans[rid]; - std::vector toks = prefix; // shared prefix... - std::vector suff = common_tokenize(ctx, std::string(p.prompt_len - cfg.prefix_tokens, 'y'), false); - toks.insert(toks.end(), suff.begin(), suff.end()); // ...+ unique suffix - if (llama_paged_scheduler_add_request(sched, toks.data(), toks.size(), rid)) { - inflight++; peak_inflight = std::max(peak_inflight, inflight); - live_used_tokens += p.prompt_len; - } - }; - - const int64_t t0 = ggml_time_us(); - for (int i = 0; i < cfg.target_inflight && next_req < cfg.total_requests; ++i) admit(next_req++); - - llama_batch batch = {}; - std::vector sampled; std::vector stop_flags; - - while (done < cfg.total_requests) { - if (!llama_paged_scheduler_prepare_batch(sched, &batch)) break; - const llama_paged_batch_info * info = llama_paged_scheduler_get_batch_info(sched); - sampled.assign(info->n_seq, 0); stop_flags.assign(info->n_seq, 0); - - // (decode is done inside the scheduler/update path in PR #22569; greedy here) - for (int i = 0; i < info->n_seq; ++i) { - const int rid = info->seq_ids[i]; - llama_paged_seq_state st{}; - llama_paged_scheduler_get_seq_state(sched, rid, &st); - // greedy argmax from the i-th row of logits - const float * lg = llama_get_logits_ith(ctx, i); - int best = 0; float bv = lg[0]; - for (int t = 1; t < llama_vocab_n_tokens(vocab); ++t) if (lg[t] > bv) { bv = lg[t]; best = t; } - sampled[i] = best; - const bool stop = llama_vocab_is_eog(vocab, best) || st.n_decoded + 1 >= plans[rid].gen_len; - stop_flags[i] = stop ? 1 : 0; - if (!stop) { total_decoded++; live_used_tokens++; } - if (stop) { - done++; inflight--; - live_used_tokens -= (plans[rid].prompt_len + st.n_decoded); - if (next_req < cfg.total_requests) admit(next_req++); // continuous arrival - } - } - // paged peak KV: blocks are allocated per live seq = ceil(used/block); approximate - // current paged footprint from live_used_tokens rounded up per the block size. - const size_t paged_now = (size_t)std::ceil((double)live_used_tokens / cfg.block_size) - * cfg.block_size * kv_bytes_per_token; - peak_kv_bytes_paged = std::max(peak_kv_bytes_paged, paged_now); - - llama_paged_scheduler_update(sched, &batch, sampled.data(), stop_flags.data()); - } - const double secs = (ggml_time_us() - t0) / 1e6; - - // Contiguous unified-KV reservation needed to serve the SAME peak concurrency without - // mid-generation eviction: every live slot must be backed for the worst-case context. - const size_t contig_reserve = (size_t)peak_inflight * max_ctx * kv_bytes_per_token; - - printf("\n==== paged-loadgen ====\n"); - printf("requests served : %d (target inflight %d, peak inflight %d)\n", done, cfg.target_inflight, peak_inflight); - printf("goodput (decode) : %.1f tok/s (%ld tokens / %.2f s)\n", total_decoded / secs, total_decoded, secs); - printf("kv bytes / token : %zu (n_layer=%d n_head_kv=%d head_dim=%d f16)\n", kv_bytes_per_token, n_layers, n_head_kv, head_dim); - printf("paged peak KV : %.2f GiB (allocated on demand)\n", peak_kv_bytes_paged / 1073741824.0); - printf("contiguous reserve : %.2f GiB (peak_inflight * max_ctx %d)\n", contig_reserve / 1073741824.0, max_ctx); - printf("CAPACITY RATIO : %.2fx <- tenants-per-HBM paging unlocks\n", - peak_kv_bytes_paged ? (double)contig_reserve / peak_kv_bytes_paged : 0.0); - printf(" (plus cross-request prefix sharing of the %d-token shared prefix, not counted above)\n", cfg.prefix_tokens); - - llama_paged_scheduler_free(sched); - return 0; -} diff --git a/backend/cpp/llama-cpp/paged/paged_kv_manager.cpp b/backend/cpp/llama-cpp/paged/paged_kv_manager.cpp deleted file mode 100644 index 20ff191ed..000000000 --- a/backend/cpp/llama-cpp/paged/paged_kv_manager.cpp +++ /dev/null @@ -1,296 +0,0 @@ -#include "paged_kv_manager.h" -#include -#include - -namespace paged { - -// --------------------------------------------------------------------------- -// FreeBlockQueue (port of kv_cache_utils.py FreeKVCacheBlockQueue) -// --------------------------------------------------------------------------- - -FreeBlockQueue::FreeBlockQueue(const std::vector& blocks) { - num_free_blocks = blocks.size(); - for (size_t i = 0; i < blocks.size(); ++i) { - if (i > 0) blocks[i]->prev_free = blocks[i - 1]; - if (i + 1 < blocks.size()) blocks[i]->next_free = blocks[i + 1]; - } - if (!blocks.empty()) { - fake_head.next_free = blocks.front(); - blocks.front()->prev_free = &fake_head; - fake_tail.prev_free = blocks.back(); - blocks.back()->next_free = &fake_tail; - } else { - fake_head.next_free = &fake_tail; - fake_tail.prev_free = &fake_head; - } -} - -KVCacheBlock* FreeBlockQueue::popleft() { - KVCacheBlock* first = fake_head.next_free; - if (first == &fake_tail || first == nullptr) { - assert(num_free_blocks == 0); - throw std::runtime_error("No free blocks available"); - } - fake_head.next_free = first->next_free; - first->next_free->prev_free = &fake_head; - first->prev_free = first->next_free = nullptr; - num_free_blocks--; - return first; -} - -std::vector FreeBlockQueue::popleft_n(size_t n) { - std::vector ret; - if (n == 0) return ret; - assert(num_free_blocks >= n); - num_free_blocks -= n; - KVCacheBlock* curr = fake_head.next_free; - ret.reserve(n); - for (size_t i = 0; i < n; ++i) { - assert(curr != nullptr); - ret.push_back(curr); - KVCacheBlock* last = curr; - curr = curr->next_free; - last->prev_free = last->next_free = nullptr; - } - if (curr != nullptr) { - fake_head.next_free = curr; - curr->prev_free = &fake_head; - } - return ret; -} - -void FreeBlockQueue::remove(KVCacheBlock* block) { - if (!block->prev_free || !block->next_free) - throw std::runtime_error("remove() called on an invalid block"); - block->prev_free->next_free = block->next_free; - block->next_free->prev_free = block->prev_free; - block->prev_free = block->next_free = nullptr; - num_free_blocks--; -} - -void FreeBlockQueue::append(KVCacheBlock* block) { - KVCacheBlock* last = fake_tail.prev_free; - last->next_free = block; - block->prev_free = last; - block->next_free = &fake_tail; - fake_tail.prev_free = block; - num_free_blocks++; -} - -void FreeBlockQueue::append_n(const std::vector& blocks) { - if (blocks.empty()) return; - KVCacheBlock* last = fake_tail.prev_free; - for (KVCacheBlock* b : blocks) { - b->prev_free = last; - last->next_free = b; - last = b; - } - last->next_free = &fake_tail; - fake_tail.prev_free = last; - num_free_blocks += blocks.size(); -} - -void FreeBlockQueue::prepend_n(const std::vector& blocks) { - if (blocks.empty()) return; - KVCacheBlock* first = fake_head.next_free; - KVCacheBlock* prev = &fake_head; - for (KVCacheBlock* b : blocks) { - b->prev_free = prev; - prev->next_free = b; - prev = b; - } - prev->next_free = first; - first->prev_free = prev; - num_free_blocks += blocks.size(); -} - -std::vector FreeBlockQueue::get_all_free_blocks() const { - std::vector ret; - const KVCacheBlock* curr = fake_head.next_free; - while (curr && curr->next_free != nullptr) { - ret.push_back(const_cast(curr)); - curr = curr->next_free; - } - return ret; -} - -// --------------------------------------------------------------------------- -// BlockPool (port of block_pool.py) -// --------------------------------------------------------------------------- - -static std::vector make_ptrs(std::vector& v) { - std::vector p; - p.reserve(v.size()); - for (auto& b : v) p.push_back(&b); - return p; -} - -static std::vector make_block_vec(int32_t num_blocks) { - std::vector v; - v.reserve(num_blocks); - for (int32_t i = 0; i < num_blocks; ++i) v.emplace_back(i); - return v; -} - -BlockPool::BlockPool(int32_t num_blocks, bool enable_caching) - : enable_caching_(enable_caching), - blocks_(make_block_vec(num_blocks)), - ptrs_(make_ptrs(blocks_)), - free_queue_(ptrs_) { - // vLLM reserves block_id 0 as the null block (never cached). - null_block = free_queue_.popleft(); - null_block->is_null = true; -} - -bool BlockPool::maybe_evict_cached_block(KVCacheBlock* block) { - if (!block->has_hash) return false; - auto it = cached_block_hash_to_block_.find(block->block_hash); - if (it == cached_block_hash_to_block_.end() || it->second != block) return false; - cached_block_hash_to_block_.erase(it); - block->reset_hash(); - return true; -} - -std::vector BlockPool::get_new_blocks(size_t n) { - if (n > get_num_free_blocks()) - throw std::runtime_error("Cannot get free blocks from pool"); - auto ret = free_queue_.popleft_n(n); - for (KVCacheBlock* b : ret) { - if (enable_caching_) maybe_evict_cached_block(b); - assert(b->ref_cnt == 0); - b->ref_cnt += 1; - } - return ret; -} - -KVCacheBlock* BlockPool::get_cached_block(uint64_t block_hash) { - auto it = cached_block_hash_to_block_.find(block_hash); - return it == cached_block_hash_to_block_.end() ? nullptr : it->second; -} - -void BlockPool::touch(const std::vector& blocks) { - for (KVCacheBlock* b : blocks) { - // ref_cnt==0 means the block is a free-list eviction candidate; pull it out. - if (b->ref_cnt == 0 && !b->is_null) free_queue_.remove(b); - b->ref_cnt += 1; - } -} - -void BlockPool::free_blocks(const std::vector& ordered_blocks) { - std::vector without_hash, with_hash; - for (KVCacheBlock* b : ordered_blocks) { - if (b->is_null) continue; - b->ref_cnt -= 1; - if (b->ref_cnt == 0) (b->has_hash ? with_hash : without_hash).push_back(b); - } - free_queue_.prepend_n(without_hash); // un-hashed: evicted first (front) - free_queue_.append_n(with_hash); // hashed: kept warm (tail) -} - -void BlockPool::cache_full_blocks(const std::vector& req_blocks, - size_t num_cached_blocks, size_t num_full_blocks, - const std::vector& block_hashes) { - for (size_t i = num_cached_blocks; i < num_full_blocks; ++i) { - KVCacheBlock* blk = req_blocks[i]; - if (blk->has_hash) continue; - blk->has_hash = true; - blk->block_hash = block_hashes[i]; - cached_block_hash_to_block_[blk->block_hash] = blk; - } -} - -// --------------------------------------------------------------------------- -// PagedKVManager (port of SingleTypeKVCacheManager / FullAttentionManager) -// --------------------------------------------------------------------------- - -static inline size_t cdiv(size_t a, size_t b) { return (a + b - 1) / b; } - -PagedKVManager::PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching) - : block_size_(block_size), pool_(num_blocks, enable_caching) {} - -bool PagedKVManager::allocate(int seq_id, size_t total_tokens) { - auto& req = req_to_blocks_[seq_id]; - size_t need = cdiv(total_tokens, block_size_); - if (need <= req.size()) return true; - size_t add = need - req.size(); - if (add > pool_.get_num_free_blocks()) return false; // OOM - auto nb = pool_.get_new_blocks(add); - req.insert(req.end(), nb.begin(), nb.end()); - return true; -} - -std::vector PagedKVManager::block_table(int seq_id) const { - std::vector bt; - auto it = req_to_blocks_.find(seq_id); - if (it == req_to_blocks_.end()) return bt; - bt.reserve(it->second.size()); - for (KVCacheBlock* b : it->second) bt.push_back(b->block_id); - return bt; -} - -int64_t PagedKVManager::slot(int seq_id, int pos) const { - const auto& req = req_to_blocks_.at(seq_id); - int32_t phys = req[pos / block_size_]->block_id; - return (int64_t)phys * block_size_ + (pos % block_size_); -} - -std::vector PagedKVManager::slot_mapping(int seq_id, const std::vector& positions) const { - std::vector sm; - sm.reserve(positions.size()); - for (int p : positions) sm.push_back(slot(seq_id, p)); - return sm; -} - -void PagedKVManager::free(int seq_id) { - auto it = req_to_blocks_.find(seq_id); - if (it == req_to_blocks_.end()) return; - // Free in reverse so the tail of the block chain is evicted first (vLLM order). - std::vector ordered(it->second.rbegin(), it->second.rend()); - pool_.free_blocks(ordered); - req_to_blocks_.erase(it); -} - -// FNV-1a chained block hash. Deterministic and prefix-sensitive; folds the parent -// hash into the seed so each block hash transitively encodes its whole prefix -// (behavioral parity with vLLM hash_block_tokens chaining; vLLM uses sha256 bytes). -uint64_t PagedKVManager::hash_block(uint64_t parent_hash, const std::vector& token_ids) { - uint64_t h = 1469598103934665603ull ^ parent_hash; - for (int t : token_ids) { - h ^= (uint64_t)(uint32_t)t; - h *= 1099511628211ull; - } - if (h == 0) h = 0x9e3779b97f4a7c15ull; // never 0 (0 reads as "no hash") - return h; -} - -std::vector PagedKVManager::compute_block_hashes(const std::vector& token_ids) const { - std::vector hashes; - uint64_t parent = 0; // NONE_HASH analogue - size_t n_full = token_ids.size() / block_size_; - for (size_t i = 0; i < n_full; ++i) { - std::vector blk(token_ids.begin() + i * block_size_, - token_ids.begin() + (i + 1) * block_size_); - parent = hash_block(parent, blk); - hashes.push_back(parent); - } - return hashes; -} - -size_t PagedKVManager::get_computed_blocks(const std::vector& block_hashes) { - std::vector hits; - for (uint64_t bh : block_hashes) { // stop at first miss (prefix property) - KVCacheBlock* cb = pool_.get_cached_block(bh); - if (!cb) break; - hits.push_back(cb); - } - pool_.touch(hits); // ++ref_cnt, pull from free list - return hits.size() * (size_t)block_size_; -} - -void PagedKVManager::cache_blocks(int seq_id, const std::vector& block_hashes, size_t num_tokens) { - auto& req = req_to_blocks_[seq_id]; - size_t n_full = num_tokens / block_size_; - pool_.cache_full_blocks(req, /*num_cached=*/0, n_full, block_hashes); -} - -} // namespace paged diff --git a/backend/cpp/llama-cpp/paged/paged_kv_manager.h b/backend/cpp/llama-cpp/paged/paged_kv_manager.h deleted file mode 100644 index 740280a7f..000000000 --- a/backend/cpp/llama-cpp/paged/paged_kv_manager.h +++ /dev/null @@ -1,108 +0,0 @@ -#pragma once -// Paged KV cache block manager for llama.cpp (CPU-first prototype). -// -// Host-side block management is a faithful port of vLLM V1: -// vllm/v1/core/kv_cache_utils.py (KVCacheBlock, FreeKVCacheBlockQueue, hash_block_tokens) -// vllm/v1/core/block_pool.py (BlockPool: get_new_blocks/touch/free/evict/cache_full_blocks) -// vllm/v1/core/single_type_kv_cache_manager.py (allocate_new_blocks, find_longest_cache_hit) -// -// Parity is on behavior/algorithm (block chaining, first-miss stop, ref-counting, -// LRU eviction order), not on exact hash bytes. This unit has zero ggml/llama.cpp -// dependency so it can be unit-tested in isolation. - -#include -#include -#include -#include - -namespace paged { - -// vLLM KVCacheBlock (kv_cache_utils.py). -struct KVCacheBlock { - int32_t block_id = 0; - int ref_cnt = 0; - bool has_hash = false; // vLLM: _block_hash is set only when full+cached - uint64_t block_hash = 0; - bool is_null = false; - KVCacheBlock* prev_free = nullptr; - KVCacheBlock* next_free = nullptr; - - explicit KVCacheBlock(int32_t id = 0) : block_id(id) {} - void reset_hash() { has_hash = false; block_hash = 0; } -}; - -// Intrusive doubly-linked free list with fake head/tail (vLLM FreeKVCacheBlockQueue). -// O(1) middle removal is required so touch() can pull a warm cached block out of the -// free list when a later request hits its prefix. -class FreeBlockQueue { -public: - size_t num_free_blocks = 0; - - explicit FreeBlockQueue(const std::vector& blocks); - KVCacheBlock* popleft(); - std::vector popleft_n(size_t n); - void remove(KVCacheBlock* block); - void append(KVCacheBlock* block); - void append_n(const std::vector& blocks); - void prepend_n(const std::vector& blocks); - std::vector get_all_free_blocks() const; - -private: - KVCacheBlock fake_head{-1}; - KVCacheBlock fake_tail{-1}; -}; - -// vLLM BlockPool (block_pool.py). -class BlockPool { -public: - KVCacheBlock* null_block = nullptr; - - BlockPool(int32_t num_blocks, bool enable_caching); - std::vector get_new_blocks(size_t n); - KVCacheBlock* get_cached_block(uint64_t block_hash); - void touch(const std::vector& blocks); - void free_blocks(const std::vector& ordered_blocks); - void cache_full_blocks(const std::vector& req_blocks, - size_t num_cached_blocks, size_t num_full_blocks, - const std::vector& block_hashes); - size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; } - -private: - bool maybe_evict_cached_block(KVCacheBlock* block); - - bool enable_caching_; - std::vector blocks_; // owns all block descriptors - std::vector ptrs_; - FreeBlockQueue free_queue_; - // vLLM stores hash -> {block_id: block} to allow duplicate-content blocks; the - // prototype keeps the last writer (single KV-cache group is sufficient for the wins). - std::unordered_map cached_block_hash_to_block_; -}; - -// Allocation + prefix-caching surface, ported from SingleTypeKVCacheManager / -// FullAttentionManager. Single KV-cache group; no extra_keys / eagle / spec-decode. -class PagedKVManager { -public: - PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching); - - // Grow seq_id to cover total_tokens slots. Returns false on OOM (free queue empty). - bool allocate(int seq_id, size_t total_tokens); - std::vector block_table(int seq_id) const; - int64_t slot(int seq_id, int pos) const; - std::vector slot_mapping(int seq_id, const std::vector& positions) const; - void free(int seq_id); - int block_size() const { return block_size_; } - - // Prefix caching (win 3). - static uint64_t hash_block(uint64_t parent_hash, const std::vector& token_ids); - std::vector compute_block_hashes(const std::vector& token_ids) const; - size_t get_computed_blocks(const std::vector& block_hashes); // returns num cached tokens - void cache_blocks(int seq_id, const std::vector& block_hashes, size_t num_tokens); - -protected: - int block_size_; - BlockPool pool_; - std::map> req_to_blocks_; -}; - -} // namespace paged diff --git a/backend/cpp/llama-cpp/paged/patches/0001-paged-kv-block-placement.patch b/backend/cpp/llama-cpp/paged/patches/0001-paged-kv-block-placement.patch deleted file mode 100644 index 9ff9452ea..000000000 --- a/backend/cpp/llama-cpp/paged/patches/0001-paged-kv-block-placement.patch +++ /dev/null @@ -1,59 +0,0 @@ -diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp -index a49a055a6..d95102bbd 100644 ---- a/src/llama-kv-cache.cpp -+++ b/src/llama-kv-cache.cpp -@@ -11,6 +11,8 @@ - #include - #include - #include -+#include -+#include - #include - - static bool ggml_is_power_of_2(int n) { -@@ -931,6 +933,45 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, - return { }; - } - -+ // [paged, experimental] Place this sequence's tokens at permuted, -+ // non-contiguous fixed-size BLOCK positions instead of a contiguous run. -+ // This validates that attention is invariant to physical KV placement - -+ // the correctness premise of paged attention. Enabled via LLAMA_KV_PAGED. -+ // Single-sequence scope (uses get_used() as the logical base); falls back -+ // to the normal allocator if the permuted cells aren't available. -+ static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr); -+ if (paged_mode) { -+ const uint32_t bs = 16; // block size (tokens/block) -+ const uint32_t nblk = cells.size() / bs; // blocks in this stream's pool -+ if (nblk >= 2) { -+ // stride coprime to nblk => block-index permutation is a bijection -+ uint32_t k = 1; -+ for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) { -+ if (std::gcd(cand, nblk) == 1u) { k = cand; break; } -+ } -+ const uint32_t base = cells.get_used(); -+ bool ok = true; -+ for (uint32_t i = 0; i < n_tokens; ++i) { -+ const uint32_t L = base + i; -+ const uint32_t b = L / bs; -+ const uint32_t off = L % bs; -+ if (b >= nblk) { ok = false; break; } -+ const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block -+ if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; } -+ res.idxs[s].push_back(phys); -+ } -+ if (ok && res.idxs[s].size() == n_tokens) { -+ if (std::getenv("LLAMA_KV_PAGED_DEBUG")) { -+ fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens); -+ for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]); -+ fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base); -+ } -+ continue; // paged placement succeeded for this sequence -+ } -+ res.idxs[s].clear(); // fall back to the normal allocator -+ } -+ } -+ - uint32_t n_tested = 0; - - // for continuous slots, we test that all tokens in the ubatch fit, starting from the current head diff --git a/backend/cpp/llama-cpp/paged/patches/0002-paged-e2e-disable-broken-autofit.patch b/backend/cpp/llama-cpp/paged/patches/0002-paged-e2e-disable-broken-autofit.patch deleted file mode 100644 index 5de1bb641..000000000 --- a/backend/cpp/llama-cpp/paged/patches/0002-paged-e2e-disable-broken-autofit.patch +++ /dev/null @@ -1,12 +0,0 @@ -diff --git a/tests/test-paged-kv-e2e.cpp b/tests/test-paged-kv-e2e.cpp -index 5a352e3..06ead50 100644 ---- a/tests/test-paged-kv-e2e.cpp -+++ b/tests/test-paged-kv-e2e.cpp -@@ -115,6 +115,7 @@ static path_result run_paged(const std::string & model_path) { - params.sampling.temp = 0.0f; // greedy - params.warmup = false; - params.kv_paged = true; -+ params.fit_params = false; // honor explicit n_gpu_blocks; GB10 dev_memory over-reports free VRAM - params.n_gpu_blocks = 64; - params.n_cpu_blocks = 16; - params.n_sequences = 1; diff --git a/backend/cpp/llama-cpp/paged/tests/test_block_pool.cpp b/backend/cpp/llama-cpp/paged/tests/test_block_pool.cpp deleted file mode 100644 index a896fb1e8..000000000 --- a/backend/cpp/llama-cpp/paged/tests/test_block_pool.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include "../paged_kv_manager.h" -#include -#include -using namespace paged; - -int main() { - BlockPool pool(/*num_blocks=*/8, /*enable_caching=*/true); - // block 0 is reserved as null_block (vLLM pops one at init) - assert(pool.null_block != nullptr && pool.null_block->block_id == 0); - assert(pool.get_num_free_blocks() == 7); - - // get_new_blocks sets ref_cnt=1 and removes from free list - auto b = pool.get_new_blocks(2); - assert(b.size() == 2 && b[0]->ref_cnt == 1 && b[1]->ref_cnt == 1); - assert(pool.get_num_free_blocks() == 5); - - // cache two full blocks with chained hashes, then look them up - std::vector hashes = {1111, 2222}; - pool.cache_full_blocks(b, /*num_cached=*/0, /*num_full=*/2, hashes); - assert(b[0]->has_hash && b[0]->block_hash == 1111); - assert(pool.get_cached_block(1111) == b[0]); - assert(pool.get_cached_block(2222) == b[1]); - assert(pool.get_cached_block(9999) == nullptr); - - // free: hashed blocks go to tail (kept warm), so they remain queryable. - pool.free_blocks(b); - assert(b[0]->ref_cnt == 0); - assert(pool.get_num_free_blocks() == 7); - assert(pool.get_cached_block(1111) == b[0]); // still cached/warm - - // touch a warm cached block: pulls it out of free list, ++ref_cnt - pool.touch({b[0]}); - assert(b[0]->ref_cnt == 1); - assert(pool.get_num_free_blocks() == 6); - - // exhausting the pool then allocating evicts a warm cached hash - auto rest = pool.get_new_blocks(pool.get_num_free_blocks()); - (void) rest; - assert(pool.get_cached_block(2222) == nullptr); // evicted on reuse - printf("test_block_pool: OK\n"); - return 0; -} diff --git a/backend/cpp/llama-cpp/paged/tests/test_free_block_queue.cpp b/backend/cpp/llama-cpp/paged/tests/test_free_block_queue.cpp deleted file mode 100644 index f799f2a5e..000000000 --- a/backend/cpp/llama-cpp/paged/tests/test_free_block_queue.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "../paged_kv_manager.h" -#include -#include -#include - -using namespace paged; - -static std::vector make_blocks(int n) { - std::vector v; - v.reserve(n); - for (int i = 0; i < n; ++i) v.push_back(KVCacheBlock{i}); - return v; -} - -int main() { - // ordered 0..9 at init; popleft yields ascending block_ids - auto blocks = make_blocks(10); - std::vector ptrs; - for (auto& b : blocks) ptrs.push_back(&b); - FreeBlockQueue q(ptrs); - assert(q.num_free_blocks == 10); - - KVCacheBlock* b0 = q.popleft(); - assert(b0->block_id == 0); - assert(q.num_free_blocks == 9); - - auto two = q.popleft_n(2); // {1,2} - assert(two.size() == 2 && two[0]->block_id == 1 && two[1]->block_id == 2); - assert(q.num_free_blocks == 7); - - // O(1) middle removal: remove block 5 (currently free), count drops - q.remove(ptrs[5]); - assert(q.num_free_blocks == 6); // free: 3,4,6,7,8,9 - - // append puts a block at the tail; it comes back out only after the rest - q.append(b0); // free order now: 3,4,6,7,8,9,0 - assert(q.num_free_blocks == 7); - auto all = q.get_all_free_blocks(); - assert(all.front()->block_id == 3); - assert(all.back()->block_id == 0); - - printf("test_free_block_queue: OK\n"); - return 0; -} diff --git a/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_attn.cpp b/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_attn.cpp deleted file mode 100644 index 0a8b59ff7..000000000 --- a/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_attn.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// Phase 2 (core numeric de-risk): attention over GATHERED paged KV must equal -// an independent host-computed reference. -// -// This answers the central risk in the design: feeding gather-to-scratch KV -// (a sequence whose blocks are non-contiguous in the shared pool) into ggml's -// standard attention ops (mul_mat -> soft_max_ext -> mul_mat) produces correct -// attention. If this holds, the paged read path is numerically sound; the -// remaining work is wiring it into llama-graph.cpp (Gate 0 in a real model). - -#include "../paged_kv_manager.h" - -#include "ggml.h" -#include "ggml-cpu.h" -#include "ggml-alloc.h" -#include "ggml-backend.h" - -#include -#include -#include -#include - -using namespace paged; - -int main() { - const int d = 8; // head dim - const int n_kv = 48; // 3 blocks worth of KV tokens - const int n_q = 4; // query tokens - const int block_size = 16; - const int num_blocks = 8; - const int total_slots = block_size * num_blocks; - const float scale = 1.0f / std::sqrt((float) d); - - // Non-contiguous physical layout for the KV sequence (blocks [2,1,5]). - PagedKVManager m(num_blocks, block_size, /*enable_caching=*/false); - assert(m.allocate(0, 2 * block_size)); - assert(m.allocate(1, 2 * block_size)); - m.free(0); - assert(m.allocate(2, n_kv)); - std::vector positions(n_kv); - for (int i = 0; i < n_kv; ++i) positions[i] = i; - auto slots64 = m.slot_mapping(2, positions); - std::vector slots32(slots64.begin(), slots64.end()); - - // Deterministic K, V, Q in logical [d, n] layout (column-major: col = token). - std::vector K(d * n_kv), V(d * n_kv), Q(d * n_q); - for (int t = 0; t < n_kv; ++t) - for (int e = 0; e < d; ++e) { - K[t * d + e] = std::sin(0.1f * t + 0.3f * e); - V[t * d + e] = std::cos(0.2f * t - 0.1f * e); - } - for (int q = 0; q < n_q; ++q) - for (int e = 0; e < d; ++e) Q[q * d + e] = std::sin(0.05f * q + 0.7f * e); - - // ---- Independent host reference attention ------------------------------- - std::vector ref(d * n_q, 0.0f); - for (int q = 0; q < n_q; ++q) { - std::vector score(n_kv); - float mx = -1e30f; - for (int t = 0; t < n_kv; ++t) { - float dot = 0.0f; - for (int e = 0; e < d; ++e) dot += K[t * d + e] * Q[q * d + e]; - score[t] = dot * scale; - mx = std::fmax(mx, score[t]); - } - float sum = 0.0f; - for (int t = 0; t < n_kv; ++t) { score[t] = std::exp(score[t] - mx); sum += score[t]; } - for (int t = 0; t < n_kv; ++t) { - float p = score[t] / sum; - for (int e = 0; e < d; ++e) ref[q * d + e] += p * V[t * d + e]; - } - } - - // ---- ggml paged path ---------------------------------------------------- - ggml_backend_t backend = ggml_backend_cpu_init(); - struct ggml_init_params dp = { ggml_tensor_overhead() * 16, NULL, true }; - struct ggml_context * ctx_data = ggml_init(dp); - - struct ggml_tensor * poolK = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, total_slots); - struct ggml_tensor * poolV = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, total_slots); - struct ggml_tensor * kSrc = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_kv); - struct ggml_tensor * vSrc = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_kv); - struct ggml_tensor * qT = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_q); - struct ggml_tensor * wIdx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I64, n_kv); - struct ggml_tensor * gIdx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I32, n_kv); - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_data, backend); - std::vector zeros(d * total_slots, 0.0f); - ggml_backend_tensor_set(poolK, zeros.data(), 0, ggml_nbytes(poolK)); - ggml_backend_tensor_set(poolV, zeros.data(), 0, ggml_nbytes(poolV)); - ggml_backend_tensor_set(kSrc, K.data(), 0, ggml_nbytes(kSrc)); - ggml_backend_tensor_set(vSrc, V.data(), 0, ggml_nbytes(vSrc)); - ggml_backend_tensor_set(qT, Q.data(), 0, ggml_nbytes(qT)); - ggml_backend_tensor_set(wIdx, slots64.data(), 0, ggml_nbytes(wIdx)); - ggml_backend_tensor_set(gIdx, slots32.data(), 0, ggml_nbytes(gIdx)); - - struct ggml_init_params cp = { ggml_tensor_overhead() * 64 + ggml_graph_overhead(), NULL, true }; - struct ggml_context * ctx = ggml_init(cp); - - struct ggml_tensor * wroteK = ggml_set_rows(ctx, poolK, kSrc, wIdx); - struct ggml_tensor * wroteV = ggml_set_rows(ctx, poolV, vSrc, wIdx); - struct ggml_tensor * gK = ggml_get_rows(ctx, wroteK, gIdx); // [d, n_kv] - struct ggml_tensor * gV = ggml_get_rows(ctx, wroteV, gIdx); // [d, n_kv] - - struct ggml_tensor * kq = ggml_mul_mat(ctx, gK, qT); // [n_kv, n_q] - struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, NULL, scale, 0.0f); - struct ggml_tensor * vT = ggml_cont(ctx, ggml_transpose(ctx, gV)); // [n_kv, d] - struct ggml_tensor * out = ggml_mul_mat(ctx, vT, probs); // [d, n_q] - ggml_set_output(out); - - struct ggml_cgraph * gf = ggml_new_graph(ctx); - ggml_build_forward_expand(gf, out); - ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - assert(ggml_gallocr_alloc_graph(galloc, gf)); - assert(ggml_backend_graph_compute(backend, gf) == GGML_STATUS_SUCCESS); - - std::vector got(d * n_q); - ggml_backend_tensor_get(out, got.data(), 0, ggml_nbytes(out)); - - // ---- compare ------------------------------------------------------------ - double max_err = 0.0; - for (int i = 0; i < d * n_q; ++i) max_err = std::fmax(max_err, std::fabs(got[i] - ref[i])); - printf("paged attention max abs err vs host reference: %.3e\n", max_err); - assert(max_err < 1e-4 && "paged-gathered attention must match host reference"); - - ggml_gallocr_free(galloc); - ggml_free(ctx); - ggml_free(ctx_data); - ggml_backend_buffer_free(buf); - ggml_backend_free(backend); - - printf("test_ggml_paged_attn: OK (attention over non-contiguous paged KV matches reference)\n"); - return 0; -} diff --git a/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_rw.cpp b/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_rw.cpp deleted file mode 100644 index 4f5032695..000000000 --- a/backend/cpp/llama-cpp/paged/tests/test_ggml_paged_rw.cpp +++ /dev/null @@ -1,142 +0,0 @@ -// Phase 1 integration test: prove the paged KV write+read MECHANISM at the -// ggml-op level, driven by PagedKVManager. -// -// write: ggml_set_rows(pool, k_src, slot_mapping) // scatter by slot -// read: ggml_get_rows(pool, gather_idx) // gather seq's slots -// -// The decisive property: a sequence's physical blocks are NON-CONTIGUOUS and -// OUT-OF-ORDER (forced via allocate/free/reallocate), yet gather(write(x)) == x, -// and a second sequence written into disjoint blocks does not contaminate it. -// This is exactly how a paged read path feeds contiguous scratch to attention. - -#include "../paged_kv_manager.h" - -#include "ggml.h" -#include "ggml-cpu.h" -#include "ggml-alloc.h" -#include "ggml-backend.h" - -#include -#include -#include -#include - -using namespace paged; - -int main() { - const int n_embd = 8; - const int block_size = 16; - const int num_blocks = 8; // block 0 reserved as null - const int total_slots = block_size * num_blocks; // 128 - - // --- Force a non-contiguous, out-of-order block layout for seqC ---------- - PagedKVManager m(num_blocks, block_size, /*enable_caching=*/false); - assert(m.allocate(/*seqA=*/0, 2 * block_size)); // blocks {1,2} - assert(m.allocate(/*seqB=*/1, 2 * block_size)); // blocks {3,4} - m.free(0); // returns {1,2} to free list - assert(m.allocate(/*seqC=*/2, 3 * block_size)); // reuses freed blocks, reordered - - auto btC = m.block_table(2); - auto btB = m.block_table(1); - printf("seqC block_table = ["); - for (size_t i = 0; i < btC.size(); ++i) printf("%s%d", i ? "," : "", btC[i]); - printf("]\n"); - assert(btC.size() == 3); - // sanity: seqC and seqB occupy disjoint physical blocks - for (int cb : btC) for (int bb : btB) assert(cb != bb); - - const int n_tokens = 3 * block_size; // 48 tokens for seqC - - // slot_mapping for seqC positions 0..n_tokens-1 - std::vector positions(n_tokens); - for (int i = 0; i < n_tokens; ++i) positions[i] = i; - std::vector slots64 = m.slot_mapping(2, positions); // I64 for set_rows - std::vector slots32(slots64.begin(), slots64.end()); // I32 for get_rows - - // seqB occupies different blocks; write a sentinel there to prove isolation. - std::vector posB(2 * block_size); - for (size_t i = 0; i < posB.size(); ++i) posB[i] = (int) i; - std::vector slotsB64 = m.slot_mapping(1, posB); - - // --- ggml backend + persistent (statically allocated) tensors ------------ - ggml_backend_t backend = ggml_backend_cpu_init(); - assert(backend); - - struct ggml_init_params dp = { /*mem_size=*/ ggml_tensor_overhead() * 16, - /*mem_buffer=*/ NULL, /*no_alloc=*/ true }; - struct ggml_context * ctx_data = ggml_init(dp); - - // The shared paged KV pool: one flat block pool, exactly like a paged layer. - struct ggml_tensor * pool = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, n_embd, total_slots); - struct ggml_tensor * k_src = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, n_embd, n_tokens); - struct ggml_tensor * w_idx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I64, n_tokens); - struct ggml_tensor * g_idx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I32, n_tokens); - struct ggml_tensor * kB_src = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, n_embd, (int) posB.size()); - struct ggml_tensor * wB_idx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I64, (int) posB.size()); - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_data, backend); - assert(buf); - - // pool starts zeroed - std::vector zeros(n_embd * total_slots, 0.0f); - ggml_backend_tensor_set(pool, zeros.data(), 0, ggml_nbytes(pool)); - - // token t carries the value (float) t in every embedding lane -> easy to verify - std::vector ksrc(n_embd * n_tokens); - for (int t = 0; t < n_tokens; ++t) - for (int e = 0; e < n_embd; ++e) ksrc[t * n_embd + e] = (float) t; - ggml_backend_tensor_set(k_src, ksrc.data(), 0, ggml_nbytes(k_src)); - ggml_backend_tensor_set(w_idx, slots64.data(), 0, ggml_nbytes(w_idx)); - ggml_backend_tensor_set(g_idx, slots32.data(), 0, ggml_nbytes(g_idx)); - - // seqB sentinel = 999 everywhere - std::vector kBsrc(n_embd * posB.size(), 999.0f); - ggml_backend_tensor_set(kB_src, kBsrc.data(), 0, ggml_nbytes(kB_src)); - ggml_backend_tensor_set(wB_idx, slotsB64.data(), 0, ggml_nbytes(wB_idx)); - - // --- compute graph: write seqB, write seqC, then gather seqC ------------- - struct ggml_init_params cp = { /*mem_size=*/ ggml_tensor_overhead() * 32 + ggml_graph_overhead(), - /*mem_buffer=*/ NULL, /*no_alloc=*/ true }; - struct ggml_context * ctx = ggml_init(cp); - - struct ggml_tensor * wroteB = ggml_set_rows(ctx, pool, kB_src, wB_idx); // view(pool) - struct ggml_tensor * wroteC = ggml_set_rows(ctx, wroteB, k_src, w_idx); // chain so order is fixed - struct ggml_tensor * gathered = ggml_get_rows(ctx, wroteC, g_idx); - ggml_set_output(gathered); - - struct ggml_cgraph * gf = ggml_new_graph(ctx); - ggml_build_forward_expand(gf, gathered); - - ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); - assert(ggml_gallocr_alloc_graph(galloc, gf)); - - assert(ggml_backend_graph_compute(backend, gf) == GGML_STATUS_SUCCESS); - - // --- verify gather(write(x)) == x for the non-contiguous sequence -------- - std::vector out(n_embd * n_tokens); - ggml_backend_tensor_get(gathered, out.data(), 0, ggml_nbytes(gathered)); - - int mism = 0; - for (int t = 0; t < n_tokens; ++t) - for (int e = 0; e < n_embd; ++e) - if (std::fabs(out[t * n_embd + e] - (float) t) > 1e-6f) mism++; - assert(mism == 0 && "gathered paged KV must equal source (round-trip)"); - - // --- verify isolation: read seqC slots directly from pool, unaffected by seqB - std::vector pool_host(n_embd * total_slots); - ggml_backend_tensor_get(pool, pool_host.data(), 0, ggml_nbytes(pool)); - for (int t = 0; t < n_tokens; ++t) { - int slot = (int) slots64[t]; - for (int e = 0; e < n_embd; ++e) - assert(std::fabs(pool_host[slot * n_embd + e] - (float) t) < 1e-6f); - } - - ggml_gallocr_free(galloc); - ggml_free(ctx); - ggml_free(ctx_data); - ggml_backend_buffer_free(buf); - ggml_backend_free(backend); - - printf("test_ggml_paged_rw: OK (non-contiguous paged write/gather round-trip)\n"); - return 0; -} diff --git a/backend/cpp/llama-cpp/paged/tests/test_paged_kv_manager.cpp b/backend/cpp/llama-cpp/paged/tests/test_paged_kv_manager.cpp deleted file mode 100644 index b4f63c3a0..000000000 --- a/backend/cpp/llama-cpp/paged/tests/test_paged_kv_manager.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include "../paged_kv_manager.h" -#include -#include -using namespace paged; - -int main() { - PagedKVManager m(/*num_blocks=*/8, /*block_size=*/16, /*enable_caching=*/false); - // 20 tokens -> ceil(20/16)=2 blocks - assert(m.allocate(/*seq=*/0, 20)); - auto bt = m.block_table(0); - assert(bt.size() == 2); - - // slot arithmetic: pos 0 -> block bt[0]*16 + 0 ; pos 17 -> bt[1]*16 + 1 - assert(m.slot(0, 0) == (int64_t)bt[0] * 16 + 0); - assert(m.slot(0, 17) == (int64_t)bt[1] * 16 + 1); - - auto sm = m.slot_mapping(0, {0, 16, 17}); - assert(sm.size() == 3 && sm[1] == (int64_t)bt[1] * 16 + 0); - - // growing the same seq reuses existing blocks, adds only new ones - assert(m.allocate(0, 40)); // ceil(40/16)=3 -> +1 block - assert(m.block_table(0).size() == 3); - - // OOM: blocks left = 8 - 1(null) - 3 = 4 blocks; ask for 5 blocks - assert(m.allocate(1, 5 * 16) == false); - - // free returns blocks to the pool for reuse - m.free(0); - assert(m.allocate(1, 5 * 16)); // now fits - printf("test_paged_kv_manager: OK\n"); - return 0; -} diff --git a/backend/cpp/llama-cpp/paged/tests/test_prefix_cache.cpp b/backend/cpp/llama-cpp/paged/tests/test_prefix_cache.cpp deleted file mode 100644 index b8151936a..000000000 --- a/backend/cpp/llama-cpp/paged/tests/test_prefix_cache.cpp +++ /dev/null @@ -1,35 +0,0 @@ -#include "../paged_kv_manager.h" -#include -#include -#include -using namespace paged; - -int main() { - PagedKVManager m(/*num_blocks=*/64, /*block_size=*/16, /*enable_caching=*/true); - - // shared prefix of 32 tokens (2 full blocks) + distinct suffix - std::vector shared(32); - for (int i = 0; i < 32; ++i) shared[i] = 100 + i; - - // chained hashing is deterministic and prefix-sensitive - auto h = m.compute_block_hashes(shared); - assert(h.size() == 2); - auto h2 = m.compute_block_hashes(shared); - assert(h == h2); // deterministic - std::vector other = shared; other[0] = 999; - assert(m.compute_block_hashes(other)[0] != h[0]); // sensitive to content - - // seq 0: cold, no cache hit yet - assert(m.get_computed_blocks(h) == 0); - assert(m.allocate(0, 32)); - m.cache_blocks(0, h, 32); - - // seq 1: warm — the 2 shared blocks are a cache hit (32 tokens) - assert(m.get_computed_blocks(h) == 32); - - // first-miss stop: a chain that diverges after block 1 hits only 1 block - auto hmix = h; hmix[1] = 0xDEADBEEF; - assert(m.get_computed_blocks(hmix) == 16); - printf("test_prefix_cache: OK\n"); - return 0; -} diff --git a/backend/cpp/llama-cpp/prepare.sh b/backend/cpp/llama-cpp/prepare.sh index 370af4215..b55e89f0f 100644 --- a/backend/cpp/llama-cpp/prepare.sh +++ b/backend/cpp/llama-cpp/prepare.sh @@ -2,30 +2,18 @@ ## Patches -## Apply patches: the base `patches/` series, then the gated `patches/paged/` -## series (default on; LLAMA_PAGED=off skips it). Only *.patch files are applied -## (docs/dirs like patches/paged/ and *.md are skipped). The Makefile `llama.cpp` -## target already `git apply`s these at checkout, so each apply is guarded by a -## sentinel and skipped when already present - re-applying git-format patches with -## `patch` fuzzily duplicates hunks (redefinition errors). This block only does -## real work if prepare.sh is run against an unpatched checkout. +## Apply the base `patches/` series (top-level *.patch only; *.md/dirs skipped). +## The stock llama-cpp backend is patch-free by default, so this normally does +## nothing. The Makefile `llama.cpp` target already `git apply`s any base patch +## at checkout, so each apply here is `-N` (skip already-applied): re-applying a +## git-format patch with `patch` would fuzzily duplicate hunks. This block only +## does real work if prepare.sh is run against an unpatched checkout. if [ -d "patches" ]; then for patch in patches/*.patch; do [ -e "$patch" ] || continue echo "Applying patch $patch" patch -d llama.cpp/ -p1 -N -r - < "$patch" || true done - if [ "${LLAMA_PAGED:-on}" != "off" ] && [ -d "patches/paged" ]; then - if [ -f llama.cpp/src/paged-kv-manager.cpp ]; then - echo "paged-attention patch series already applied (sentinel present) - skipping re-apply" - else - for patch in patches/paged/*.patch; do - [ -e "$patch" ] || continue - echo "Applying paged patch $patch" - patch -d llama.cpp/ -p1 -N -r - < "$patch" || true - done - fi - fi fi set -e diff --git a/backend/index.yaml b/backend/index.yaml index 248738c8f..de1667d45 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -81,7 +81,7 @@ LocalAI's paged-attention llama.cpp variant: on-demand paged KV cache plus a decode-first prefill budget. The SAME upstream llama.cpp grpc-server as the stock llama-cpp backend, with the LocalAI paged patch series applied - (LLAMA_PAGED=on). Tuned for NVFP4 dense / MoE on Blackwell / GB10. Reuses the + (vendored in this backend). Tuned for NVFP4 dense / MoE on Blackwell / GB10. Reuses the llama-cpp gRPC server sources; the paged engine is gated at runtime by the paged_kv / max_batch_tokens model options. Qwen3.5 gated-DeltaNet models can additionally opt into the reduced-precision hybrid SSM-state fast mode with diff --git a/docs/content/features/backends.md b/docs/content/features/backends.md index f37819ebd..95b94c102 100644 --- a/docs/content/features/backends.md +++ b/docs/content/features/backends.md @@ -125,7 +125,7 @@ For getting started, see the available backends in LocalAI here: https://github. LocalAI supports various types of backends: - **LLM Backends**: For running language models (e.g., llama.cpp, vLLM, SGLang, transformers, MLX) - - **`llama-cpp-localai-paged`**: LocalAI's paged-attention llama.cpp variant - on-demand paged KV cache plus a decode-first prefill budget, tuned for NVFP4 dense/MoE on Blackwell/GB10. Same upstream llama.cpp pin as the stock `llama-cpp` backend, reusing its gRPC server; the paged engine is enabled per-model via the `paged_kv` / `max_batch_tokens` options. For Qwen3.5 gated-DeltaNet (hybrid SSM) models you can additionally set `options: [ssm_bf16_tau:]` to enable the reduced-precision hybrid SSM-state fast mode: fast-decaying recurrent heads (memory length tau below the threshold, e.g. `32` / `64`) persist their state as bf16, halving that head's decode byte stream. Default off (`0`) keeps every head f32 and is bit-exact; when enabled the mode is **not** bit-exact (~91% same-top-p ceiling - see `backend/cpp/llama-cpp/patches/paged/README.md` for the quality/throughput profile). + - **`llama-cpp-localai-paged`**: LocalAI's paged-attention llama.cpp variant - on-demand paged KV cache plus a decode-first prefill budget, tuned for NVFP4 dense/MoE on Blackwell/GB10. Same upstream llama.cpp pin as the stock `llama-cpp` backend, reusing its gRPC server; the paged engine is enabled per-model via the `paged_kv` / `max_batch_tokens` options. For Qwen3.5 gated-DeltaNet (hybrid SSM) models you can additionally set `options: [ssm_bf16_tau:]` to enable the reduced-precision hybrid SSM-state fast mode: fast-decaying recurrent heads (memory length tau below the threshold, e.g. `32` / `64`) persist their state as bf16, halving that head's decode byte stream. Default off (`0`) keeps every head f32 and is bit-exact; when enabled the mode is **not** bit-exact (~91% same-top-p ceiling - see `backend/cpp/llama-cpp-localai-paged/patches/paged/README.md` for the quality/throughput profile). - **Speech-to-Text Backends**: For transcription (e.g., whisper.cpp, parakeet.cpp, faster-whisper, NeMo) - **Text-to-Speech Backends**: For speech synthesis (e.g., piper, Kokoro, VibeVoice, Qwen3-TTS) - **Sound Generation Backends**: For music and audio generation (e.g., ACE-Step) diff --git a/gallery/index.yaml b/gallery/index.yaml index 63cddb103..0fd5a3fe0 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -2,7 +2,7 @@ # ============================================================================= # NVFP4 Qwen3.6 (dense + MoE) for the LocalAI paged-attention llama.cpp backend. # These reproduce the GB10 / DGX Spark benchmark serving config (see -# backend/cpp/llama-cpp/patches/paged/LOCALAI_LLAMACPP_BACKEND_PLAN.md section 2). +# backend/cpp/llama-cpp-localai-paged/patches/paged/LOCALAI_LLAMACPP_BACKEND_PLAN.md section 2). # # PUBLISHED: the dense + MoE base NVFP4 GGUFs are live at huggingface.co/mudler/ # Qwen3.6-27B-NVFP4-GGUF and .../Qwen3.6-35B-A3B-NVFP4-GGUF (file_type MOSTLY_NVFP4); @@ -20,7 +20,7 @@ # persist their state as bf16 (LLAMA_SSM_BF16_TAU), halving that head's decode byte # stream. Default off (0) = every head f32 = bit-exact; when enabled the mode is NOT # bit-exact (~91% same-top-p, beats vLLM dense) - see -# backend/cpp/llama-cpp/patches/paged/README.md for the quality profile. +# backend/cpp/llama-cpp-localai-paged/patches/paged/README.md for the quality profile. # The two NVFP4 entries below intentionally stay bit-exact (no ssm_bf16_tau). # ============================================================================= - name: "qwen3.6-27b-nvfp4-paged" diff --git a/scripts/build/llama-cpp-localai-paged-darwin.sh b/scripts/build/llama-cpp-localai-paged-darwin.sh index 9d205bd7e..6a5779079 100755 --- a/scripts/build/llama-cpp-localai-paged-darwin.sh +++ b/scripts/build/llama-cpp-localai-paged-darwin.sh @@ -6,7 +6,8 @@ set -ex # scripts/build/llama-cpp-darwin.sh exactly, swapping the build dir, binary names, # shared-lib dir and output tar for the paged wrapper. The paged wrapper Makefile # (backend/cpp/llama-cpp-localai-paged) reuses backend/cpp/llama-cpp's CMakeLists -# /grpc-server with LLAMA_PAGED=on, so the Darwin/Metal path is identical: ggml +# /grpc-server and applies its own vendored paged patch series (patches/paged/) +# onto the cloned tree, so the Darwin/Metal path is identical: ggml # CPU_ALL_VARIANTS + GGML_METAL=ON, and --target ggml pulls in ggml-metal via # add_dependencies so the Metal GPU backend is produced as a loadable # libggml-metal.dylib. The new paged GDN/conv ops have no Metal kernel, so a