mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-23 16:19:07 -04:00
Compare commits
59 Commits
worktree-f
...
feat/recon
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f9a465ee25 | ||
|
|
48e22da165 | ||
|
|
f940dc858a | ||
|
|
f6d93591bd | ||
|
|
594576f440 | ||
|
|
5614b39782 | ||
|
|
b4f7a36d6d | ||
|
|
c6170b875d | ||
|
|
a9c7484986 | ||
|
|
e05dece93c | ||
|
|
7c2a347e79 | ||
|
|
6e0c491380 | ||
|
|
2bcdfe2a68 | ||
|
|
b843f498ca | ||
|
|
46d7d59a82 | ||
|
|
e3bca9a172 | ||
|
|
a19ab22186 | ||
|
|
91d08d88e6 | ||
|
|
2c5ed413cb | ||
|
|
01e098a844 | ||
|
|
600dafd20b | ||
|
|
ce8a3e9266 | ||
|
|
a88d9d2de3 | ||
|
|
1cf1bf32e1 | ||
|
|
f45c6acc54 | ||
|
|
1a1bd57469 | ||
|
|
1f29e96030 | ||
|
|
64560a974b | ||
|
|
32c47706ae | ||
|
|
e58870a573 | ||
|
|
8fab1d2e45 | ||
|
|
7b462a0d51 | ||
|
|
aed181e6c1 | ||
|
|
a556cd9afc | ||
|
|
b50b1fe418 | ||
|
|
b4c0dc67fe | ||
|
|
01fa12e0de | ||
|
|
cf7f9573a2 | ||
|
|
c6303104c7 | ||
|
|
3e96d811b7 | ||
|
|
23f225260c | ||
|
|
aef10723c9 | ||
|
|
9565db5f94 | ||
|
|
e19c43cf04 | ||
|
|
b081247d95 | ||
|
|
1be959ce30 | ||
|
|
518381278e | ||
|
|
93706fec57 | ||
|
|
11aee03a80 | ||
|
|
8915f2ab91 | ||
|
|
f143d7f688 | ||
|
|
dd928f0bdd | ||
|
|
c43a752afc | ||
|
|
079ac0e15a | ||
|
|
2e734bf560 | ||
|
|
72d46c1115 | ||
|
|
606128e4e9 | ||
|
|
59c7ad5153 | ||
|
|
78d682224a |
@@ -198,6 +198,27 @@ docker-build-backends: ... docker-build-<backend-name>
|
|||||||
- If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
|
- If the backend is in `backend/python/<backend-name>/` but uses `.` as context in the workflow file, use `.` context
|
||||||
- Check similar backends to determine the correct context
|
- Check similar backends to determine the correct context
|
||||||
|
|
||||||
|
## Documenting the backend (README + docs)
|
||||||
|
|
||||||
|
A backend is not "added" until it is discoverable. Update the user-facing docs:
|
||||||
|
|
||||||
|
- **`docs/content/features/backends.md`** - add the backend to the right
|
||||||
|
category in the "LocalAI supports various types of backends" list (and add a
|
||||||
|
new category if it introduces a new modality, e.g. sound classification).
|
||||||
|
- If the backend introduces a **new API surface** (a new endpoint or a realtime
|
||||||
|
capability), document it under `docs/content/` where its area lives (audio,
|
||||||
|
vision, etc.) and follow the api-endpoints checklist in
|
||||||
|
[api-endpoints-and-auth.md](api-endpoints-and-auth.md).
|
||||||
|
|
||||||
|
**If the backend is a native C/C++/GGML engine created and maintained by the
|
||||||
|
LocalAI team** (a from-scratch port like `parakeet.cpp`, `ced.cpp`,
|
||||||
|
`vibevoice.cpp`, `rf-detr.cpp`, not a wrapper around a third-party runtime), it
|
||||||
|
ALSO belongs in the top-level **`README.md`** table under "native C/C++/GGML
|
||||||
|
engines ... developed and maintained by the LocalAI project itself". Add a row
|
||||||
|
linking the upstream engine repo with a one-line description. This is the
|
||||||
|
project's showcase of its own engines; a new in-house backend that is missing
|
||||||
|
from it is a documentation bug.
|
||||||
|
|
||||||
## 5. Verification Checklist
|
## 5. Verification Checklist
|
||||||
|
|
||||||
After adding a new backend, verify:
|
After adding a new backend, verify:
|
||||||
@@ -211,6 +232,8 @@ After adding a new backend, verify:
|
|||||||
- [ ] No YAML syntax errors (check with linter)
|
- [ ] No YAML syntax errors (check with linter)
|
||||||
- [ ] No Makefile syntax errors (check with linter)
|
- [ ] No Makefile syntax errors (check with linter)
|
||||||
- [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
|
- [ ] Follows the same pattern as similar backends (e.g., if it's a transcription backend, follow `faster-whisper` pattern)
|
||||||
|
- [ ] Documented: added to the category list in `docs/content/features/backends.md` (and any new endpoint/realtime capability documented under `docs/content/`)
|
||||||
|
- [ ] If it is an in-house native C/C++/GGML engine, added to the maintained-engines table in the top-level `README.md`
|
||||||
|
|
||||||
## Bundling runtime shared libraries (`package.sh`)
|
## Bundling runtime shared libraries (`package.sh`)
|
||||||
|
|
||||||
|
|||||||
@@ -70,6 +70,12 @@ if [ "${BUILD_TYPE:-}" = "vulkan" ] && [ "${SKIP_DRIVERS:-false}" = "false" ]; t
|
|||||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||||
|
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe + Arm SoC) and their ICD
|
||||||
|
# manifests. The LunarG SDK below only provides the loader and shader
|
||||||
|
# tooling, not hardware drivers — without Mesa the packaged Vulkan backend
|
||||||
|
# would ship a loader that finds no GPU. package-gpu-libs.sh bundles these
|
||||||
|
# .so files plus their deps into the backend so it stays self-contained.
|
||||||
|
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||||
if [ "amd64" = "${TARGETARCH:-}" ]; then
|
if [ "amd64" = "${TARGETARCH:-}" ]; then
|
||||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
|
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz"
|
||||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
|
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz
|
||||||
|
|||||||
456
.github/backend-matrix.yml
vendored
456
.github/backend-matrix.yml
vendored
@@ -3575,6 +3575,450 @@ include:
|
|||||||
dockerfile: "./backend/Dockerfile.golang"
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2404'
|
ubuntu-version: '2404'
|
||||||
|
# ced
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "8"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-12-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-13-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-cuda-13-arm64-ced'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
- build-type: ''
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
platform-tag: 'amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-cpu-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: ''
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
platform-tag: 'arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-cpu-ced'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f32'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f32-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f16'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f16-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
platform-tag: 'amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-vulkan-ced'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
platform-tag: 'arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-vulkan-ced'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-arm64-ced'
|
||||||
|
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2204'
|
||||||
|
- build-type: 'hipblas'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-rocm-hipblas-ced'
|
||||||
|
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "ced"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
# voice-detect
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "8"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-12-voice-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-13-voice-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-cuda-13-arm64-voice-detect'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
- build-type: ''
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
platform-tag: 'amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-cpu-voice-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: ''
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
platform-tag: 'arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-cpu-voice-detect'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f32'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f32-voice-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f16'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f16-voice-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
platform-tag: 'amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-vulkan-voice-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
platform-tag: 'arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-vulkan-voice-detect'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-arm64-voice-detect'
|
||||||
|
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2204'
|
||||||
|
- build-type: 'hipblas'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-rocm-hipblas-voice-detect'
|
||||||
|
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "voice-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
# face-detect
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "8"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-12-face-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-13-face-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-cuda-13-arm64-face-detect'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
- build-type: ''
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
platform-tag: 'amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-cpu-face-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: ''
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
platform-tag: 'arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-cpu-face-detect'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f32'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f32-face-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f16'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f16-face-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
platform-tag: 'amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-vulkan-face-detect'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
platform-tag: 'arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-vulkan-face-detect'
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-arm64-face-detect'
|
||||||
|
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2204'
|
||||||
|
- build-type: 'hipblas'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-rocm-hipblas-face-detect'
|
||||||
|
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "face-detect"
|
||||||
|
dockerfile: "./backend/Dockerfile.golang"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
# acestep-cpp
|
# acestep-cpp
|
||||||
- build-type: ''
|
- build-type: ''
|
||||||
cuda-major-version: ""
|
cuda-major-version: ""
|
||||||
@@ -4754,6 +5198,18 @@ includeDarwin:
|
|||||||
tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
|
tag-suffix: "-metal-darwin-arm64-parakeet-cpp"
|
||||||
build-type: "metal"
|
build-type: "metal"
|
||||||
lang: "go"
|
lang: "go"
|
||||||
|
- backend: "ced"
|
||||||
|
tag-suffix: "-metal-darwin-arm64-ced"
|
||||||
|
build-type: "metal"
|
||||||
|
lang: "go"
|
||||||
|
- backend: "voice-detect"
|
||||||
|
tag-suffix: "-metal-darwin-arm64-voice-detect"
|
||||||
|
build-type: "metal"
|
||||||
|
lang: "go"
|
||||||
|
- backend: "face-detect"
|
||||||
|
tag-suffix: "-metal-darwin-arm64-face-detect"
|
||||||
|
build-type: "metal"
|
||||||
|
lang: "go"
|
||||||
- backend: "acestep-cpp"
|
- backend: "acestep-cpp"
|
||||||
tag-suffix: "-metal-darwin-arm64-acestep-cpp"
|
tag-suffix: "-metal-darwin-arm64-acestep-cpp"
|
||||||
build-type: "metal"
|
build-type: "metal"
|
||||||
|
|||||||
12
.github/workflows/bump_deps.yaml
vendored
12
.github/workflows/bump_deps.yaml
vendored
@@ -42,6 +42,18 @@ jobs:
|
|||||||
variable: "PARAKEET_VERSION"
|
variable: "PARAKEET_VERSION"
|
||||||
branch: "master"
|
branch: "master"
|
||||||
file: "backend/go/parakeet-cpp/Makefile"
|
file: "backend/go/parakeet-cpp/Makefile"
|
||||||
|
- repository: "mudler/ced.cpp"
|
||||||
|
variable: "CED_VERSION"
|
||||||
|
branch: "master"
|
||||||
|
file: "backend/go/ced/Makefile"
|
||||||
|
- repository: "mudler/voice-detect.cpp"
|
||||||
|
variable: "VOICEDETECT_VERSION"
|
||||||
|
branch: "master"
|
||||||
|
file: "backend/go/voice-detect/Makefile"
|
||||||
|
- repository: "mudler/face-detect.cpp"
|
||||||
|
variable: "FACEDETECT_VERSION"
|
||||||
|
branch: "master"
|
||||||
|
file: "backend/go/face-detect/Makefile"
|
||||||
- repository: "mudler/depth-anything.cpp"
|
- repository: "mudler/depth-anything.cpp"
|
||||||
variable: "DEPTHANYTHING_VERSION"
|
variable: "DEPTHANYTHING_VERSION"
|
||||||
branch: "master"
|
branch: "master"
|
||||||
|
|||||||
@@ -231,6 +231,7 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
|
|||||||
| Backend | What it does |
|
| Backend | What it does |
|
||||||
|---------|-------------|
|
|---------|-------------|
|
||||||
| [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
|
| [parakeet.cpp](https://github.com/mudler/parakeet.cpp) | C++/GGML port of NVIDIA NeMo Parakeet ASR (tdt/ctc/rnnt/hybrid), with cache-aware streaming transcription |
|
||||||
|
| [ced.cpp](https://github.com/mudler/ced.cpp) | C++/GGML port of the CED audio-tagging models: sound-event classification (527-class AudioSet) over REST and the realtime API for live recognition |
|
||||||
| [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
|
| [voxtral.c](https://github.com/mudler/voxtral.c) | Voxtral Realtime 4B speech-to-text in pure C |
|
||||||
| [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
|
| [vibevoice.cpp](https://github.com/mudler/vibevoice.cpp) | Native port of Microsoft VibeVoice for TTS (voice cloning) and long-form ASR with speaker diarization |
|
||||||
| [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
|
| [rf-detr.cpp](https://github.com/mudler/rf-detr.cpp) | Native RF-DETR object detection and instance segmentation |
|
||||||
@@ -240,6 +241,8 @@ Most backends wrap a best-in-class upstream engine. A handful of them are native
|
|||||||
| [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
|
| [LocalVQE](https://github.com/localai-org/LocalVQE) | Joint acoustic echo cancellation, noise suppression, and dereverberation |
|
||||||
| [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |
|
| [local-store](https://github.com/mudler/LocalAI) | Local-first vector database for embeddings (shipped in-tree) |
|
||||||
|
|
||||||
|
We also maintain [apex-quant](https://github.com/localai-org/apex-quant), a per-tensor, per-layer quantization recipe for Mixture-of-Experts models that exploits their structural sparsity to produce GGUFs matching or beating Q8_0 quality - and they run out of the box on stock llama.cpp.
|
||||||
|
|
||||||
## Resources
|
## Resources
|
||||||
|
|
||||||
- [Documentation](https://localai.io/)
|
- [Documentation](https://localai.io/)
|
||||||
|
|||||||
@@ -65,7 +65,12 @@ RUN <<EOT bash
|
|||||||
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
|
||||||
|
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||||
|
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
|
||||||
|
# LunarG SDK below only provides the loader and shader tooling, not
|
||||||
|
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
|
||||||
|
# bundle and the packaged backend finds no GPU at runtime.
|
||||||
if [ "amd64" = "$TARGETARCH" ]; then
|
if [ "amd64" = "$TARGETARCH" ]; then
|
||||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||||
|
|||||||
@@ -66,7 +66,12 @@ RUN <<EOT bash
|
|||||||
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||||
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||||
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||||
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils && \
|
||||||
|
apt-get install -y mesa-vulkan-drivers libdrm2
|
||||||
|
# Mesa Vulkan ICD drivers (ANV/RADV/lavapipe) + their manifests. The
|
||||||
|
# LunarG SDK below only provides the loader and shader tooling, not
|
||||||
|
# hardware drivers — without Mesa, package-gpu-libs.sh has no ICD to
|
||||||
|
# bundle and the packaged backend finds no GPU at runtime.
|
||||||
if [ "amd64" = "$TARGETARCH" ]; then
|
if [ "amd64" = "$TARGETARCH" ]; then
|
||||||
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||||
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||||
|
|||||||
@@ -24,6 +24,9 @@ service Backend {
|
|||||||
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
|
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
|
||||||
rpc Status(HealthMessage) returns (StatusResponse) {}
|
rpc Status(HealthMessage) returns (StatusResponse) {}
|
||||||
rpc Detect(DetectOptions) returns (DetectResponse) {}
|
rpc Detect(DetectOptions) returns (DetectResponse) {}
|
||||||
|
// SoundDetection runs an audio-tagging / sound-event-classification model
|
||||||
|
// (e.g. CED over the AudioSet ontology) on a clip and returns scored labels.
|
||||||
|
rpc SoundDetection(SoundDetectionRequest) returns (SoundDetectionResponse) {}
|
||||||
rpc Depth(DepthRequest) returns (DepthResponse) {}
|
rpc Depth(DepthRequest) returns (DepthResponse) {}
|
||||||
rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
|
rpc FaceVerify(FaceVerifyRequest) returns (FaceVerifyResponse) {}
|
||||||
rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
|
rpc FaceAnalyze(FaceAnalyzeRequest) returns (FaceAnalyzeResponse) {}
|
||||||
@@ -671,6 +674,24 @@ message DetectResponse {
|
|||||||
repeated Detection Detections = 1;
|
repeated Detection Detections = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- Sound-event classification / audio tagging messages (CED) ---
|
||||||
|
|
||||||
|
message SoundDetectionRequest {
|
||||||
|
string src = 1; // audio file path (LocalAI writes the upload to disk)
|
||||||
|
int32 top_k = 2; // number of top tags to return (0 = all classes)
|
||||||
|
float threshold = 3; // optional: drop tags scoring below this
|
||||||
|
}
|
||||||
|
|
||||||
|
// SoundClass is a single scored label. SoundDetectionResponse carries a
// repeated list of these in its `detections` field.
message SoundClass {
|
||||||
|
string label = 1; // AudioSet class name, e.g. "Baby cry, infant cry"
|
||||||
|
float score = 2; // per-class probability (multi-label, independent)
|
||||||
|
int32 index = 3; // class index in the model ontology
|
||||||
|
}
|
||||||
|
|
||||||
|
// SoundDetectionResponse is the output message of the SoundDetection RPC:
// the list of scored SoundClass labels for the clip.
message SoundDetectionResponse {
|
||||||
|
repeated SoundClass detections = 1; // score-descending
|
||||||
|
}
|
||||||
|
|
||||||
// --- Depth estimation messages (Depth Anything 3) ---
|
// --- Depth estimation messages (Depth Anything 3) ---
|
||||||
|
|
||||||
message DepthRequest {
|
message DepthRequest {
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
IK_LLAMA_VERSION?=b3dfb7858cfcb9166e92f366e5af87f19ebc94be
|
IK_LLAMA_VERSION?=6c00e87ac84404af588ad2e65935bd6f079c696f
|
||||||
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
||||||
|
|
||||||
CMAKE_ARGS?=
|
CMAKE_ARGS?=
|
||||||
|
|||||||
@@ -1,14 +1,6 @@
|
|||||||
|
|
||||||
LLAMA_VERSION?=f3e182816421c648188b5eab269853bf1531d950
|
LLAMA_VERSION?=e475fa2b5f9fb50c3d6fc3e7c6fdf1e004465b62
|
||||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||||
# LLAMA_PAGED controls whether the vendored paged-attention patch series
|
|
||||||
# (patches/paged/) is applied on top of the pinned llama.cpp. Default on; set
|
|
||||||
# LLAMA_PAGED=off to build a clean-against-upstream backend (e.g. to unblock a
|
|
||||||
# dep-bump if an upstream change breaks a paged hook - the paged carry is then
|
|
||||||
# fixed independently). Runtime behaviour stays gated by the LLAMA_KV_PAGED env
|
|
||||||
# regardless, so an LLAMA_PAGED=on build is byte-identical to stock until that
|
|
||||||
# env is set.
|
|
||||||
LLAMA_PAGED?=on
|
|
||||||
|
|
||||||
CMAKE_ARGS?=
|
CMAKE_ARGS?=
|
||||||
BUILD_TYPE?=
|
BUILD_TYPE?=
|
||||||
@@ -145,28 +137,14 @@ llama.cpp:
|
|||||||
git remote add origin $(LLAMA_REPO) && \
|
git remote add origin $(LLAMA_REPO) && \
|
||||||
git fetch --all --tags && \
|
git fetch --all --tags && \
|
||||||
git checkout -b build $(LLAMA_VERSION) && \
|
git checkout -b build $(LLAMA_VERSION) && \
|
||||||
git submodule update --init --recursive --depth 1 --single-branch && \
|
git submodule update --init --recursive --depth 1 --single-branch
|
||||||
for p in $(CURRENT_MAKEFILE_DIR)patches/0*.patch; do \
|
|
||||||
[ -e "$$p" ] || continue; \
|
|
||||||
echo "applying llama.cpp patch: $$p"; \
|
|
||||||
git apply --verbose "$$p" || { echo "patch failed: $$p"; exit 1; }; \
|
|
||||||
done && \
|
|
||||||
if [ "$(LLAMA_PAGED)" = "off" ]; then \
|
|
||||||
echo "LLAMA_PAGED=off: skipping paged-attention patch series"; \
|
|
||||||
else \
|
|
||||||
for p in $(CURRENT_MAKEFILE_DIR)patches/paged/0*.patch; do \
|
|
||||||
[ -e "$$p" ] || continue; \
|
|
||||||
echo "applying llama.cpp PAGED patch: $$p"; \
|
|
||||||
git apply --verbose "$$p" || { echo "paged patch failed: $$p"; exit 1; }; \
|
|
||||||
done; \
|
|
||||||
fi
|
|
||||||
|
|
||||||
llama.cpp/tools/grpc-server: llama.cpp
|
llama.cpp/tools/grpc-server: llama.cpp
|
||||||
mkdir -p llama.cpp/tools/grpc-server
|
mkdir -p llama.cpp/tools/grpc-server
|
||||||
LLAMA_PAGED=$(LLAMA_PAGED) bash prepare.sh
|
bash prepare.sh
|
||||||
|
|
||||||
rebuild:
|
rebuild:
|
||||||
LLAMA_PAGED=$(LLAMA_PAGED) bash prepare.sh
|
bash prepare.sh
|
||||||
rm -rf grpc-server
|
rm -rf grpc-server
|
||||||
$(MAKE) grpc-server
|
$(MAKE) grpc-server
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,18 @@
|
|||||||
#if __has_include("server-chat.cpp")
|
#if __has_include("server-chat.cpp")
|
||||||
#include "server-chat.cpp"
|
#include "server-chat.cpp"
|
||||||
#endif
|
#endif
|
||||||
|
// server-schema.cpp exists only in llama.cpp after the upstream refactor that
|
||||||
|
// extracted the JSON request-schema evaluation (previously the static
|
||||||
|
// server_task::params_from_json_cmpl) into server_schema::eval_llama_cmpl_schema.
|
||||||
|
// server-context.cpp and grpc-server.cpp both call into it, so its definitions
|
||||||
|
// must be part of this translation unit or the link fails. __has_include keeps
|
||||||
|
// the source compatible with older pins/forks (e.g. llama-cpp-turboquant) that
|
||||||
|
// predate the split and still expose params_from_json_cmpl (see the guarded
|
||||||
|
// call sites below).
|
||||||
|
#if __has_include("server-schema.cpp")
|
||||||
|
#define LOCALAI_HAS_SERVER_SCHEMA 1
|
||||||
|
#include "server-schema.cpp"
|
||||||
|
#endif
|
||||||
#include "server-context.cpp"
|
#include "server-context.cpp"
|
||||||
|
|
||||||
// LocalAI
|
// LocalAI
|
||||||
@@ -732,63 +744,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
|||||||
} else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
|
} else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
|
||||||
params.kv_unified = false;
|
params.kv_unified = false;
|
||||||
}
|
}
|
||||||
// --- paged KV cache (experimental, off by default) ---
|
|
||||||
// Enables the on-demand paged KV-cache engine (vendored PagedKVManager
|
|
||||||
// + paged placement/gather/alloc seams). The engine is gated inside
|
|
||||||
// llama.cpp by the LLAMA_KV_PAGED env var, evaluated once at first use;
|
|
||||||
// here we expose it as a per-server model option instead of forcing the
|
|
||||||
// operator to export a process-wide env. When enabled we set the env
|
|
||||||
// BEFORE the model/context is created (later in this handler), so the
|
|
||||||
// engine latches on. When the option is absent we touch nothing, so an
|
|
||||||
// externally exported LLAMA_KV_PAGED still works as an escape hatch.
|
|
||||||
// Note: the engine's env check is process-wide and latches on first
|
|
||||||
// use, so enabling it for one model enables it for the worker process;
|
|
||||||
// LocalAI runs one model per llama.cpp worker, so this maps cleanly to
|
|
||||||
// per-server configuration. `kv_paged_debug` turns on the per-slot
|
|
||||||
// [paged-alloc]/free trace (LLAMA_KV_PAGED_DEBUG).
|
|
||||||
//
|
|
||||||
// The continuous-batching serving loop (update_slots) drives paged KV
|
|
||||||
// transparently through the existing kv-cache seams: each slot's
|
|
||||||
// sequence allocates paged blocks on arrival (find_slot placement) and
|
|
||||||
// returns them on slot release (the seq_rm free seam). This is
|
|
||||||
// token-identical to stock under both the unified and per-sequence
|
|
||||||
// caches. The per-slot allocate/free capacity benefit, however, only
|
|
||||||
// materialises with a per-sequence cache, since paged block ownership
|
|
||||||
// is keyed by stream and the unified cache collapses every slot onto a
|
|
||||||
// single stream. Operators who want that benefit should pair this with
|
|
||||||
// `kv_unified:false`; we do NOT flip kv_unified here, to keep the
|
|
||||||
// default serving behaviour (and the idle-slot prompt cache) unchanged.
|
|
||||||
} else if (!strcmp(optname, "kv_paged") || !strcmp(optname, "paged_kv") || !strcmp(optname, "paged_attention")) {
|
|
||||||
if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
|
|
||||||
setenv("LLAMA_KV_PAGED", "1", 1);
|
|
||||||
}
|
|
||||||
} else if (!strcmp(optname, "kv_paged_debug") || !strcmp(optname, "paged_kv_debug")) {
|
|
||||||
if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
|
|
||||||
setenv("LLAMA_KV_PAGED_DEBUG", "1", 1);
|
|
||||||
}
|
|
||||||
// --- chunked-prefill QoS budget (experimental, off by default) ---
|
|
||||||
// Caps the number of prompt tokens any single slot may prefill per
|
|
||||||
// update_slots iteration, so a large prompt cannot monopolise the batch
|
|
||||||
// and freeze the in-flight decoders. The serving loop reads this budget
|
|
||||||
// from the LLAMA_PREFILL_BUDGET env var (set BEFORE context init, like
|
|
||||||
// kv_paged above) and splits oversized prompts across iterations,
|
|
||||||
// interleaving decode steps for the other slots. A 6k-token prefill that
|
|
||||||
// stalled 8 decoders ~3.4s drops to ~780ms at budget=512 (4.8x stall
|
|
||||||
// cut) with zero TTFT cost and no steady-state regression. Unset or a
|
|
||||||
// non-positive value leaves the env untouched, so the stock unbounded
|
|
||||||
// prefill behaviour is preserved (an externally exported
|
|
||||||
// LLAMA_PREFILL_BUDGET still works as an escape hatch).
|
|
||||||
} else if (!strcmp(optname, "max_prefill_tokens") || !strcmp(optname, "mpt") || !strcmp(optname, "prefill_budget")) {
|
|
||||||
if (optval != NULL) {
|
|
||||||
try {
|
|
||||||
int budget = std::stoi(optval_str);
|
|
||||||
if (budget > 0) {
|
|
||||||
setenv("LLAMA_PREFILL_BUDGET", std::to_string(budget).c_str(), 1);
|
|
||||||
}
|
|
||||||
} catch (const std::exception& e) {
|
|
||||||
// If conversion fails, leave the budget unset (stock behaviour)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (!strcmp(optname, "n_ctx_checkpoints") || !strcmp(optname, "ctx_checkpoints")) {
|
} else if (!strcmp(optname, "n_ctx_checkpoints") || !strcmp(optname, "ctx_checkpoints")) {
|
||||||
if (optval != NULL) {
|
if (optval != NULL) {
|
||||||
try {
|
try {
|
||||||
@@ -2159,7 +2114,11 @@ public:
|
|||||||
task.index = i;
|
task.index = i;
|
||||||
|
|
||||||
task.tokens = std::move(inputs[i]);
|
task.tokens = std::move(inputs[i]);
|
||||||
|
#ifdef LOCALAI_HAS_SERVER_SCHEMA
|
||||||
|
task.params = server_schema::eval_llama_cmpl_schema(
|
||||||
|
#else
|
||||||
task.params = server_task::params_from_json_cmpl(
|
task.params = server_task::params_from_json_cmpl(
|
||||||
|
#endif
|
||||||
ctx_server.impl->vocab,
|
ctx_server.impl->vocab,
|
||||||
params_base,
|
params_base,
|
||||||
ctx_server.get_meta().slot_n_ctx,
|
ctx_server.get_meta().slot_n_ctx,
|
||||||
@@ -2173,7 +2132,7 @@ public:
|
|||||||
// cannot detect tool calls or separate reasoning from content.
|
// cannot detect tool calls or separate reasoning from content.
|
||||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||||
task.params.oaicompat_cmpl_id = completion_id;
|
task.params.oaicompat_cmpl_id = completion_id;
|
||||||
// oaicompat_model is already populated by params_from_json_cmpl
|
// oaicompat_model is already populated by eval_llama_cmpl_schema (or by params_from_json_cmpl on pins without server-schema.cpp)
|
||||||
|
|
||||||
tasks.push_back(std::move(task));
|
tasks.push_back(std::move(task));
|
||||||
}
|
}
|
||||||
@@ -2997,7 +2956,11 @@ public:
|
|||||||
task.index = i;
|
task.index = i;
|
||||||
|
|
||||||
task.tokens = std::move(inputs[i]);
|
task.tokens = std::move(inputs[i]);
|
||||||
|
#ifdef LOCALAI_HAS_SERVER_SCHEMA
|
||||||
|
task.params = server_schema::eval_llama_cmpl_schema(
|
||||||
|
#else
|
||||||
task.params = server_task::params_from_json_cmpl(
|
task.params = server_task::params_from_json_cmpl(
|
||||||
|
#endif
|
||||||
ctx_server.impl->vocab,
|
ctx_server.impl->vocab,
|
||||||
params_base,
|
params_base,
|
||||||
ctx_server.get_meta().slot_n_ctx,
|
ctx_server.get_meta().slot_n_ctx,
|
||||||
@@ -3009,7 +2972,7 @@ public:
|
|||||||
// reasoning, tool calls, and content are classified into ChatDeltas.
|
// reasoning, tool calls, and content are classified into ChatDeltas.
|
||||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||||
task.params.oaicompat_cmpl_id = completion_id;
|
task.params.oaicompat_cmpl_id = completion_id;
|
||||||
// oaicompat_model is already populated by params_from_json_cmpl
|
// oaicompat_model is already populated by eval_llama_cmpl_schema (or by params_from_json_cmpl on pins without server-schema.cpp)
|
||||||
|
|
||||||
tasks.push_back(std::move(task));
|
tasks.push_back(std::move(task));
|
||||||
}
|
}
|
||||||
|
|||||||
7
backend/cpp/llama-cpp/paged/.gitignore
vendored
7
backend/cpp/llama-cpp/paged/.gitignore
vendored
@@ -1,7 +0,0 @@
|
|||||||
tests/test_free_block_queue
|
|
||||||
tests/test_block_pool
|
|
||||||
tests/test_paged_kv_manager
|
|
||||||
tests/test_prefix_cache
|
|
||||||
tests/test_ggml_paged_rw
|
|
||||||
tests/test_ggml_paged_attn
|
|
||||||
paged-bench
|
|
||||||
@@ -1,105 +0,0 @@
|
|||||||
# Blackwell (GB10 / sm_121) kernel gaps — measured + the corrected strategy
|
|
||||||
|
|
||||||
Supersedes the "greenfield tcgen05 FP4 grouped GEMM" framing in `FP4_GROUPED_MOE_KERNEL.md`. Research +
|
|
||||||
profiling reframed the problem: the kernels we need **already exist in ggml**; they're just **untuned for
|
|
||||||
Blackwell**. And the parity target is far lower than the headline vLLM number implied.
|
|
||||||
|
|
||||||
## 1. The parity target was wrong — it's ~3,300 t/s single-stream, not 24,444
|
|
||||||
|
|
||||||
vLLM's dense "24,444 t/s" is **aggregate concurrent-batch** throughput, not single-sequence. The GB10
|
|
||||||
compute roofline caps **single-stream** Qwen3-32B prefill at **~3,300 t/s (BF16/INT8 ceiling)** / **~6,600
|
|
||||||
(FP4 ceiling)**. So: don't chase 24,444 with one kernel. Aggregate parity = (a kernel at the ceiling) +
|
|
||||||
(batched-prefill scheduling). The *kernel* job is to reach ~3,300 (matches vLLM, which on GB10 also runs at
|
|
||||||
the BF16 ceiling) or ~6,600 (beats it, via FP4).
|
|
||||||
|
|
||||||
## 2. GB10 per-precision DENSE peaks (measured, not spec)
|
|
||||||
|
|
||||||
| precision | dense peak | vs BF16 |
|
|
||||||
|---|---|---|
|
|
||||||
| BF16 / FP16 | ~213 TFLOP/s | 1.0× |
|
|
||||||
| INT8 | ~215 TOPS | **1.0×** |
|
|
||||||
| FP4 (MXFP4/NVFP4) | ~427–500 TFLOP/s | **2.0×** |
|
|
||||||
|
|
||||||
Memory: ~273 GB/s LPDDR5X (the bottleneck for *decode*; prefill is compute-bound). **Critical:** GB10 is
|
|
||||||
**1:1:2** (BF16:INT8:FP4), NOT datacenter Blackwell's 1:2:4 — **INT8 gives ZERO speedup over BF16 here.** So
|
|
||||||
int8-MMQ has no precision advantage; only FP4 does. (NVIDIA spec sheets still claim 1:2:4 — contradicted by
|
|
||||||
direct GB10 measurement; on-the-record discrepancy.)
|
|
||||||
|
|
||||||
## 3. Measured gaps (nsys, GB10)
|
|
||||||
|
|
||||||
| path | kernel | % of prefill | achieved | % of ceiling |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| **Dense** Q4_K_M | `mul_mat_q<Q4_K/Q6_K>` (int8 MMQ) | 80% | ~46 TFLOP/s | **~21% of 215** |
|
|
||||||
| **MoE** MXFP4 | `mul_mat_q<MXFP4>` (FP4 MMA) | 37% | ~22 TFLOP/s | **~4–5% of 500** (or ~10% of BF16) |
|
|
||||||
|
|
||||||
Both kernels are **engaged correctly but untuned for Blackwell** — llama.cpp's MMQ was "tuned primarily for
|
|
||||||
RTX 3000/4000" (Ampere/Ada). The headroom (4–5×) is recoverable; it's not an architectural ceiling.
|
|
||||||
|
|
||||||
## 4. ggml's current quantized-matmul paths (what exists)
|
|
||||||
|
|
||||||
- **MMQ** (int8): quantizes activations to Q8_1, int8 `mma.sync`/`dp4a`. Prefill path. **Untuned for sm_12x.**
|
|
||||||
- **FP4 MMA** (#17906, merged): native MXFP4/NVFP4 `m16n8k64` block-scaled FP4 mma for cc≥12.0. Works on GB10
|
|
||||||
for MoE (we measured 3441 t/s MXFP4 prefill) — but underutilized (~5% of FP4 peak). On **sm_121** it is held back
|
|
||||||
by three issues: the `120f` build flag, an nvcc `-O3` miscompile (#18331), and capability gating.
|
|
||||||
- **dequant→cuBLAS-FP16**: unfused fallback (materializes FP16 weights, round-trips memory). Not a fused
|
|
||||||
Marlin. (Our `GGML_CUDA_FORCE_CUBLAS` env test was a no-op, so this path never even engaged for Q4_K.)
|
|
||||||
- **NO fused Marlin-style W4A16 kernel** (dequant 4-bit→BF16 in-shared-mem → BF16 tensor cores). Real gap.
|
|
||||||
|
|
||||||
## 5. Strategy — match vs beat (this replaces the tcgen05-greenfield plan)
|
|
||||||
|
|
||||||
**To MATCH vLLM (~3,300 single-stream): FP4 is NOT required.** Because INT8 == BF16 on GB10, a tuned MMQ and
|
|
||||||
a BF16 Marlin kernel share the *same* ceiling — and vLLM hits parity via W4A16 Marlin (BF16), since its FP4
|
|
||||||
is also broken on sm_121.
|
|
||||||
|
|
||||||
Ranked, by effort:
|
|
||||||
1. **Probe: tune the existing int8 MMQ for Blackwell** (dense). Cheapest. We're at 21% of the ceiling —
|
|
||||||
recover via tile sizes, async copy (`cp.async`), double-buffered shared-mem pipeline, occupancy. Caveat:
|
|
||||||
the `nwarps*tile_C::I==mmq_y` static_assert (found earlier) couples the constants; and the Q8_1
|
|
||||||
activation-quant overhead caps pure-MMQ tuning. Bounded upside, but a fast experiment.
|
|
||||||
2. **Build a Marlin-style W4A16 BF16 GEMM** (dense) — the robust path to ~3,300 (4.3× over today's 765).
|
|
||||||
Dequant 4-bit→BF16 in shared memory, MMA on BF16 tensor cores, `cp.async` multi-buffer, offline weight
|
|
||||||
reshuffle. Mirrors vLLM's actual GB10 path; keeps activations BF16 (better quality than int8 MMQ); fills a
|
|
||||||
genuine ggml gap. **This is the recommended kernel to MATCH.**
|
|
||||||
|
|
||||||
**To BEAT vLLM (~6,600, 2×): fix — don't rewrite — the FP4 path on sm_121.**
|
|
||||||
3. **Get the existing FP4 MMA (#17906/#20644) fully working + tuned on sm_121.** It already works on sm_120
|
|
||||||
(RTX 5090: +43–68% prefill) and on GB10 for MoE. The blockers are the `120f` arch flag, the `-O3`
|
|
||||||
miscompile (#18331), capability gating — **build/compiler fixes, not a new kernel.** Then tune the FP4 MMQ
|
|
||||||
(it's at ~5% of FP4 peak). This is where upstream momentum already is, and the only route past vLLM.
|
|
||||||
|
|
||||||
**Dropped:** the from-scratch tcgen05/CUTLASS grouped GEMM (the old scaffold). It aimed past the matchable
|
|
||||||
ceiling, duplicates work the FP4-MMA path already does, and FP4 on sm_121 is a *fix* problem not a *write*
|
|
||||||
problem. The `fp4-grouped-moe.cu` scaffold/hook stays as a useful dispatch seam, but the kernel behind it
|
|
||||||
should be one of (1)/(2)/(3), not a greenfield CUTLASS collective.
|
|
||||||
|
|
||||||
## 6. Cheap experiment — RESULT: MXFP4 dense = free 1.44×, but not parity (kernel still untuned)
|
|
||||||
|
|
||||||
Requantized Qwen3-32B dense → MXFP4 (forced attn+ffn to mxfp4 via `--tensor-type`, `--allow-requantize`,
|
|
||||||
speed-only test) and benched prefill:
|
|
||||||
|
|
||||||
| quant | kernel | pp512 | pp2048 | vs Q4_K |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| Q4_K_M | int8-MMQ | 765 | 763 | 1.0× |
|
|
||||||
| **MXFP4** | **FP4-MMA** | **1099** | **1153** | **1.44×** |
|
|
||||||
|
|
||||||
**Findings:**
|
|
||||||
- **MXFP4 dense is a real, free 1.44× over Q4_K** — just a requantize, the existing FP4-MMA path engages for
|
|
||||||
dense weights on GB10. Worth shipping as a **Blackwell dense-quant recommendation** in the gallery (no kernel).
|
|
||||||
- **But it is NOT parity.** 1153 t/s = **~17% of the FP4 ceiling (~6,600)** / ~35% of the BF16 ceiling. So the
|
|
||||||
**FP4-MMA kernel is itself untuned** (consistent with the MoE measurement, ~5% of FP4 peak). MXFP4 moves dense
|
|
||||||
from the int8 path (765) onto the FP4 path (1153), but the FP4 kernel leaves ~4–6× on the table.
|
|
||||||
- **So the kernel work is confirmed and now precise: tune the FP4-MMA kernel** (it's the highest-value, since it
|
|
||||||
serves both dense-MXFP4 and MoE, and FP4 is the only path that can *beat* vLLM). Strategy item (3) — fix +
|
|
||||||
tune the existing FP4-MMA on sm_121 — is the priority; a Marlin-style W4A16 BF16 kernel (2) is the alternative
|
|
||||||
to *match* on the BF16 ceiling if FP4 tuning stalls.
|
|
||||||
|
|
||||||
Conclusion: the cheap test did NOT collapse the kernel problem (the kernels are untuned, not just the quant), but
|
|
||||||
it (a) gives a free 1.44× to ship now, and (b) sharpens the target to **tuning the FP4-MMA kernel**.
|
|
||||||
|
|
||||||
## Sources
|
|
||||||
GB10 peaks (measured): forums.developer.nvidia.com/t/351993, /360142, /373618. Marlin: github.com/IST-DASLab/marlin,
|
|
||||||
arxiv 2408.11743, developers.redhat.com Marlin/Machete. MMQ untuned: llama.cpp docs/build.md, discussions/16578,
|
|
||||||
DandinPower/llama.cpp_bench. FP4 landing/sm121: llama.cpp PR #17906/#20644, issues #19662/#18331. Roofline:
|
|
||||||
vllm.ai/blog/2026-06-01-vllm-dgx-spark, lmsys.org DGX Spark.
|
|
||||||
|
|
||||||
> **Correction (measured):** the earlier `GGML_CUDA_FORCE_CUBLAS` env test was a no-op because it's a *compile-time* `#ifdef`, not a runtime flag — cuBLAS never engaged. A real rebuild with `-DGGML_CUDA_FORCE_CUBLAS=ON` shows cuBLAS is **slower** than MMQ for dense Q4 (pp2048 690 vs 750) and runs an **Ampere `cutlass_80_tensorop` FP16 kernel** — cuBLAS-13.0 has no sm_121-tuned GEMM and falls back to sm_80. So *both* MMQ and cuBLAS sit at ~46 TFLOP/s (~21% of the 213 BF16 peak); there is **no library shortcut** to the ceiling on GB10 — a hand-tuned sm_120a kernel (Marlin-style) is required.
|
|
||||||
@@ -1,334 +0,0 @@
|
|||||||
# Chunked prefill + n_batch/n_ubatch decouple — implementation plan
|
|
||||||
|
|
||||||
Scope: LocalAI's llama.cpp backend (`backend/cpp/llama-cpp/`). Companion to
|
|
||||||
`PHASED_VLLM_PARITY_PLAN.md` Phase 3. This document is the concrete, file-cited
|
|
||||||
plan for what the brief called "chunked prefill".
|
|
||||||
|
|
||||||
Line numbers below are from two trees:
|
|
||||||
- LocalAI: `backend/cpp/llama-cpp/grpc-server.cpp`, `core/backend/options.go`,
|
|
||||||
`backend/backend.proto`, `core/backend/hardware_defaults.go` — exact.
|
|
||||||
- Vendored upstream scheduler: `llama.cpp/tools/server/server-context.cpp`. The
|
|
||||||
build copies `llama.cpp/tools/server/*` into `tools/grpc-server/` (`prepare.sh`
|
|
||||||
lines 15-17) and only overrides `grpc-server.cpp` + `CMakeLists.txt`. So
|
|
||||||
`update_slots()` is **inherited upstream code, not LocalAI code**. Line numbers
|
|
||||||
cited for it are from a same-era checkout (`d12cc3d`, 2026-04-09); the pin is
|
|
||||||
`f3e1828` (Makefile line 2). The structure is identical; exact lines may drift
|
|
||||||
a few rows at the pin — match on the quoted comment strings, not the integers.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## TL;DR — the headline finding
|
|
||||||
|
|
||||||
**Chunked prefill with prefill/decode interleaving is ALREADY implemented** in the
|
|
||||||
llama.cpp server scheduler that LocalAI vendors. It is not a missing feature on
|
|
||||||
this version. `update_slots()` in `server-context.cpp`:
|
|
||||||
|
|
||||||
1. **Adds ongoing decode tokens first** — "first, add sampled tokens from any
|
|
||||||
ongoing sequences" (≈ line 2088). Every `SLOT_STATE_GENERATING` slot gets its
|
|
||||||
one sampled token into the shared `llama_batch` before any prefill is added.
|
|
||||||
2. **Then fills the remaining `n_batch` budget with prompt (prefill) tokens** —
|
|
||||||
"next, batch any pending prompts without exceeding n_batch" (≈ line 2166),
|
|
||||||
gated by `params_base.cont_batching` (LocalAI sets `cont_batching = true` by
|
|
||||||
default, `grpc-server.cpp:547`). The per-slot prefill fill loop
|
|
||||||
(≈ line 2552) is `while (slot.prompt.n_tokens() < slot.task->n_tokens() &&
|
|
||||||
batch.n_tokens < n_batch)` — i.e. it caps each slot's prefill contribution to
|
|
||||||
the **remaining** budget and defers the rest to the next iteration.
|
|
||||||
3. **Decodes the combined batch in one pass** (≈ line 2728-2741): decode tokens
|
|
||||||
and prefill-chunk tokens go through the **same `llama_decode`**, which then
|
|
||||||
splits internally into `n_ubatch` physical sub-batches.
|
|
||||||
|
|
||||||
This is exactly the behavior the abandoned-looking draft **upstream PR #10718**
|
|
||||||
("server : chunked prefill support") asked for — "the first task is no longer
|
|
||||||
blocked by the second long prompt processing task." That PR is still marked OPEN
|
|
||||||
but its goal was absorbed into the natural evolution of `update_slots()`; we do
|
|
||||||
**not** need to port it. A long prefill no longer stalls the decode batch: decode
|
|
||||||
slots are serviced first every iteration, prefill consumes only the leftover
|
|
||||||
budget.
|
|
||||||
|
|
||||||
**Therefore: do not re-implement chunked prefill.** The real LocalAI gap is
|
|
||||||
narrow and is the rest of this plan:
|
|
||||||
|
|
||||||
- **Phase A (the actual gap): the `n_batch`/`n_ubatch` decouple.** LocalAI ties
|
|
||||||
the scheduler token budget (`n_batch`) to the physical forward width
|
|
||||||
(`n_ubatch`) at `grpc-server.cpp:515` + `:519`. This forces
|
|
||||||
`n_batch == n_ubatch`, so the logical scheduling window can never be wider than
|
|
||||||
one physical ubatch. You cannot keep `n_ubatch` at the Blackwell GEMM sweet
|
|
||||||
spot (2048) while widening `n_batch` so concurrent prefills + decodes co-batch
|
|
||||||
into a larger logical window. There is no first-class `batch:`/`ubatch:` split
|
|
||||||
on the Go side, and there is only a one-directional `ubatch` override on the C++
|
|
||||||
side (you can shrink ubatch below the coupled value, never grow n_batch above
|
|
||||||
it).
|
|
||||||
- **Phase B (optional policy lever): a decode-headroom prefill cap.** Upstream
|
|
||||||
caps prefill at the full `n_batch` shared with decode. Under heavy mixed load
|
|
||||||
one fat prefill chunk per iteration still adds inter-token latency (ITL) jitter
|
|
||||||
to the decoders sharing that forward. vLLM exposes
|
|
||||||
`long_prefill_token_threshold` / `max_num_partial_prefills` for this. A
|
|
||||||
LocalAI-specific per-iteration prefill cap (a patch to vendored `update_slots`)
|
|
||||||
bounds that jitter. This is genuinely not in upstream and is the only place a
|
|
||||||
scheduler-policy change is warranted.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 1. Current behavior — precise citations
|
|
||||||
|
|
||||||
### 1.1 The scheduler is upstream, inherited verbatim
|
|
||||||
- `prepare.sh:15-17` copies all of `llama.cpp/tools/server/*` into the
|
|
||||||
`grpc-server` build dir; `grpc-server.cpp` (LocalAI) replaces only the HTTP/gRPC
|
|
||||||
service + `params_parse` + `parse_options`. `update_slots()`, the slot state
|
|
||||||
machine, and the batch builder are **upstream `server-context.cpp`**, untouched
|
|
||||||
by LocalAI today.
|
|
||||||
- Slot states: `server-context.cpp:36-42` —
|
|
||||||
`SLOT_STATE_IDLE / WAIT_OTHER / STARTED / PROCESSING_PROMPT / DONE_PROMPT /
|
|
||||||
GENERATING`.
|
|
||||||
|
|
||||||
### 1.2 Decode-first, then prefill-fill, one shared batch
|
|
||||||
- `common_batch_clear(batch)` (≈ 2078) — one batch per `update_slots` iteration.
|
|
||||||
- Decode phase (≈ 2088-2156): for each `SLOT_STATE_GENERATING` slot,
|
|
||||||
`common_batch_add(batch, slot.sampled, …, /*logits=*/true)` adds exactly one
|
|
||||||
token. Decode is guaranteed a seat before prefill runs.
|
|
||||||
- Budget fetch (≈ 2158-2160): `n_batch = llama_n_batch(ctx)`,
|
|
||||||
`n_ubatch = llama_n_ubatch(ctx)`.
|
|
||||||
- Prefill phase (≈ 2166): `if (params_base.cont_batching || batch.n_tokens == 0)`
|
|
||||||
→ with cont_batching ON, prefill is added to the **same** batch as decode.
|
|
||||||
- Per-slot prefill fill (≈ 2552-2597):
|
|
||||||
`while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch)`
|
|
||||||
— adds prompt tokens until the slot is done **or** the shared budget is hit.
|
|
||||||
Whatever does not fit stays for the next iteration (the slot remains
|
|
||||||
`SLOT_STATE_PROCESSING_PROMPT`).
|
|
||||||
- Whole-prompt completion (≈ 2603-2615): when the slot's prompt is fully consumed
|
|
||||||
it flips to `SLOT_STATE_DONE_PROMPT`, sets `batch.logits[last] = true`, inits
|
|
||||||
the sampler. Next iteration it becomes `GENERATING`.
|
|
||||||
- Budget break (≈ 2693-2695): `if (batch.n_tokens >= n_batch) break;`.
|
|
||||||
- Decode (≈ 2728-2741): loops `batch_view` slices of `min(n_batch, remaining)` and
|
|
||||||
calls `llama_decode`; the physical `n_ubatch` split happens inside
|
|
||||||
`llama_decode`.
|
|
||||||
|
|
||||||
### 1.3 The chunking is gated by `can_split()`
|
|
||||||
- `server-context.cpp:225-231`: `can_split()` returns true unless the task needs
|
|
||||||
embeddings with non-LAST pooling. So **completion/generation tasks always
|
|
||||||
chunk-and-interleave**; only embeddings/rerank force the whole prompt into one
|
|
||||||
ubatch (≈ 2234-2244 raises "input is too large… increase the physical batch
|
|
||||||
size" — this is exactly why LocalAI bumped `n_ubatch` for rerank, see below).
|
|
||||||
|
|
||||||
### 1.4 LocalAI ties n_batch to n_ubatch (the gap)
|
|
||||||
- `grpc-server.cpp:515` — `params.n_batch = request->nbatch();`
|
|
||||||
- `grpc-server.cpp:519` — `params.n_ubatch = request->nbatch();` with the comment
|
|
||||||
that this fixes reranking being capped at the 512 default `n_ubatch`.
|
|
||||||
- `grpc-server.cpp:781-784` — the **only** decouple knob today: an `n_ubatch` /
|
|
||||||
`ubatch` option that overrides `n_ubatch` alone (added for embeddings/rerank).
|
|
||||||
There is **no** `batch` / `n_batch` option parse, so `n_batch` cannot be raised
|
|
||||||
above the coupled value from a model config. Confirmed: `grep -E '"n_batch"|"batch"'`
|
|
||||||
in `grpc-server.cpp` returns nothing.
|
|
||||||
- Options arrive via `request->options(i)` parsed as `optname:optval`
|
|
||||||
(`grpc-server.cpp:584-585`); these come from `ModelOptions.Options` ⟵
|
|
||||||
`c.Options` (`core/backend/options.go:221`).
|
|
||||||
|
|
||||||
### 1.5 Go side sends a single batch number
|
|
||||||
- `backend/backend.proto:341` — `int32 NBatch = 4;` is the only batch field; there
|
|
||||||
is **no** `NUBatch`.
|
|
||||||
- `core/backend/options.go:108-129` `EffectiveBatchSize`: returns `c.Batch` if set,
|
|
||||||
else context size for single-pass (score/embed/rerank), else
|
|
||||||
`hardwareDefaultBatchSize(512)`.
|
|
||||||
- `core/backend/options.go:228` — `NBatch: int32(b)` (single value to the
|
|
||||||
backend; becomes both `n_batch` and `n_ubatch` via 1.4).
|
|
||||||
- `core/backend/hardware_defaults.go:28,37-40` — `BlackwellBatchSize = 2048`;
|
|
||||||
on Blackwell an unset batch defaults to 2048, so today
|
|
||||||
`n_batch == n_ubatch == 2048` there.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 2. Why the decouple matters for serving (not just rerank)
|
|
||||||
|
|
||||||
Invariant: `n_ubatch <= n_batch`. `n_ubatch` is the physical forward-pass GEMM
|
|
||||||
width (compute efficiency; GB10 sweet spot ≈ 2048). `n_batch` is the per-iteration
|
|
||||||
**scheduler token budget** — the logical window shared by decode + prefill chunks,
|
|
||||||
analogous to vLLM's `max_num_batched_tokens`.
|
|
||||||
|
|
||||||
With `n_batch == n_ubatch` (today), the scheduling window cannot exceed one
|
|
||||||
physical ubatch. Consequences:
|
|
||||||
- Under concurrency, the combined (decode + multiple prefill chunks) logical batch
|
|
||||||
is capped at the physical ubatch, so aggregate prefill cannot grow past one
|
|
||||||
ubatch worth of tokens per iteration even when more slots have prompts queued.
|
|
||||||
- A user who shrinks `batch:` for memory also shrinks the physical ubatch,
|
|
||||||
degrading prefill GEMM efficiency — and vice versa.
|
|
||||||
|
|
||||||
Decoupling lets us hold `n_ubatch = 2048` (efficient GEMM) while setting a larger
|
|
||||||
`n_batch` (e.g. 4096) so more concurrent prefill+decode tokens co-schedule into one
|
|
||||||
logical window, lifting aggregate prefill under mixed load — `llama_decode` still
|
|
||||||
tiles the physical work at 2048.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 3. Phased implementation
|
|
||||||
|
|
||||||
### Phase 0 — Verification harness (do first; TDD red)
|
|
||||||
Bite-sized, no code change to the scheduler.
|
|
||||||
- **0.1 Token-identical greedy under mixed load.** Script: start the backend with
|
|
||||||
`n_parallel >= 4`, greedy sampling (temp 0, fixed seed). Fire (a) several short
|
|
||||||
decode streams and (b) one ~8k-token prompt concurrently (the exact repro from
|
|
||||||
PR #10718's body works). Capture each stream's full token id sequence. Re-run
|
|
||||||
with the prefill request absent. **Assert the short streams' token ids are
|
|
||||||
byte-identical** in both runs — proves interleaving does not perturb decode
|
|
||||||
numerics (KV/position correctness across chunk boundaries). Wire as a Ginkgo
|
|
||||||
spec under the backend e2e suite.
|
|
||||||
- **0.2 Mixed-workload throughput baseline.** Use `llama-batched-bench` (built from
|
|
||||||
the same tree) or a small driver hitting `/v1/chat/completions`: measure
|
|
||||||
aggregate prefill tok/s and decode tok/s, and p50/p99 ITL of the decode streams,
|
|
||||||
under the mixed workload. Record numbers for the current `n_batch==n_ubatch`
|
|
||||||
config. This is the "before" measurement for Phase A/B.
|
|
||||||
|
|
||||||
Expected result of Phase 0: 0.1 already passes (interleave is correct today);
|
|
||||||
0.2 gives the baseline the decouple must beat.
|
|
||||||
|
|
||||||
### Phase A — Decouple n_batch from n_ubatch
|
|
||||||
Goal: let model config set the physical ubatch independently of the logical batch,
|
|
||||||
defaulting to today's behavior (no regression).
|
|
||||||
|
|
||||||
- **A.1 C++: accept a `batch`/`n_batch` option (and keep `ubatch`).**
|
|
||||||
In `grpc-server.cpp`, after the existing `ubatch` branch (`:781-784`), add a
|
|
||||||
sibling branch:
|
|
||||||
```cpp
|
|
||||||
} else if (!strcmp(optname, "n_batch") || !strcmp(optname, "batch")) {
|
|
||||||
if (optval != NULL) {
|
|
||||||
try { params.n_batch = std::stoi(optval_str); } catch (...) {}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
This is the missing direction (raise `n_batch` above the coupled value). Order
|
|
||||||
matters: both `:515/:519` run first (coupling as default), then option parsing
|
|
||||||
overrides either independently. Add a clamp note: if a user sets
|
|
||||||
`n_ubatch > n_batch`, llama.cpp will clamp `n_ubatch` down to `n_batch`; log a warning. Keep the
|
|
||||||
`:519` aliasing for backward compat (rerank still works with no options).
|
|
||||||
|
|
||||||
- **A.2 Proto: add an explicit physical ubatch field.**
|
|
||||||
`backend/backend.proto:341` add `int32 NUBatch = <next free tag>;` (do not reuse
|
|
||||||
4). Regenerate with `make protogen-go` + the C++ proto build.
|
|
||||||
|
|
||||||
- **A.3 C++: honor `NUBatch` when present.**
|
|
||||||
In `grpc-server.cpp` `params_parse`, after `:519`, add:
|
|
||||||
```cpp
|
|
||||||
if (request->nubatch() > 0) {
|
|
||||||
params.n_ubatch = request->nubatch();
|
|
||||||
}
|
|
||||||
```
|
|
||||||
so an explicit physical ubatch wins over the `n_batch` alias, with the `ubatch`
|
|
||||||
string-option as a third path for users who only edit `options:`.
|
|
||||||
|
|
||||||
- **A.4 Go: config surface + plumbing.**
|
|
||||||
- Add `UBatch *int` (yaml `ubatch`) to the llama config struct alongside `Batch`
|
|
||||||
(search `core/config` for the `Batch` field; mirror it).
|
|
||||||
- In `core/backend/options.go`: add `EffectiveUBatchSize(c)` mirroring
|
|
||||||
`EffectiveBatchSize` (return `c.UBatch` if set, else
|
|
||||||
`min(EffectiveBatchSize(c), BlackwellBatchSize-or-512)` so the physical ubatch
|
|
||||||
stays at the hardware sweet spot while `n_batch` may be larger). Set
|
|
||||||
`NUBatch: int32(EffectiveUBatchSize(c))` next to `NBatch:` (`:228`).
|
|
||||||
- Keep the default such that when neither is set, `NUBatch == NBatch` ⇒
|
|
||||||
byte-identical to today.
|
|
||||||
|
|
||||||
- **A.5 Serving default (the lever).**
|
|
||||||
In `hardware_defaults.go`, introduce `BlackwellLogicalBatch = 4096` (or a
|
|
||||||
measured value) and let `EffectiveBatchSize` return it for **multi-slot serving**
|
|
||||||
configs (when `n_parallel > 1` and the model is a completion model), while
|
|
||||||
`EffectiveUBatchSize` stays at `BlackwellBatchSize = 2048`. Gate behind the same
|
|
||||||
Blackwell detection already used at `:37-40`. Single-stream/embedding/rerank
|
|
||||||
paths keep `n_batch == n_ubatch`. This is the only behavioral change shipped by
|
|
||||||
Phase A; Phase 0.2 must show it is net-positive before defaulting it on.
|
|
||||||
|
|
||||||
- **A.6 Tests.** Extend `hardware_defaults_internal_test.go` with
|
|
||||||
`EffectiveUBatchSize` cases; add a `grpcModelOpts` test asserting
|
|
||||||
`NUBatch <= NBatch` and that unset config yields `NUBatch == NBatch`. Re-run
|
|
||||||
0.1 (must still be token-identical) and 0.2 (must show aggregate-prefill gain or
|
|
||||||
neutral ITL) at `n_batch=4096, n_ubatch=2048`.
|
|
||||||
|
|
||||||
### Phase B — Decode-headroom prefill cap (optional policy, vendored patch)
|
|
||||||
Only if Phase 0.2 / A shows decode ITL jitter from fat prefill chunks. This is the
|
|
||||||
one change that touches the inherited scheduler, so it lives as a patch in
|
|
||||||
`backend/cpp/llama-cpp/patches/` (applied by `prepare.sh:6-11` / Makefile
|
|
||||||
`:141-145`), never as an edit to a checked-in upstream file.
|
|
||||||
|
|
||||||
Policy (pseudocode; insert into `update_slots()` prefill fill loop, the
|
|
||||||
`while (… && batch.n_tokens < n_batch)` at ≈ `server-context.cpp:2552`):
|
|
||||||
|
|
||||||
```
|
|
||||||
# token budget for THIS iteration, decode already seated:
|
|
||||||
n_decode_in_batch = batch.n_tokens # set after the decode phase
|
|
||||||
prefill_budget = n_batch # default == today
|
|
||||||
|
|
||||||
if serving_mode and n_decode_in_batch > 0:
|
|
||||||
# leave room so decoders are not starved/jittered by one giant prefill chunk
|
|
||||||
# max_prefill_per_iter defaults to n_ubatch (one physical tile) when decode active
|
|
||||||
prefill_budget = min(n_batch, n_decode_in_batch + max_prefill_per_iter)
|
|
||||||
|
|
||||||
# fill loop guard becomes:
|
|
||||||
while slot.prompt.n_tokens() < slot.task->n_tokens()
|
|
||||||
and batch.n_tokens < prefill_budget:
|
|
||||||
...
|
|
||||||
```
|
|
||||||
|
|
||||||
- `max_prefill_per_iter` is a new `common_params` field surfaced as an
|
|
||||||
`options:` knob (`max_prefill_tokens` / `mpt`) parsed in `grpc-server.cpp`
|
|
||||||
exactly like A.1; default `0` = cap disabled (`prefill_budget` stays `n_batch`) = today's behavior.
|
|
||||||
- Semantics mirror vLLM `long_prefill_token_threshold`: cap the prefill share so
|
|
||||||
ongoing decodes keep a steady cadence; the remaining prompt rides the next
|
|
||||||
iteration (already supported by the state machine — slot stays
|
|
||||||
`PROCESSING_PROMPT`).
|
|
||||||
- **Correctness:** unchanged KV/position path — chunk boundaries already advance
|
|
||||||
`slot.prompt.tokens.pos_next()` per added token (≈ `server-context.cpp:2570`) and the slot resumes
|
|
||||||
from `slot.prompt.n_tokens()` next iteration. Capping the budget only changes
|
|
||||||
*how many* tokens are added this iteration, not *which* positions, so 0.1 must
|
|
||||||
remain token-identical.
|
|
||||||
|
|
||||||
### Phase C — Docs + defaults rollout
|
|
||||||
- Document `batch` / `ubatch` (and `max_prefill_tokens` if B ships) in
|
|
||||||
`docs/content/` model-config reference, with the serving recipe
|
|
||||||
(`n_parallel>1`, `n_batch=4096`, `ubatch=2048`).
|
|
||||||
- Note the orthogonality to paged KV (below) in
|
|
||||||
`PHASED_VLLM_PARITY_PLAN.md` Phase 3.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 4. Risk / correctness
|
|
||||||
|
|
||||||
- **KV-cache & positions across chunks:** already handled upstream. Each prefill
|
|
||||||
token added advances `pos_next()` (≈ 2570) and is pushed to `slot.prompt.tokens`
|
|
||||||
(≈ 2573); the next iteration resumes from `slot.prompt.n_tokens()`. Chunk
|
|
||||||
boundaries are transparent to the KV cache because positions are absolute, not
|
|
||||||
per-chunk. Phase A changes only budgets, not positions; Phase B changes only the
|
|
||||||
per-iteration count. The 0.1 token-identical test is the guardrail.
|
|
||||||
- **Unified KV cache (LocalAI default, `n_parallel` slots share one cache):**
|
|
||||||
unaffected — co-batching prefill+decode across slots is what the unified cache is
|
|
||||||
for; positions are per-`seq_id` (`{ slot.id }` in `common_batch_add`).
|
|
||||||
- **`n_ubatch > n_batch`:** invalid; A.4 clamps `EffectiveUBatchSize <=
|
|
||||||
EffectiveBatchSize` and A.1 logs a warning if options violate it.
|
|
||||||
- **Embeddings / rerank:** must keep `n_ubatch >= prompt length` (single pass,
|
|
||||||
`can_split()==false`). The existing `:519` alias + `EffectiveBatchSize`
|
|
||||||
context-sizing for single-pass usecases (`options.go:119-124`) must be preserved
|
|
||||||
— do not let the serving `BlackwellLogicalBatch` default leak into single-pass
|
|
||||||
configs (A.5 gates on completion + `n_parallel>1`).
|
|
||||||
- **Turboquant fork:** the fork lacks some `common_params` fields (see
|
|
||||||
`LOCALAI_LEGACY_LLAMA_CPP_SPEC` precedent at `grpc-server.cpp:755`). `n_batch` /
|
|
||||||
`n_ubatch` are ancient fields and safe; if Phase B adds `max_prefill_per_iter`,
|
|
||||||
guard the new field behind a `#ifndef` like the checkpoint block does.
|
|
||||||
|
|
||||||
## 5. Orthogonality to paged KV (Phase 2)
|
|
||||||
|
|
||||||
Keep them independent. Paged KV (the `-kvp` / block-manager effort, draft #22569,
|
|
||||||
and `paged/`) changes **where** KV blocks live (allocation/utilization). Chunked
|
|
||||||
prefill / this decouple changes **how many tokens per iteration** the scheduler
|
|
||||||
batches (the `n_batch` budget and decode/prefill interleave). They compose: paged
|
|
||||||
KV raises the concurrency ceiling (more slots), the decouple widens the per-iter
|
|
||||||
scheduling window to feed those slots; neither touches the other's data structures.
|
|
||||||
The only contact point is `update_slots()` — if both ship a vendored patch to it,
|
|
||||||
land them as separate, ordered patches in `patches/` and keep the hunks disjoint
|
|
||||||
(paged touches allocation/seq_rm; chunked-prefill Phase B touches the prefill fill
|
|
||||||
budget).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 6. Bottom line
|
|
||||||
|
|
||||||
- Chunked prefill + decode interleave: **already present and correct** on the
|
|
||||||
pinned llama.cpp — verify (Phase 0.1), do not rebuild.
|
|
||||||
- Real work: the **n_batch/n_ubatch decouple** (Phase A) — small, additive,
|
|
||||||
default-preserving — plus an **optional decode-headroom prefill cap** (Phase B)
|
|
||||||
if measurements show ITL jitter. Both are LocalAI-side: A in `grpc-server.cpp`
|
|
||||||
+ proto + `options.go`; B as a vendored `patches/` hunk.
|
|
||||||
@@ -1,215 +0,0 @@
|
|||||||
# llama.cpp multi-user decode overhead on DGX Spark (GB10, sm_121)
|
|
||||||
|
|
||||||
Investigation of the Qwen3-32B concurrent-decode throughput gap (llama.cpp ~547 t/s
|
|
||||||
vs vLLM ~667 t/s) on the GB10 box, build `~/llama.cpp-pr24423/build` (Release,
|
|
||||||
sm_121, `LLAMA_MAX_SEQ=256`, flash-attn on), model
|
|
||||||
`~/bench/q3-32b-gguf/Qwen3-32B-Q4_K_M.gguf`.
|
|
||||||
|
|
||||||
## TL;DR (the result overturns the brief's premise)
|
|
||||||
|
|
||||||
On **this** build the prime suspect is wrong and the host-overhead premise does not
|
|
||||||
hold:
|
|
||||||
|
|
||||||
1. **CUDA graphs are NOT disabled at high concurrency.** At npl=128, 94 of 98
|
|
||||||
decode `graph_compute` calls **replay a captured CUDA graph** (0 resets, stable
|
|
||||||
key, no property churn post-warmup). The keyed-warmup gate works.
|
|
||||||
2. **There is no ~170ms/step host hotspot here.** The GPU is **~96% active during
|
|
||||||
decode with graphs ON and ~96% active with graphs OFF**. Decode at npl=128 is
|
|
||||||
**GPU-compute-bound**, not host-bound.
|
|
||||||
3. The brief's "20% GPU util / 66ms GPU / 170ms host per step" was measured on a
|
|
||||||
different/earlier build (mainline without these graph fixes). It is not
|
|
||||||
reproducible on `llama.cpp-pr24423`.
|
|
||||||
4. Because the GPU is the bottleneck, re-enabling graphs cannot lift the number:
|
|
||||||
the clean A/B shows graphs ON vs OFF = **+1.5% at npl=128** (and +2.9% at
|
|
||||||
npl=32 - the benefit shrinks as concurrency rises and the GPU saturates).
|
|
||||||
5. The real gap to vLLM is the **quantized decode GEMM kernel**: `mul_mat_q`
|
|
||||||
(Q4_K + Q6_K) is ~68% of decode GPU time and runs ~2.1x above the GB10
|
|
||||||
memory-bandwidth floor. Closing the gap requires Marlin/Machete-style int4
|
|
||||||
GEMM kernels, not host-side work. This is a kernel project (the direction the
|
|
||||||
prior session's uncommitted `marlin-w4a16.cu` / `fp4-grouped-moe.cu` already
|
|
||||||
started, though those target w4a16/GPTQ-int4, not the K-quants this GGUF uses).
|
|
||||||
|
|
||||||
## 1. Why CUDA graphs are (not) disabled - exact code + measurement
|
|
||||||
|
|
||||||
### The gate (code)
|
|
||||||
|
|
||||||
PR24423 refactored the CUDA-graph path into a keyed, warmup-based scheme in
|
|
||||||
`~/llama.cpp-pr24423/ggml/src/ggml-cuda/ggml-cuda.cu`:
|
|
||||||
|
|
||||||
- `ggml_cuda_graph_get_key(cgraph)` (~L3343) keys the cached CUDA graph by
|
|
||||||
`cgraph->nodes[0]` (first-node pointer).
|
|
||||||
- `ggml_cuda_graph_check_compability(cgraph)` (~L3301) disables graphs only for:
|
|
||||||
- **split buffers** (`ggml_backend_buft_is_cuda_split`), and
|
|
||||||
- **`GGML_OP_MUL_MAT_ID`** when `src0` is non-quantized **or**
|
|
||||||
`ne[2] > get_mmvq_mmid_max(...)` (MoE expert routing needs a stream sync).
|
|
||||||
Qwen3-32B is **dense** -> no `MUL_MAT_ID` -> this condition never fires.
|
|
||||||
- `ggml_backend_cuda_graph_compute` (~L4514) warmup gate: a graph is used only
|
|
||||||
after **2 consecutive calls with no property change** (`warmup_complete`); any
|
|
||||||
property change resets warmup. `ggml_cuda_graph_update_required` (~L3347)
|
|
||||||
detects change by `memcmp` of the full `ggml_tensor` struct + per-src
|
|
||||||
data-ptr/ne/nb, with a fast path when `cgraph->uid` is unchanged.
|
|
||||||
|
|
||||||
### Why it stays enabled across decode steps
|
|
||||||
|
|
||||||
The graph stays stable because llama.cpp's host-side graph reuse holds during
|
|
||||||
decode, so node pointers/props (and `cgraph->uid`) do not churn:
|
|
||||||
|
|
||||||
- `llama_kv_cache::get_n_kv` (`src/llama-kv-cache.cpp` L1223-1233) **pads n_kv to
|
|
||||||
a multiple of 256** ("so that the graph remains constant across batches and can
|
|
||||||
be reused"). For ntg<=256 within the first KV block, n_kv is constant.
|
|
||||||
- `can_reuse_kq_mask` (`src/llama-graph.cpp` L43) keeps the KQ-mask dims stable:
|
|
||||||
`ne=[n_kv, n_tokens/n_stream, 1, n_stream]` = `[256,1,1,128]` every decode step
|
|
||||||
at npl=128.
|
|
||||||
- `can_reuse` (`src/llama-context.cpp` L1283) therefore returns true, so the
|
|
||||||
scheduler is **not** reset/re-split. `graph->uid` is only reassigned inside
|
|
||||||
`ggml_backend_sched_split_graph` (`ggml/src/ggml-backend.cpp` L1033, L1485),
|
|
||||||
which is skipped on the reuse path -> stable uid -> CUDA graph replays.
|
|
||||||
|
|
||||||
### Measurement (instrumented build, npl=128, ntg=96)
|
|
||||||
|
|
||||||
Env-gated counters added to `ggml_backend_cuda_graph_compute` /
|
|
||||||
`ggml_cuda_graph_update_required` (since `GGML_LOG_DEBUG` is compiled out in
|
|
||||||
Release / NDEBUG). End-of-run summary:
|
|
||||||
|
|
||||||
```
|
|
||||||
[GTRACE-SUMMARY] calls=98 notenab=0 warming=3 warmdone=1 RESET=0 USED=94 incompat=0 distinct_keys=1
|
|
||||||
```
|
|
||||||
|
|
||||||
94/98 decode `graph_compute` calls **replayed** a captured CUDA graph; **0**
|
|
||||||
warmup resets; a **single** distinct graph key for the whole decode; no node
|
|
||||||
property churn after warmup. Graphs are fully engaged at npl=128.
|
|
||||||
|
|
||||||
(The instrumentation was reverted afterwards; the checkout is back to its
|
|
||||||
pre-task state and the `.so` rebuilt clean.)
|
|
||||||
|
|
||||||
## 2. The per-step CPU "hotspot" - there isn't one on this build
|
|
||||||
|
|
||||||
GPU utilization during npl=128 decode (ntg=256):
|
|
||||||
|
|
||||||
- **Graphs ON** - `nvidia-smi` sampled every 0.7s through the decode phase:
|
|
||||||
steady **96% GPU util**, SM clock **2184 MHz** (not throttled), 45-47 W.
|
|
||||||
- **Graphs OFF** (`GGML_CUDA_DISABLE_GRAPHS=1`) - nsys CUDA trace, 8s window:
|
|
||||||
total GPU kernel time = `3,983,292,128 ns / 0.516` (the Q4_K `mul_mat_q` kernel time divided by its 51.6% share) = **~7.72s of the 8s
|
|
||||||
window = ~96% GPU-active**. Even with every kernel launched individually from
|
|
||||||
the host, the GPU is still ~96% busy. There are essentially **no host gaps**.
|
|
||||||
|
|
||||||
Per-step wall = 60.6s / 256 steps = **~237 ms/step**, and the sum of one decode
|
|
||||||
graph's kernel times (nsys, graphs-on capture) is ~244 ms -> GPU kernel time per
|
|
||||||
step ~= wall time per step. The host work between steps is in the low single-digit
|
|
||||||
ms (the ~4% idle), consistent with graphs ON giving only +1.5% at npl=128.
|
|
||||||
|
|
||||||
This directly contradicts the brief's 66ms-GPU / 170ms-host split, which must have
|
|
||||||
come from a pre-graphs build.
|
|
||||||
|
|
||||||
### Per-step GPU breakdown (nsys, npl=128 decode, graphs off, 8s window)
|
|
||||||
|
|
||||||
| Kernel | % GPU time | ~ms/step |
|
|
||||||
|--------|-----------:|---------:|
|
|
||||||
| `mul_mat_q` Q4_K (type 12) | 51.6 | ~118 |
|
|
||||||
| `flash_attn_ext_f16` | 19.3 | ~44 |
|
|
||||||
| `mul_mat_q` Q6_K (type 14) | 16.2 | ~37 |
|
|
||||||
| `unary_gated` silu | 4.1 | ~9 |
|
|
||||||
| mmq stream-k fixup + quantize_q8_1 | ~5 | ~12 |
|
|
||||||
| rms_norm / rope / set_rows / add | ~4 | ~10 |
|
|
||||||
|
|
||||||
Quantized matmul = **~68%** of decode GPU time (~155 ms/step). Attention ~19%.
|
|
||||||
|
|
||||||
`perf` could not profile the host (kernel `perf_event_paranoid=4`), but it is moot:
|
|
||||||
the host is ~4% of the wall, so there is no ~170ms host hotspot to chase.
|
|
||||||
|
|
||||||
## 3. Fix attempt + measured result
|
|
||||||
|
|
||||||
### The requested fix (re-enable graphs / pad the decode batch) is a no-op here
|
|
||||||
|
|
||||||
Graphs are already enabled and the batch is already stable (n_kv padded to 256,
|
|
||||||
kq_mask dims constant). The clean cold A/B (cooldowns between every run):
|
|
||||||
|
|
||||||
| npl | graphs ON (t/s) | graphs OFF (t/s) | delta |
|
|
||||||
|----:|----------------:|-----------------:|------:|
|
|
||||||
| 32 | 242.60 | 235.75 | +2.9% |
|
|
||||||
| 64 | 398.59 | 389.06 | +2.5% |
|
|
||||||
| 128 | 543.95 | 535.71 | +1.5% |
|
|
||||||
|
|
||||||
Baseline (separate cold runs, original non-instrumented build):
|
|
||||||
npl=32 243.9, npl=64 397.1, **npl=128 544.95** (matches the ~546 baseline).
|
|
||||||
|
|
||||||
Graphs help, but the benefit **monotonically shrinks** as concurrency rises and
|
|
||||||
the GPU saturates. At npl=128 there is only ~1.5% of host launch overhead left to
|
|
||||||
remove, and GPU util is ~96% in both columns. **You cannot lift npl=128 decode
|
|
||||||
toward 667 by working on graphs/host overhead - the GPU is the bottleneck.**
|
|
||||||
|
|
||||||
### Where the number actually is, and the real lever
|
|
||||||
|
|
||||||
- vLLM 667 t/s at this concurrency = **192 ms/step**; llama.cpp 547 = **237
|
|
||||||
ms/step**. The ~45 ms/step gap maps almost entirely onto the quantized matmul.
|
|
||||||
- GB10 memory-bandwidth floor for a 32B Q4_K_M (~19.8 GB of weights, read once
|
|
||||||
per step and shared across the 128 sequences) at ~273 GB/s is **~72 ms/step**.
|
|
||||||
llama.cpp's `mul_mat_q` spends ~155 ms/step on matmul = **~2.1x the bandwidth
|
|
||||||
floor**. vLLM's Marlin/Machete int4 GEMMs run much closer to the floor; that
|
|
||||||
efficiency difference is the ~547 -> 667 gap.
|
|
||||||
- The Q6_K matmul (`mul_mat_q` type 14) also shows pathological tail latency
|
|
||||||
(median 0.89 ms, max 5.5 ms) - the MMQ kernel is not well-tuned for the skinny
|
|
||||||
n=128 decode shape.
|
|
||||||
|
|
||||||
**The lever to beat 547 is a faster quantized decode GEMM**, i.e. a Marlin-style
|
|
||||||
int4 kernel for the decode shapes. This is exactly the direction of the prior
|
|
||||||
session's uncommitted `ggml/src/ggml-cuda/marlin-w4a16.cu` and
|
|
||||||
`fp4-grouped-moe.cu` (already wired via
|
|
||||||
`if (!split && ggml_cuda_w4a16_mul_mat(...)) return;` in `ggml_cuda_mul_mat`).
|
|
||||||
Note those target **w4a16 / GPTQ-int4**, while this GGUF is **K-quant (Q4_K/Q6_K)**,
|
|
||||||
so they are inert for this model - a Marlin path for K-quants (or shipping the
|
|
||||||
model in a Marlin-friendly int4 format) would be required. That is a multi-day
|
|
||||||
kernel effort, out of scope for this session, but it is the only lever that can
|
|
||||||
move the number.
|
|
||||||
|
|
||||||
### Why the "bump LLAMA_MAX_SEQ to 1024 -> 377" data point is consistent
|
|
||||||
|
|
||||||
`llama_batch_allocr` keeps `seq_cpl` as an `LLAMA_MAX_SEQ x LLAMA_MAX_SEQ` table
|
|
||||||
(`src/llama-batch.cpp`), so per-batch seq bookkeeping scales ~O(MAX_SEQ^2). At
|
|
||||||
MAX_SEQ=1024 that host cost becomes large enough (~70 ms/step) to dominate and
|
|
||||||
drop decode to 377. At MAX_SEQ=256 the same term is ~4.4 ms/step (the ~1.5% that
|
|
||||||
graphs reclaim); lowering to 128 would save ~3 ms/step (~1%). So MAX_SEQ tuning
|
|
||||||
confirms the host term is real but tiny at 256 - not a path to 667.
|
|
||||||
|
|
||||||
## How this would land in LocalAI
|
|
||||||
|
|
||||||
- **No host/graph patch is warranted** for this build: graphs already engage and
|
|
||||||
the decode is GPU-bound. A "pad the decode batch / force graph capture" patch
|
|
||||||
would change nothing measurable at high concurrency.
|
|
||||||
- The actionable upstream/vendored work is a **Marlin-style int4 decode GEMM**
|
|
||||||
(extend the prior `marlin-w4a16.cu` to cover K-quants, or quantize the served
|
|
||||||
model into a Marlin-friendly int4 layout). That is where the ~547 -> 667+ lives.
|
|
||||||
- If a small host win is still wanted, keep `LLAMA_MAX_SEQ` no larger than the max
|
|
||||||
concurrency actually used (the per-batch `seq_cpl` table is O(MAX_SEQ^2)).
|
|
||||||
|
|
||||||
## Reproduction
|
|
||||||
|
|
||||||
```
|
|
||||||
# baseline / A/B (cold, 30s cooldowns)
|
|
||||||
llama-batched-bench -m Qwen3-32B-Q4_K_M.gguf -npp 16 -ntg 128 -npl 32,64,128 \
|
|
||||||
-ngl 99 -b 2048 -ub 2048 -fa on # graphs on
|
|
||||||
GGML_CUDA_DISABLE_GRAPHS=1 ...same... # graphs off
|
|
||||||
|
|
||||||
# GPU util (graphs on): sample nvidia-smi during decode -> ~96%, 2184 MHz
|
|
||||||
# GPU active (graphs off): nsys profile -t cuda --delay=6 --duration=8 ...
|
|
||||||
# nsys stats --report cuda_gpu_kern_sum -> sum/0.516 ~= 7.72s of 8s = ~96%
|
|
||||||
```
|
|
||||||
|
|
||||||
## UPDATE: NVFP4 closes most of the decode gap (no Marlin-for-K-quants needed)
|
|
||||||
|
|
||||||
The diagnosis above said the lever is "a more bandwidth-efficient int4 decode GEMM"
|
|
||||||
and feared a multi-day Marlin-for-K-quants kernel. But the FP4-MMA path is already
|
|
||||||
that kernel. Measured (npl=128, cold A/B, npp=16 ntg=128):
|
|
||||||
|
|
||||||
| quant | decode S_TG (t/s) | vs Q4_K | vs vLLM 667 |
|
|
||||||
|---|---|---|---|
|
|
||||||
| Q4_K_M | 547 (548/546) | - | 82% |
|
|
||||||
| **NVFP4** | **619 (617/622)** | **+13%** | **93%** |
|
|
||||||
|
|
||||||
NVFP4's `mul_mat_q<NVFP4>` runs closer to the GB10 bandwidth floor at the thin n=128
|
|
||||||
decode shape than Q4_K's int8-MMQ (which ran ~2.1x above it). So shipping the model
|
|
||||||
as NVFP4 closes the decode gap from ~22% to ~7% AND wins prefill (1209 vs Q4 767 /
|
|
||||||
vLLM 800). Net on GB10: llama.cpp+NVFP4 is ahead on prefill (1.5x) and within ~7% on
|
|
||||||
decode. The remaining ~7% would be incremental FP4-MMA decode-kernel tuning, NOT a
|
|
||||||
from-scratch Marlin kernel - a much smaller, optional effort. NVFP4 is the answer to
|
|
||||||
both the prefill and the decode gap.
|
|
||||||
@@ -1,253 +0,0 @@
|
|||||||
# Closing the vLLM Gap on Blackwell (GB10 / DGX Spark) — Living Plan & Results
|
|
||||||
|
|
||||||
Target hardware: NVIDIA **GB10** (Grace-Blackwell, `sm_121a`, 119 GiB unified LPDDR5X), `dgx.casa`.
|
|
||||||
Model under test: **Qwen3-Coder-30B-A3B-Instruct** (MoE, 128 experts, top-8, ~3B active).
|
|
||||||
Engines: llama.cpp (CUDA, `~/llama.cpp-pr24423`, build `7a6ddc5`, `CMAKE_CUDA_ARCHITECTURES=121`) vs vLLM 0.23.0 (`~/vllm-bench`, torch 2.11.0+cu130).
|
|
||||||
|
|
||||||
> This is a working document. Each phase appends measured numbers, what was learned, and what's next.
|
|
||||||
> Methodology: `llama-bench` (single-stream pp/tg, built-in reps) and `llama-batched-bench` (`-npl` sweep,
|
|
||||||
> decode-phase aggregate `S_TG`, prefill aggregate `S_PP`); vLLM via `~/bench/vllm_conc.py` (decode-phase
|
|
||||||
> aggregate matched to `S_TG`). Same model/prompt/seed. Precision matched where possible.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Baseline results (established)
|
|
||||||
|
|
||||||
### Single-stream (B=1), matched ~8-bit
|
|
||||||
| Engine / precision | prefill pp512 (t/s) | decode tg128 (t/s) |
|
|
||||||
|---|---|---|
|
|
||||||
| llama.cpp **Q8_0** | 2215 ± 15 | **54.8 / 62.2** * |
|
|
||||||
| llama.cpp **F16** | 700 ± 24 | 32.9 ± 0.05 |
|
|
||||||
| vLLM **FP8** | 9155 ± 308 | 52.45 ± 0.05 |
|
|
||||||
|
|
||||||
\* two sessions; ~55 right after worker-stop (clocks settling), ~62 steady state. Both ≥ vLLM → **single-stream parity holds**.
|
|
||||||
|
|
||||||
### Concurrency sweep (decode-phase aggregate `S_TG`, prefill aggregate)
|
|
||||||
| B | llama Q8 prefill | vLLM FP8 prefill | llama Q8 decode | vLLM FP8 decode |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| 1 | 1080 | 9644 | 60.1 | 48.0 |
|
|
||||||
| 8 | 2189 | 33373 | 160.8 | 312.4 |
|
|
||||||
| 32 | 2198 | 99398 | 357.1 | 1171 |
|
|
||||||
| 64 | 2194 | 151990 | 519.2 | 2064 |
|
|
||||||
|
|
||||||
llama F16 prefill also flat: B=1 452 → B=8 723 → B=32 778. **Prefill flat at both precisions = kernel-throughput ceiling.**
|
|
||||||
|
|
||||||
### Our paged patch (LLAMA_KV_PAGED) — concurrency effect: NONE
|
|
||||||
Same Q8 binary, paged branch confirmed firing (137 placements at B=8), throughput identical within noise:
|
|
||||||
| | B=1 | B=8 | B=32 |
|
|
||||||
|---|---|---|---|
|
|
||||||
| stock decode | 61.2 | 171.7 | 377.0 |
|
|
||||||
| paged decode | 62.7 | 170.8 | 376.8 |
|
|
||||||
|
|
||||||
The patch is a placement-only correctness prototype; it doesn't implement concurrency mechanics. Result: single-stream-neutral and concurrency-neutral.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Root-cause diagnosis (nsys + code audit)
|
|
||||||
|
|
||||||
- **74.5% of GPU compute = `mul_mat_q`** (Q8_0 int8 MMQ GEMM, the MoE experts). Only cutlass kernel seen is `cutlass_80_tensorop` = **Ampere (sm_80)**, not Blackwell.
|
|
||||||
- ggml-cuda has **NO FP8 path** (no e4m3/e5m2 GEMM, no cuBLASLt FP8). Q8_0 runs the **Ampere-class int8 `mma.sync s8.s8.s32`** even on GB10 (`mma.cuh:924`, dispatched unconditionally `mmq.cu:307`).
|
|
||||||
- ggml-cuda **DOES** have a **native Blackwell FP4 path** (MXFP4 + NVFP4, `mma...kind::mxf4...e2m1`, `mma.cuh:1126`, gated `BLACKWELL_MMA_AVAILABLE`). Merged via #17906/#20644/#21074.
|
|
||||||
- **No fused MoE grouped GEMM**, no tcgen05/wgmma (warp-level `mma.sync` only).
|
|
||||||
- **Small per-expert GEMMs**: 512-tok ubatch → ~32 tok/expert (128 exp, top-8) → thin GEMMs, memory-bound, can't fill tensor-core tiles. vLLM processes 8192 tok/step → ~512 tok/expert → compute-bound + FP8.
|
|
||||||
- **The 45–69× gap is partly apples-to-oranges**: we compared llama Q8 (Ampere int8) vs vLLM FP8 (Blackwell). Upstream/NVIDIA benches put the *real* FP4-vs-FP8 prefill gap at **~25–50% long-context**, not 45–69×.
|
|
||||||
|
|
||||||
Key upstream refs: discussion #22042 (FP8 design: `ggml_mul_mat_ext` + scale tensors), #17906 (native MXFP4), #18250 (NVFP4-MoE closed not-planned).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## The levers (cheap → expensive) — execution log
|
|
||||||
|
|
||||||
### Lever 1 — NVFP4/MXFP4 model (use existing Blackwell FP4 path) + ubatch bump
|
|
||||||
Status: **IN PROGRESS** — single-stream done, concurrency next.
|
|
||||||
Quant: `llama-quantize F16 -> MXFP4_MOE` (type 38), 15.9 GiB / 4.47 BPW. (No NVFP4 in llama-quantize; MXFP4_MOE puts experts in MXFP4 = Blackwell FP4 MMA.)
|
|
||||||
|
|
||||||
Single-stream (llama-bench), MXFP4 vs Q8 vs vLLM-FP8:
|
|
||||||
| metric | llama Q8 | **llama MXFP4** | vLLM FP8 |
|
|
||||||
|---|---|---|---|
|
|
||||||
| prefill pp512 (ub512) | 2215 | **3061 ± 22** | 9155 |
|
|
||||||
| prefill pp2048 (ub512) | ~2200 | 3137 ± 7 | — |
|
|
||||||
| prefill pp2048 (**ub2048**) | — | **3441 ± 14** | — |
|
|
||||||
| decode tg128 | 62.2 | **86.4 ± 0.3** | 52.45 |
|
|
||||||
|
|
||||||
Findings:
|
|
||||||
- **MXFP4 decode 86.4 beats vLLM FP8 52.45 by 1.65×** (4-bit = less memory traffic; decode is memory-bound). llama wins decode outright.
|
|
||||||
- MXFP4 prefill +38% over Q8; **ub2048 lifts prefill +10%** (3137→3441). Single-stream prefill gap to vLLM: 4.1× (Q8) → **2.7× (MXFP4 at ub2048; 3.0× at ub512)**.
|
|
||||||
- Caveat: MXFP4 is 4-bit vs vLLM FP8 8-bit — not precision-matched. Fair match = vLLM NVFP4 (4-bit); pending.
|
|
||||||
Concurrency (decode-phase aggregate `S_TG`, ub2048), MXFP4 vs Q8 vs vLLM-FP8:
|
|
||||||
| B | Q8 dec | **MXFP4 dec** | vLLM dec | Q8 pp | **MXFP4 pp** | vLLM pp |
|
|
||||||
|---|---|---|---|---|---|---|
|
|
||||||
| 1 | 60.1 | **83.4** | 48.0 | 1080 | 1625 | 9644 |
|
|
||||||
| 8 | 160.8 | **267.4** | 312.4 | 2189 | 3634 | 33373 |
|
|
||||||
| 32 | 357.1 | **551.2** | 1171 | 2198 | 3651 | 99398 |
|
|
||||||
| 64 | 519.2 | **770.2** | 2064 | 2194 | 3648 | 151990 |
|
|
||||||
|
|
||||||
**Lever-1 verdict:** MXFP4 is a large, free win — decode +39–66% over Q8 across B=1–64 (per the table above), prefill plateau +66% (2200→3650). MXFP4 decode **wins at B=1, near-parity at B=8** vs vLLM; only falls behind at high concurrency. **Prefill still plateaus (~3650)** — the MoE prefill GEMM doesn't scale with batch (no fused grouped GEMM; ubatch-limited). That plateau is the real remaining structural gap → Levers 2–3. Quality caveat unchanged (MXFP4 4-bit vs vLLM FP8 8-bit; quality not yet evaluated).
|
|
||||||
|
|
||||||
### Lever 2 — `n_ubatch` / `n_batch` tuning (standalone)
|
|
||||||
Status: **DONE + SHIPPED (auto-default implemented)**
|
|
||||||
MXFP4 pp4096 vs ubatch: ub512=2994, **ub2048=3316**, ub4096=2820(noisy), ub8192=3180.
|
|
||||||
**Verdict:** prefill saturates at ub=2048; larger ubatch gives nothing. The ~3300–3650 ceiling is the **MoE GEMM kernel**, not batch size. → No more free config wins; the rest is kernel work (Levers 3–5).
|
|
||||||
**Implemented:** `core/backend/hardware_defaults.go` — `EffectiveBatchSize` now defaults the physical batch
|
|
||||||
(n_batch→n_ubatch alias) to **2048 on Blackwell** (`xsysinfo.IsNVIDIABlackwell`, cc≥12 / sm_120/121) when the
|
|
||||||
config leaves `batch:` unset; explicit `batch:` always wins. Detection is a shared Go helper; placed at the
|
|
||||||
common ModelOptions builder so it covers the C++ llama.cpp backend too. Tests: `hardware_defaults_internal_test.go`.
|
|
||||||
|
|
||||||
### Lever 1b — Standard Q4 vs MXFP4 (what's actually MXFP4-specific)
|
|
||||||
**Q4_K_M** (17.3 GiB) vs **MXFP4** (15.9 GiB), ub2048:
|
|
||||||
| metric | Q4_K_M | MXFP4 | Q8 |
|
|
||||||
|---|---|---|---|
|
|
||||||
| decode tg128 | **93.5** | 86.4 | 62.2 |
|
|
||||||
| prefill pp512 | 2164 | **3061** | 2215 |
|
|
||||||
| prefill pp2048 | 2953 | **3441** | ~2200 |
|
|
||||||
**Verdict:** the **decode win is just "4-bit"** — plain Q4_K_M matches/beats MXFP4 on decode (both memory-bound).
|
|
||||||
MXFP4's *only* real edge is **prefill (+41% over Q4_K_M)** via Blackwell FP4 tensor cores. So for shipping,
|
|
||||||
**"4-bit quant + ubatch=2048" captures most of the win portably**; MXFP4 is a Blackwell-only prefill extra.
|
|
||||||
|
|
||||||
### Lever 3 — Fused FP4/FP8 MoE grouped GEMM (+ activation-quant fusion)
|
|
||||||
Status: **DESIGNED + PROFILED, not built** (multi-week kernel R&D). The single biggest remaining prefill win.
|
|
||||||
|
|
||||||
**Decisive measurements:**
|
|
||||||
- Prefill does NOT scale with bigger single prompts (attention O(N²) confounds): MXFP4 pp2048=3295, pp8192=1524,
|
|
||||||
pp16384=2051. So the plateau is not a batch-size fix.
|
|
||||||
- Real gap is batched many-sequence prefill: B=32 llama 3651 vs vLLM 99398 = **27×**. llama.cpp MoE prefill runs
|
|
||||||
at only **~22 effective TFLOP/s** on the GB10 — far below the GPU. Large headroom.
|
|
||||||
- **nsys (MXFP4 pp2048):** `mul_mat_q<type39>` (MoE FP4 GEMM) = **37.2%**, `quantize_mmq_mxfp4` (act-quant) = 8.0%,
|
|
||||||
`mul_mat_q<type8>` (dense/attn, still Q8) = 10.1%, flash_attn = 8.8%. The native FP4 MMA *is* engaged — the
|
|
||||||
inefficiency is the **per-expert thin-tile MMQ scheduler** + **un-fused activation quant**.
|
|
||||||
|
|
||||||
**Target (precise):** the ~45% in `mmq.cu`'s grouped MoE path (`ggml_cuda_mul_mat_q` + `ids`, `mmid.cu`). Replace
|
|
||||||
the per-expert thin-tile scheduler with a CUTLASS-style grouped GEMM (full tiles regardless of tokens/expert) and
|
|
||||||
fuse `quantize_mmq_mxfp4` into the permute/gather. Dense Q8 matmuls (10%) are the separate Lever-4 (FP8) target.
|
|
||||||
Problem (measured): the prefill ceiling is the MoE expert GEMM. Today `ggml_cuda_mul_mat_q` with `ids`
|
|
||||||
(`mmq.cu:127`) launches one grouped MMQ over a 3D grid (z = expert), but each expert's tile is thin
|
|
||||||
(~tokens/expert columns) so int8/FP4 tensor cores run underfilled; throughput is memory-bound on weight
|
|
||||||
streaming and flat vs batch.
|
|
||||||
Approach:
|
|
||||||
- Replace the per-expert thin-tile scheduler with a **CUTLASS-style grouped GEMM** that concatenates all
|
|
||||||
experts' token-blocks into one problem with per-group offsets, so tiles are always full (m16n8k64 FP4 /
|
|
||||||
m16n8k32 FP8) regardless of per-expert token count. Mirrors vLLM's `fused_moe` + cutlass grouped GEMM.
|
|
||||||
- **Fuse activation quantization into the permute/gather** (the `quantize_mmq_q8_1`/FP4 quantize is currently a
|
|
||||||
separate 3.3% kernel) so the routed activations are quantized as they're scattered into expert order.
|
|
||||||
- Files: new kernel under `ggml/src/ggml-cuda/` (e.g. `moe-grouped-gemm.cu`) + dispatch hook in
|
|
||||||
`ggml_cuda_mul_mat_id` (`ggml-cuda.cu:2622`); reuse `mmid.cu` routing/`expert_bounds`.
|
|
||||||
- Effort: high (2–4 wks expert CUDA). Risk: numerics + sm_121 tile tuning. Expected payoff: the bulk of the
|
|
||||||
prefill gap (vLLM's MoE prefill advantage is mostly this). Upstream: #18250 (NVFP4-MoE) was closed
|
|
||||||
not-planned, so this would be a LocalAI patch or a fresh upstream proposal.
|
|
||||||
|
|
||||||
### Lever 4 — FP8 (e4m3) GEMM for dense layers
|
|
||||||
Status: **DESIGNED, not built** (blocked on a core ggml API change).
|
|
||||||
Problem: ggml-cuda has no FP8 matmul (only int8/FP4). vLLM runs qkv/o_proj/lm_head in FP8 on Blackwell
|
|
||||||
tensor cores. Our dense layers run int8-MMQ or f16-cuBLAS.
|
|
||||||
Approach (two options):
|
|
||||||
- (a) **cuBLASLt FP8**: route dense `mul_mat` through `cublasLtMatmul` with `CUDA_R_8F_E4M3` A/B and FP32
|
|
||||||
compute + scale pointers. Lowest kernel effort; gets library-tuned Blackwell FP8 immediately. Needs the
|
|
||||||
scale-tensor plumbing below.
|
|
||||||
- (b) **Hand-written sm_121 `mma.sync ...e4m3.e4m3.f32`** kernels in `mma.cuh`/`mmf.cu`. More control, more work.
|
|
||||||
- Prerequisite (both): the **`ggml_mul_mat_ext` / scale-tensor API** from upstream discussion #22042 —
|
|
||||||
per-tensor FP8 scales don't fit the block-scaled quant struct; `MUL_MAT`/`MUL_MAT_ID` must accept optional
|
|
||||||
scale tensors. This is a cross-cutting ggml change (graph + ops + all backends' fallbacks).
|
|
||||||
- Effort: high (API change is the hard part; cuBLASLt path is then moderate). Payoff: closes dense-layer
|
|
||||||
prefill/compute gap; complements Lever 3. Note: for *this* MoE model the experts dominate, so Lever 3 > 4.
|
|
||||||
|
|
||||||
### Lever 5 — tcgen05 / wgmma-class kernels for large-prefill tiles
|
|
||||||
Status: **DESIGNED, not built** (very high effort; last increment).
|
|
||||||
Problem: ggml's tensor-core path is warp-level `mma.sync` only (no `wgmma`/`tcgen05`). Blackwell's
|
|
||||||
tensor-memory `tcgen05` MMA (what CUTLASS uses) extracts substantially more throughput at large prefill tiles.
|
|
||||||
Approach: introduce warpgroup/tcgen05 GEMM main-loops for the FP4/FP8 paths (effectively adopting CUTLASS
|
|
||||||
3.x collective mainloops for sm_120/121), used when tile size is large enough (prefill). Decode (thin) keeps
|
|
||||||
`mma.sync`.
|
|
||||||
- Effort: very high (CUTLASS-class engineering). Payoff: the final slice of large-prefill throughput; only
|
|
||||||
worth it after Levers 3–4 land. Realistically: depend on/upstream CUTLASS kernels rather than hand-roll.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Paged attention — complete implementation (after kernels are fair)
|
|
||||||
The placement prototype is insufficient (measured: zero concurrency benefit). A real implementation needs all
|
|
||||||
four gaps. CPU foundation already built & verified (`PagedKVManager` P0–P3, `README.md`); the in-model parts
|
|
||||||
are unbuilt. **Build order and concrete design:**
|
|
||||||
|
|
||||||
1. **On-demand block allocation from a shared pool** (capacity win — more concurrent seqs before OOM).
|
|
||||||
- Replace `find_slot`'s ring-buffer (`llama-kv-cache.cpp:818`) with `PagedKVManager` block allocation; the
|
|
||||||
KV tensor becomes a shared block pool `[n_embd, block_size*num_blocks]`, sequences draw blocks on demand
|
|
||||||
(already prototyped on CPU: `paged_kv_manager.{h,cpp}`, `test_ggml_paged_rw.cpp`).
|
|
||||||
- The win should be measured where it counts: max concurrent sequences before OOM (not yet benchmarked — this step must be built first).
|
|
||||||
2. **Gather-read** so each seq attends only its own blocks (`get_k`/`get_v` `:1145/1165` → `ggml_get_rows`
|
|
||||||
gather into scratch, then existing attention). Numerically proven on CPU (`test_ggml_paged_attn.cpp`,
|
|
||||||
7.5e-08 vs reference). Needs `build_attn_paged` branch in `llama-graph.cpp` + Gate 0 in a real model.
|
|
||||||
3. **Continuous batching / scheduler** (no head-of-line blocking on mixed-length traffic). New scheduler in
|
|
||||||
the server slot path; admit/evict at block granularity; the dimension where paging beats llama.cpp's
|
|
||||||
current static batching. This is where the *real* concurrency win lives (vs our synthetic uniform test).
|
|
||||||
4. **Automatic prefix sharing** (block-hash dedup; `PagedKVManager::{compute_block_hashes,get_computed_blocks}`
|
|
||||||
already implemented & tested). Cross-tenant shared system prompts reuse physical blocks.
|
|
||||||
|
|
||||||
Status: design in `2026-06-19-paged-attention-llamacpp-design.md`; CPU P0–P3 done; in-model #1–#4 unbuilt.
|
|
||||||
**Then** measure concurrency in paging's real scenarios — **memory-pressured (max seqs before OOM)** and
|
|
||||||
**mixed-length continuous batching** — on the MXFP4 (fair-quant) footing, not the uniform/over-provisioned
|
|
||||||
test that (correctly) showed no benefit.
|
|
||||||
|
|
||||||
> Reality check from this session's data: paged attention is a **capacity + scheduling** win, not a per-token
|
|
||||||
> speed win. On GB10 with 119 GiB unified memory and uniform requests we are not memory-capacity-bound at B≤64, so the
|
|
||||||
> placement prototype showed nothing. Paging's value appears under memory pressure (many/long sequences) and
|
|
||||||
> bursty mixed-length traffic. The per-token throughput gap is a **kernel** problem (Levers 1–3), separate
|
|
||||||
> from paging.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Implementation plan A — Lever 3: FP4 MoE GEMM to vLLM parity
|
|
||||||
|
|
||||||
Goal: lift batched MoE prefill from ~3.65k t/s (B=32) toward vLLM's ~99k. Root cause (profiled):
|
|
||||||
`mul_mat_q<MXFP4>` runs at ~22 effective TFLOP/s — warp-level `mma.sync`, not Blackwell tcgen05.
|
|
||||||
Cheap knobs are exhausted (ubatch saturates at 2048; `GGML_CUDA_FORCE_CUBLAS` is a no-op 3419↔3423;
|
|
||||||
tile width already full at mmq_x=128). So parity needs kernel work, done iteratively on the DGX
|
|
||||||
(`~/llama.cpp-pr24423`, editable + rebuildable; diffs captured as `patches/`).
|
|
||||||
|
|
||||||
Phases (each: hypothesis → edit `ggml/src/ggml-cuda/` → `cmake --build build --target llama-bench` →
|
|
||||||
`llama-bench` MXFP4 pp/concurrency → record):
|
|
||||||
1. **Cheap kernel tweaks (low confidence, fast).** nwarps (occupancy), `mmq_y` tile, stream-k on/off,
|
|
||||||
FP4 load-tile path. Measure each. Likely small (<1.3x) — these don't change the warp-MMA ceiling.
|
|
||||||
- **Result (nwarps):** DEAD END. `nwarps` is locked by `static_assert(nwarps*tile_C::I == mmq_y)`
|
|
||||||
(mmq.cuh:3234) → nwarps=8 for mmq_y=128. Can't raise occupancy without co-scaling mmq_y to 256
|
|
||||||
(nwarps=16), which blows Blackwell shared-memory limits. The MMQ constants are tightly coupled;
|
|
||||||
they are not freely tunable. This confirms that parity needs the kernel rewrite (phase 3), not knobs.
|
|
||||||
2. **Fuse activation quant** (`quantize_mmq_mxfp4`, 8%) into the permute/gather. Removes a kernel +
|
|
||||||
a global round-trip. Tractable, ~1.1x.
|
|
||||||
- **Result:** NOT AVAILABLE as a cheap patch. `quantize_mmq_fp4_cuda` (mmq.cu:200) *already* takes
|
|
||||||
`ids_src1` — the gather is already fused into the quant. The only remaining fusion is quantize-on-load
|
|
||||||
*inside* the GEMM hot loop (intricate, ~8% ceiling, risky). ORippler's #24481 fuses the decode (MMVQ)
|
|
||||||
post-scale and intends a "BS>1" (prefill) follow-up — unwritten. Marginal; skip.
|
|
||||||
|
|
||||||
**Upstream survey (2026-06):** there is NO tcgen05/CUTLASS grouped-GEMM MoE kernel in ggml — not merged,
|
|
||||||
not in-flight, not a draft (Discussion #18369 is talk, no PR; #18250 closed not-planned). CUTLASS is not a
|
|
||||||
dependency (the profile's `cutlass_80_tensorop` is cuBLAS-internal). No fork has a portable MoE kernel
|
|
||||||
(croll83/llama.cpp-dgx is GatedDeltaNet-focused). Maintainer signal (woachk on #17906): "the path forward
|
|
||||||
is to wait for cuTile C++." So **nothing to cherry-pick; phase 3 is genuinely from-scratch.**
|
|
||||||
3. **The real lever — tcgen05 / CUTLASS FP4 grouped GEMM.** Replace the per-expert MMQ scheduler with a
|
|
||||||
CUTLASS 3.x collective-mainloop grouped GEMM (sm_120a, `e2m1` block-scaled, tcgen05 tensor-memory MMA),
|
|
||||||
one problem over all experts with per-group offsets, fused act-quant. This is what vLLM/FlashInfer use.
|
|
||||||
Multi-week; the honest path to parity. Prefer **upstream ggml** (issue drafted) over a private patch.
|
|
||||||
4. **Full-model low precision.** Quantize dense layers (qkv/o_proj/lm_head, the 10% Q8) to FP4/FP8 too so
|
|
||||||
the whole prefill runs on FP4 tensor cores, not int8-MMQ.
|
|
||||||
Exit per phase: measured t/s recorded here; stop a phase when it's a dead end (recorded as such).
|
|
||||||
Matching vLLM realistically requires phase 3; phases 1–2 are the warm-up + de-risking.
|
|
||||||
|
|
||||||
## Implementation plan B — Complete paged attention (the pivot)
|
|
||||||
|
|
||||||
CPU foundation done (P0–P3, `README.md`): vLLM-parity block manager + ggml write/gather + attention
|
|
||||||
numerics + placement Gate 0 (token-identical in-model). Remaining = make it deliver the multi-tenant wins.
|
|
||||||
Phases:
|
|
||||||
1. **On-demand shared-block pool** — replace `find_slot` ring buffer (`llama-kv-cache.cpp:818`) with
|
|
||||||
`PagedKVManager` block allocation; KV tensor = `[n_embd, block_size*num_blocks]` shared pool. Win:
|
|
||||||
fit more concurrent seqs before OOM. Test: max concurrent seqs at fixed budget vs contiguous.
|
|
||||||
2. **Gather-read** (`get_k/get_v` `:1145/1165` → `ggml_get_rows` into scratch) + `build_attn_paged` branch
|
|
||||||
in `llama-graph.cpp`. Numerically proven on CPU (7.5e-08). Gate 0: token-identical multi-seq.
|
|
||||||
3. **Continuous batching / scheduler** — admit/evict at block granularity in the server slot path. The
|
|
||||||
real concurrency win on mixed-length traffic (where the placement prototype showed nothing).
|
|
||||||
4. **Automatic prefix sharing** — block-hash dedup (`PagedKVManager::{compute_block_hashes,get_computed_blocks}`
|
|
||||||
already implemented + tested). Cross-tenant shared system prompts reuse physical blocks.
|
|
||||||
Then benchmark in paging's real regimes — **memory-pressured** + **mixed-length continuous batching** — on
|
|
||||||
the MXFP4 (fair-quant) footing. Note: GB10's 119 GiB unified memory means phase 1's capacity win needs genuine pressure
|
|
||||||
(long/many seqs) to show; the win is capacity + scheduling, not per-token speed.
|
|
||||||
|
|
||||||
## Honest scope note
|
|
||||||
Levers 3–5 and the complete paged implementation are each substantial (weeks of expert CUDA/systems work). This doc tracks what is **measured** vs **designed** vs **not-yet-built**, and never claims a number that wasn't run on the box.
|
|
||||||
@@ -1,59 +0,0 @@
|
|||||||
# FP4 grouped-GEMM MoE kernel (Lever 3) — scaffold + implementation plan
|
|
||||||
|
|
||||||
The one piece of work that actually closes the vLLM gap on Blackwell (GB10/sm_121). Both prefill and decode are
|
|
||||||
bottlenecked by the same kernel: `mul_mat_q<MXFP4>` (warp-level `mma.sync` grouped MMQ, ~22 TFLOP/s) is
|
|
||||||
**37%** of prefill and **54.6%** of decode-at-B=64 GPU time (`BENCHMARKS.md`). Paged attention can't touch
|
|
||||||
it (proven). The fix is a CUTLASS-3.x collective-mainloop grouped GEMM with block-scaled `e2m1` operands via
|
|
||||||
tcgen05 tensor-memory MMA — what vLLM/FlashInfer/TRT-LLM use.
|
|
||||||
|
|
||||||
## Scaffold (DONE — builds clean, default byte-identical)
|
|
||||||
|
|
||||||
Lives in the DGX checkout `~/llama.cpp-pr24423/ggml/src/ggml-cuda/` (to be rebased onto the pin as a patch /
|
|
||||||
upstreamed). Captured diff: `patches/kernel/0001-fp4-grouped-moe-scaffold.patch`.
|
|
||||||
|
|
||||||
- `fp4-grouped-moe.{cuh,cu}` — entry `ggml_cuda_fp4_grouped_moe(ctx, src0, src1, ids, dst) -> bool`
|
|
||||||
(true = handled, false = fall back to MMQ). Gated behind env `GGML_CUDA_FP4_GROUPED`. Currently always
|
|
||||||
returns false → **default build unchanged**.
|
|
||||||
- Hook in `ggml_cuda_mul_mat_id` (the MoE dispatch), before the `ggml_cuda_mul_mat_q(...ids...)` call:
|
|
||||||
`if (ggml_cuda_fp4_grouped_moe(...)) return;`. Builds via the `file(GLOB "*.cu")` (re-run cmake configure
|
|
||||||
after adding the file — GLOB is configure-time).
|
|
||||||
|
|
||||||
This is the integration seam. The kernel fills the stub.
|
|
||||||
|
|
||||||
## Implementation phases (each: build on GB10 → numerical parity vs `mul_mat_q<MXFP4>` → bench)
|
|
||||||
|
|
||||||
1. **Reference grouped GEMM (correctness first, slow OK).** Per-expert problem sizes + offsets from `ids`;
|
|
||||||
dequant `e2m1`+scales → BF16; loop CUTLASS (or cuBLAS) per group. Gate: output matches MMQ within fp tol
|
|
||||||
on a 2-expert toy + the real model (token-identical greedy). Establishes the harness + the data plumbing.
|
|
||||||
2. **CUTLASS GemmGrouped, sm_120a, BF16 operands.** Replace the loop with one
|
|
||||||
`cutlass::gemm::device::GemmGrouped` launch over all experts (per-group offsets). Measures the grouping win alone.
|
|
||||||
3. **Block-scaled FP4 operands (the real lever).** `e2m1` A/B with `e8m0`(MX)/`e4m3`(NV) block scales via the
|
|
||||||
Blackwell scaled-MMA collective (tcgen05 tensor-memory). This is where the TFLOP/s jumps. Needs CUTLASS
|
|
||||||
3.x + sm_120a; verify the block-scale layout matches ggml's MXFP4/NVFP4 packing.
|
|
||||||
4. **Fuse activation quant** (the F32→FP4 of src1) into the gather/permute prologue.
|
|
||||||
5. **Enable by default** on sm_120/121 when parity holds + faster; keep the env as an escape hatch.
|
|
||||||
|
|
||||||
## Dependencies / decisions
|
|
||||||
|
|
||||||
- **CUTLASS is not currently a ggml dependency** (the profile's `cutlass_80_tensorop` is cuBLAS-internal).
|
|
||||||
Adding it = submodule/fetch + include dir, gated to CUDA sm_120+. Float the approach with ggml maintainers
|
|
||||||
early (Discussion #18369 is the home; JohannesGaessler asked to discuss arch before big kernel work).
|
|
||||||
- Target sm_120a/121a (consumer Blackwell). Datacenter Blackwell (sm_100) is a separate tile config.
|
|
||||||
- Risk: needs ncu-driven iteration on the GB10; this is multi-week, expert-CUDA. No upstream base to fork
|
|
||||||
(exhaustive search confirmed). Net-new value upstream.
|
|
||||||
|
|
||||||
## DENSE scope — RESOLVED (TODO #28, benchmarked): dense needs an FP4 GEMM too
|
|
||||||
|
|
||||||
Benchmarked Qwen3-32B dense, vLLM W4A16 vs llama.cpp Q4_K_M (`BENCHMARKS.md`). **Dense prefill is 7.6–32×
|
|
||||||
behind** (llama int8-MMQ plateaus ~765 t/s; vLLM FP4 scales to 24.4k); decode ~parity at B=1, 2.2× at B=64.
|
|
||||||
So the kernel track is **two kernels, not one**:
|
|
||||||
|
|
||||||
- **(a) Dense FP4 GEMM** — a plain non-grouped CUTLASS/tcgen05 block-scaled FP4 GEMM. **Simpler than grouped;
|
|
||||||
land this FIRST** — it's the easier first kernel, benefits every dense model, and de-risks the FP4 collective
|
|
||||||
before the grouped variant. Hook: the non-MoE `ggml_cuda_mul_mat_q` (no `ids`) path.
|
|
||||||
- **(b) MoE grouped FP4 GEMM** — the scaffold above (`ggml_cuda_fp4_grouped_moe`), per-expert offsets.
|
|
||||||
|
|
||||||
Both share the same block-scaled `e2m1` collective; (a) is (b) with one group. Suggested order: build (a),
|
|
||||||
prove the FP4 collective + parity harness, then generalize to (b). (Aside: full W4A4 NVFP4 doesn't run on
|
|
||||||
GB10 today — FlashInfer ships no FP4 cubins for sm_121, so the dense `mm_fp4` kernel hangs/returns zeros; the
|
|
||||||
W4A16 Marlin path is the fast, correct one and is the fair comparison. See `BENCHMARKS.md` for the root cause.)
|
|
||||||
@@ -1,140 +0,0 @@
|
|||||||
# MXFP4-dense vs Q4_K_M quality check (Qwen3, GB10 / DGX Spark)
|
|
||||||
|
|
||||||
## Question
|
|
||||||
|
|
||||||
MXFP4-quantized **dense** Qwen3-32B is measurably faster on GB10 (Blackwell) than
|
|
||||||
Q4_K_M: ~1.58x concurrent prefill, ~1.2x decode, for free (just a requantize that
|
|
||||||
routes onto the FP4-MMA kernel). Before LocalAI recommends MXFP4-dense as a Blackwell
|
|
||||||
default, we must confirm its **quality is acceptable versus Q4_K** (Q4_K is normally the
|
|
||||||
stronger 4-bit format).
|
|
||||||
|
|
||||||
Critical caveat going in: the pre-existing `~/bench/q3-32b-mxfp4-dense.gguf` was built
|
|
||||||
with `--allow-requantize`, so it was suspected to be **double-quantized** (Q4_K_M ->
|
|
||||||
MXFP4), which would unfairly penalize MXFP4. The goal here was a *fair* answer.
|
|
||||||
|
|
||||||
## Verdict
|
|
||||||
|
|
||||||
**Do NOT recommend MXFP4-dense as a quality-equivalent replacement for Q4_K on
|
|
||||||
Blackwell.** A clean apples-to-apples test (same BF16 source, both 4-bit, no imatrix)
|
|
||||||
shows MXFP4-dense carries a **large** quality penalty that Q4_K does not:
|
|
||||||
|
|
||||||
- Q4_K_M costs **+2.6%** perplexity vs the BF16 baseline.
|
|
||||||
- MXFP4-dense costs **+30.8%** perplexity vs the BF16 baseline (i.e. **+27.5% worse
|
|
||||||
than Q4_K**).
|
|
||||||
|
|
||||||
The double-quant suspicion was correct but it was **not** the main culprit: even a clean
|
|
||||||
MXFP4-from-BF16 is dramatically worse than Q4_K. The ~1.58x prefill / ~1.2x decode
|
|
||||||
speedup is real, but it is not free on quality. MXFP4-dense output is still coherent (not
|
|
||||||
gibberish), so it is usable where raw throughput dominates and a quality hit is
|
|
||||||
acceptable, but it must not be presented as a drop-in, quality-neutral Q4_K replacement.
|
|
||||||
|
|
||||||
## Evidence
|
|
||||||
|
|
||||||
### 1. Provenance of the existing 32B MXFP4 (it is double-quant)
|
|
||||||
|
|
||||||
`~/dense_mxfp4.sh` (mtime matches the `q3-32b-mxfp4-dense.gguf` mtime, Jun 20 09:47)
|
|
||||||
created it:
|
|
||||||
|
|
||||||
```
|
|
||||||
SRC=$HOME/bench/q3-32b-gguf/Qwen3-32B-Q4_K_M.gguf # <-- source is Q4_K_M, not F16/BF16
|
|
||||||
OUT=$HOME/bench/q3-32b-mxfp4-dense.gguf
|
|
||||||
$QB --allow-requantize --tensor-type "attn=mxfp4" --tensor-type "ffn=mxfp4" \
|
|
||||||
"$SRC" "$OUT" MXFP4_MOE
|
|
||||||
```
|
|
||||||
|
|
||||||
Confirmed **double-quantized** (Q4_K_M -> MXFP4). Any PPL measured on this file
|
|
||||||
overstates MXFP4's true penalty, so the 32B number below is a loose upper bound, not the
|
|
||||||
fair answer.
|
|
||||||
|
|
||||||
### 2. 32B quick read (wikitext-2-raw test, 50 chunks, ctx 512, ngl 99)
|
|
||||||
|
|
||||||
`llama-perplexity`, PR build `~/llama.cpp-pr24423/build` (sm_121):
|
|
||||||
|
|
||||||
| 32B model | PPL | vs Q4_K |
|
|
||||||
|---|---|---|
|
|
||||||
| Qwen3-32B-Q4_K_M | **7.3865** +/- 0.177 | - |
|
|
||||||
| q3-32b-mxfp4-dense (double-quant) | **8.4638** +/- 0.206 | +14.6% |
|
|
||||||
|
|
||||||
MXFP4 is much worse than Q4_K here, **and** it is double-quant, so the quick read is
|
|
||||||
unfair -> escalated to a clean small-model comparison.
|
|
||||||
|
|
||||||
### 3. Fair comparison: clean small dense model (Qwen3-4B BF16)
|
|
||||||
|
|
||||||
The MXFP4-vs-Q4_K delta is primarily a *format* property (its direction, if not its exact size, should hold across model sizes), so a
|
|
||||||
small model gives a fast, clean answer. Downloaded `Qwen3-4B-BF16.gguf` (unsloth, ~7.7
|
|
||||||
GiB) and quantized it **from that same BF16 source** to both formats with the identical
|
|
||||||
recipe used for the 32B (no `--allow-requantize` needed, no imatrix on either side):
|
|
||||||
|
|
||||||
```
|
|
||||||
llama-quantize q3-4b-bf16.gguf q3-4b-q4km.gguf Q4_K_M
|
|
||||||
llama-quantize --tensor-type attn=mxfp4 --tensor-type ffn=mxfp4 \
|
|
||||||
q3-4b-bf16.gguf q3-4b-mxfp4.gguf MXFP4_MOE
|
|
||||||
```
|
|
||||||
|
|
||||||
Perplexity (wikitext-2-raw test, 50 chunks, ctx 512, ngl 99):
|
|
||||||
|
|
||||||
| Qwen3-4B | size | PPL | vs BF16 | vs Q4_K |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| BF16 (baseline) | 7672 MiB | **13.3188** +/- 0.416 | - | - |
|
|
||||||
| Q4_K_M | 2497 MiB | **13.6605** +/- 0.426 | **+2.57%** | - |
|
|
||||||
| MXFP4 (clean) | 2236 MiB (4.66 BPW) | **17.4183** +/- 0.561 | **+30.78%** | **+27.5%** |
|
|
||||||
|
|
||||||
This is the apples-to-apples quality answer: **clean MXFP4-from-BF16 is ~12x more lossy
|
|
||||||
than Q4_K relative to the BF16 baseline** (30.8% vs 2.6%). Notably the clean-4B MXFP4-vs-
|
|
||||||
Q4_K gap (+27.5%) is *wider* than the 32B double-quant gap (+14.6%), consistent with
|
|
||||||
smaller models being more quantization-sensitive - the double-quant did not invent the
|
|
||||||
problem, it is intrinsic to the format as quantized by `llama-quantize`.
|
|
||||||
|
|
||||||
### 4. Coherence spot-check (32B, llama-simple, n=60)
|
|
||||||
|
|
||||||
MXFP4-dense 32B is fully coherent, not degraded gibberish:
|
|
||||||
|
|
||||||
- "The capital of France is" -> MXFP4: "...Paris, is located near the Seine River..."
|
|
||||||
(correct); Q4_K similar.
|
|
||||||
- "Q: What is 17 multiplied by 23? A:" -> MXFP4 reasons via the distributive property
|
|
||||||
(sound); Q4_K answers 391 directly (correct).
|
|
||||||
- "def fibonacci(n):" -> both emit valid Python.
|
|
||||||
|
|
||||||
So the quality cost shows up as measurably higher perplexity (and would surface on harder
|
|
||||||
/ longer tasks), not as obviously broken text at short generation lengths.
|
|
||||||
|
|
||||||
## Why
|
|
||||||
|
|
||||||
`MXFP4_MOE` is a 4-bit float format (E2M1 values, shared E8M0 scale per block of 32,
|
|
||||||
round-to-nearest) designed for MoE expert tensors (gpt-oss et al.) with a coarse
|
|
||||||
per-block scale. Q4_K uses 6-bit per-sub-block scales and mins under FP16 superblock scales - materially
|
|
||||||
better for dense attention/FFN weights. Forcing MXFP4 onto dense layers to reach the FP4
|
|
||||||
kernel trades ~1.58x prefill for a large accuracy loss. The FP4-MMA speed path is real,
|
|
||||||
but the weights it accepts (MXFP4 here) are lossy for dense.
|
|
||||||
|
|
||||||
## Caveat, stated precisely
|
|
||||||
|
|
||||||
This measures **llama.cpp's `llama-quantize` MXFP4** (OCP MX FP4, RTN, **no imatrix**)
|
|
||||||
against **llama.cpp's Q4_K_M** (k-quant superblocks, also no imatrix here). It is a fair
|
|
||||||
format-vs-format comparison of exactly what LocalAI would ship if it routed a requantize
|
|
||||||
through this path. It does **not** claim FP4 is fundamentally unviable on Blackwell:
|
|
||||||
|
|
||||||
- An imatrix-aware MXFP4, or a better FP4 format with two-level scaling
|
|
||||||
(**NVFP4** - there are already `q3-32b-nvfp4` / `q3-32b-nvfp4a16` dirs on the box),
|
|
||||||
may close much of this gap and is the more promising Blackwell FP4 path to evaluate.
|
|
||||||
- The result is for Qwen3 dense; other families may differ in magnitude but the
|
|
||||||
format-level disadvantage of plain MXFP4 RTN vs Q4_K is expected to hold.
|
|
||||||
|
|
||||||
## Recommendation
|
|
||||||
|
|
||||||
- **Do not** ship a blanket "use MXFP4-dense on Blackwell" recommendation as a Q4_K
|
|
||||||
quality equivalent. The ~1.58x prefill / ~1.2x decode win comes with a real ~30% PPL
|
|
||||||
inflation (vs ~2.6% for Q4_K). Q4_K_M stays the right dense default on Blackwell.
|
|
||||||
- If exposing MXFP4-dense at all, gate it as an explicit **throughput-over-quality**
|
|
||||||
option with the perplexity caveat surfaced, not a default.
|
|
||||||
- MXFP4/FP4 remains correct where the model is trained for it (MoE / gpt-oss-style).
|
|
||||||
Pursue **NVFP4** (and/or imatrix-aware FP4) as the quality-competitive Blackwell FP4
|
|
||||||
format before making any FP4-dense recommendation.
|
|
||||||
|
|
||||||
## Reproduction (DGX Spark, GB10, build `~/llama.cpp-pr24423/build`, sm_121)
|
|
||||||
|
|
||||||
- Dataset: `~/wikitext-2-raw/wiki.test.raw` (wikitext-2-raw-v1 test).
|
|
||||||
- 32B: `~/ppl32b.sh` -> `~/ppl32b.out`; coherence `~/coh32b.sh` -> `~/coh32b.out`.
|
|
||||||
- Clean 4B: `~/fair4b.sh` -> `~/fair4b.out` (quantize + 3x perplexity).
|
|
||||||
- All runs `-ngl 99`, `--chunks 50`, `-c 512`. GB10 thermal-throttles but PPL is a
|
|
||||||
correctness metric, so thermal state does not affect these numbers.
|
|
||||||
@@ -1,41 +0,0 @@
|
|||||||
CXX ?= g++
|
|
||||||
CXXFLAGS ?= -std=c++17 -O2 -Wall -Wextra -I.
|
|
||||||
|
|
||||||
TESTS = test_free_block_queue test_block_pool test_paged_kv_manager test_prefix_cache
|
|
||||||
BINS = $(addprefix tests/,$(TESTS))
|
|
||||||
|
|
||||||
all: $(BINS)
|
|
||||||
|
|
||||||
tests/%: tests/%.cpp paged_kv_manager.cpp paged_kv_manager.h
|
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $< paged_kv_manager.cpp
|
|
||||||
|
|
||||||
check: all
|
|
||||||
@for t in $(BINS); do echo "== $$t =="; ./$$t || exit 1; done
|
|
||||||
|
|
||||||
paged-bench: paged-bench.cpp paged_kv_manager.cpp paged_kv_manager.h
|
|
||||||
$(CXX) $(CXXFLAGS) -o $@ paged-bench.cpp paged_kv_manager.cpp
|
|
||||||
|
|
||||||
bench: paged-bench
|
|
||||||
./paged-bench
|
|
||||||
|
|
||||||
# --- Optional ggml integration test (Phase 1: paged write/gather mechanism) ---
|
|
||||||
# Requires a built ggml. Override these to point at your checkout / build:
|
|
||||||
# make ggml-check GGML_SRC=<llama.cpp>/ggml GGML_BUILD=<ggml-build>
|
|
||||||
GGML_SRC ?= ../../llama-cpp-fallback-build/llama.cpp/ggml
|
|
||||||
GGML_BUILD ?= /tmp/ggml-build
|
|
||||||
GGML_LIBDIR = $(GGML_BUILD)/src
|
|
||||||
|
|
||||||
GGML_TESTS = test_ggml_paged_rw test_ggml_paged_attn
|
|
||||||
GGML_BINS = $(addprefix tests/,$(GGML_TESTS))
|
|
||||||
|
|
||||||
tests/test_ggml_%: tests/test_ggml_%.cpp paged_kv_manager.cpp paged_kv_manager.h
|
|
||||||
$(CXX) $(CXXFLAGS) -I$(GGML_SRC)/include -o $@ $< paged_kv_manager.cpp \
|
|
||||||
-L$(GGML_LIBDIR) -lggml -lggml-base -lggml-cpu -Wl,-rpath,$(GGML_LIBDIR)
|
|
||||||
|
|
||||||
ggml-check: $(GGML_BINS)
|
|
||||||
@for t in $(GGML_BINS); do echo "== $$t =="; ./$$t || exit 1; done
|
|
||||||
|
|
||||||
clean:
|
|
||||||
rm -f $(BINS) $(GGML_BINS) paged-bench
|
|
||||||
|
|
||||||
.PHONY: all check ggml-check clean
|
|
||||||
@@ -1,114 +0,0 @@
|
|||||||
# NVFP4-dense on DGX Spark (GB10, sm_121): is it the quality-preserving FP4 win MXFP4 wasn't?
|
|
||||||
|
|
||||||
Test rig: DGX Spark GB10 (sm_121), `~/llama.cpp-pr24423/build` (PR #24423, FP4 MMA + NVFP4
|
|
||||||
kernel), wikitext-2-raw, clean BF16 source `q3-4b-bf16.gguf` (the same source used for the
|
|
||||||
established MXFP4 / Q4_K fair test). NVFP4 and all comparison quants were produced clean from
|
|
||||||
BF16, no imatrix.
|
|
||||||
|
|
||||||
## Verdict (short)
|
|
||||||
|
|
||||||
YES on all the load-bearing questions, with one honest caveat:
|
|
||||||
|
|
||||||
1. llama.cpp CAN produce an NVFP4 GGUF.
|
|
||||||
2. NVFP4 quality is Q4_K-class, NOT MXFP4-class: +7.4% PPL vs BF16 (MXFP4 was +30.8%). It is
|
|
||||||
slightly behind Q4_K (+4.8% relative) but in the same ballpark, not on the quality cliff.
|
|
||||||
3. NVFP4 routes onto the FP4 MMA kernel and gets the FP4 prefill speedup: ~1.29x Q4_K on the
|
|
||||||
4B, tracking MXFP4 to within 5% (MXFP4 hit 1.58x on the 32B; NVFP4 should track it there too).
|
|
||||||
4. Output is coherent.
|
|
||||||
|
|
||||||
Bottom line: NVFP4-dense IS the quality-preserving FP4 win MXFP4 wasn't. It delivers
|
|
||||||
essentially the full FP4 prefill speedup at roughly Q4_K quality, where MXFP4 paid a ~28% (PPL vs Q4_K) quality
|
|
||||||
tax for the same speed. LocalAI can support/recommend NVFP4-dense on Blackwell for prefill-bound
|
|
||||||
workloads, with the caveat that it is marginally (~5%) behind Q4_K on perplexity; an imatrix-guided
|
|
||||||
NVFP4 quant would likely close most of that remaining gap.
|
|
||||||
|
|
||||||
## 1. Feasibility: can llama-quantize produce an NVFP4 GGUF? YES
|
|
||||||
|
|
||||||
- The type exists with a full quantize path, not just a kernel:
|
|
||||||
- `GGML_TYPE_NVFP4 = 40` (`ggml.h`), `GGML_FTYPE_MOSTLY_NVFP4 = 26`
|
|
||||||
- `quantize_nvfp4` / `quantize_row_nvfp4_ref` / `dequantize_row_nvfp4` registered in `ggml.c`
|
|
||||||
- type_name is `"nvfp4"`, block `QK_NVFP4` (per-16 FP8/E4M3 block scale + global scale)
|
|
||||||
- NVFP4 is NOT a top-level `llama-quantize` ftype (no `NVFP4` entry in the allowed-types list,
|
|
||||||
no reference in `tools/quantize/quantize.cpp` or `src/llama-quant.cpp`), BUT
|
|
||||||
`--tensor-type name=nvfp4` resolves it: `parse_ggml_type` matches the arg against
|
|
||||||
`ggml_type_name(...)`, which returns `"nvfp4"`. This is the exact same mechanism that produced
|
|
||||||
MXFP4-dense.
|
|
||||||
- Recipe used (mirrors the MXFP4-dense GGUF byte-for-byte in structure: token_embd Q8_0, all
|
|
||||||
norms F32, all 2D attn+ffn weights to FP4):
|
|
||||||
|
|
||||||
```
|
|
||||||
llama-quantize --tensor-type "attn=nvfp4" --tensor-type "ffn=nvfp4" \
|
|
||||||
q3-4b-bf16.gguf q3-4b-nvfp4.gguf Q8_0
|
|
||||||
```
|
|
||||||
|
|
||||||
Result: `q3-4b-nvfp4.gguf`, 2343.93 MiB, 4.89 BPW, ~5 s. (MXFP4-dense was 2350 MiB; same shape.)
|
|
||||||
Every `blk.N.attn_*` and `blk.N.ffn_*` reported `converting to nvfp4`; token_embd Q8_0; norms F32.
|
|
||||||
|
|
||||||
The on-box `~/bench/q3-32b-nvfp4*` dirs are vLLM HF safetensors (already 4-bit), not GGUF, and
|
|
||||||
do not feed llama.cpp - confirmed and irrelevant.
|
|
||||||
|
|
||||||
## 2. Quality (decisive): NVFP4 is Q4_K-class, not MXFP4-class
|
|
||||||
|
|
||||||
`llama-perplexity -f wiki.test.raw --chunks 50 -c 512 -ngl 99`, all clean from the same BF16 4B:
|
|
||||||
|
|
||||||
| Quant | PPL | vs BF16 | vs Q4_K |
|
|
||||||
|---------|--------|----------|----------|
|
|
||||||
| BF16 | 13.32 | - | - |
|
|
||||||
| Q4_K_M | 13.66 | +2.6% | - |
|
|
||||||
| NVFP4 | 14.31 | +7.4% | +4.8% |
|
|
||||||
| MXFP4 | 17.42 | +30.8% | +27.6% |
|
|
||||||
|
|
||||||
(NVFP4 measured this run: Final PPL = 14.3097 +/- 0.4457.)
|
|
||||||
|
|
||||||
NVFP4 lands much closer to Q4_K (gap 0.65 PPL) than to MXFP4 (gap 3.11 PPL). MXFP4's finer
|
|
||||||
sibling delivers: the two-level scaling (per-16 FP8 block scale + global scale) recovers almost
|
|
||||||
all of the quality MXFP4's coarse per-32 E8M0 scale threw away. It is not quite Q4_K, but it is
|
|
||||||
firmly in the "acceptable 4-bit" regime, not the lossy one.
|
|
||||||
|
|
||||||
## 3. Speed: NVFP4 routes onto the FP4 MMA kernel
|
|
||||||
|
|
||||||
No clean BF16 32B was on the box (only the vLLM NVFP4 safetensors and the Q4_K/MXFP4 32B GGUFs),
|
|
||||||
so per the brief this is the 4B speed signal - a 3-way cold A/B on the SAME 4B model, 45 s
|
|
||||||
cooldowns between runs (`-npp 512 -ntg 128 -npl 8,32,64 -b 2048 -ub 2048 -ngl 99`):
|
|
||||||
|
|
||||||
Prefill S_PP (t/s):
|
|
||||||
|
|
||||||
| B | Q4_K | NVFP4 | MXFP4 | NVFP4 / Q4_K | NVFP4 / MXFP4 |
|
|
||||||
|-----|--------|--------|--------|--------------|---------------|
|
|
||||||
| 8 | 4862 | 6313 | 6602 | 1.30x | 0.96x |
|
|
||||||
| 32 | 5020 | 6497 | 6836 | 1.29x | 0.95x |
|
|
||||||
| 64 | 5031 | 6490 | 6831 | 1.29x | 0.95x |
|
|
||||||
|
|
||||||
- NVFP4 prefill is within ~5% of MXFP4 at every batch size -> both land on the same FP4 MMA
|
|
||||||
kernel. NVFP4 does NOT fall back to a slow path.
|
|
||||||
- NVFP4 beats Q4_K's int8-MMQ prefill by ~1.29x on the 4B. The established 32B figures were
|
|
||||||
Q4_K S_PP ~767 and MXFP4 ~1209 (1.58x); since NVFP4 tracks MXFP4 to within 5%, NVFP4 on the
|
|
||||||
32B should likewise approach ~1.5x. (The 4B shows a smaller multiplier than the 32B because a
|
|
||||||
smaller model spends proportionally less time in the matmul the FP4 kernel accelerates.)
|
|
||||||
- Token-gen (S_TG) is comparable across all three (memory-bound), as expected.
|
|
||||||
|
|
||||||
## 4. Coherence
|
|
||||||
|
|
||||||
`llama-simple` (llama-cli hangs - avoided), NVFP4 4B:
|
|
||||||
- "The capital of France is" -> "...Paris. ...Germany is in Berlin. ...Italy is in Rome.
|
|
||||||
...Spain is in Madrid. ...Netherlands is in Amsterdam." (all correct)
|
|
||||||
- "Q: What is 17 plus 25? A:" -> "42." (correct)
|
|
||||||
|
|
||||||
Coherent and factually accurate.
|
|
||||||
|
|
||||||
## Recommendation for LocalAI on Blackwell
|
|
||||||
|
|
||||||
Support and recommend NVFP4-dense as the FP4 prefill option on Blackwell (sm_120/121), produced
|
|
||||||
via `--tensor-type attn=nvfp4 --tensor-type ffn=nvfp4` over a BF16 source (token_embd Q8_0,
|
|
||||||
norms F32). It gives ~the full FP4 prefill speedup (FP4 MMA kernel, ~1.3x Q4_K on 4B and
|
|
||||||
expected ~1.5x on larger models) at roughly Q4_K quality (+7.4% PPL vs BF16). This is the win
|
|
||||||
MXFP4 failed to deliver: MXFP4 paid a +30.8% quality tax for the same speed and was rejected.
|
|
||||||
|
|
||||||
Caveats / follow-ups:
|
|
||||||
- NVFP4 is still ~4.8% behind Q4_K on PPL. For quality-first deployments where the prefill win
|
|
||||||
does not matter, Q4_K_M remains the better pick.
|
|
||||||
- These NVFP4/Q4_K numbers are clean (no imatrix). An imatrix-guided NVFP4 quant is the obvious
|
|
||||||
next step and would likely close most of the remaining gap to Q4_K - worth measuring before a
|
|
||||||
blanket recommendation.
|
|
||||||
- A direct 32B NVFP4-vs-Q4_K speed run (needs a clean BF16 32B GGUF, not on the box) would
|
|
||||||
confirm the projected ~1.5x; the 4B signal plus the MXFP4-tracking already make this very likely.
|
|
||||||
@@ -1,115 +0,0 @@
|
|||||||
# Paged KV at high concurrency on a single GB10 - the datacenter-scale test
|
|
||||||
|
|
||||||
Closes the open question left by `PR22569_EVAL.md`: that eval could not test the
|
|
||||||
"paged KV unlocks thousands of sequences" thesis because **both** KV paths hit the
|
|
||||||
`LLAMA_MAX_SEQ=256` compile cap, and the 32B-dense model it used is compute-bound
|
|
||||||
(plateaus by npl=128 for an unrelated reason). This run removes both confounders:
|
|
||||||
**recompiled `LLAMA_MAX_SEQ=2048`** and used a **bandwidth-bound model (Qwen3-1.7B-Q8_0)**
|
|
||||||
where decode aggregate is free to keep climbing with concurrency.
|
|
||||||
|
|
||||||
Hardware: NVIDIA GB10 (sm_121, 119 GiB unified LPDDR5X, ~273 GB/s). Build:
|
|
||||||
`~/llama.cpp-pr22569` (PR #22569 paged path + the reshape fix), `LLAMA_MAX_SEQ=2048`,
|
|
||||||
sm_121 Release. Contiguous = `llama-batched-bench` (unified KV) `S_TG`. Paged =
|
|
||||||
`llama-paged -kvp --fit off` `aggregate tps`. `npp=16, ntg/n_predict=128, b=ub=2048,
|
|
||||||
-ngl 99`. Cold runs, 12 s cooldowns.
|
|
||||||
|
|
||||||
## TL;DR for the decision
|
|
||||||
|
|
||||||
**On a single GB10, paged KV does NOT deliver a throughput or concurrency win - the
|
|
||||||
aggregate-decode ceiling is set by the hardware, not the KV layout, and contiguous KV
|
|
||||||
already reaches it.** Measured across two model regimes and concurrency up to 2048
|
|
||||||
sequences:
|
|
||||||
|
|
||||||
- Aggregate decode **plateaus** once the GPU saturates - for both KV layouts:
|
|
||||||
- 32B-dense (compute-bound): ~540 t/s, flat from npl=128 (prior eval).
|
|
||||||
- 1.7B (bandwidth-bound): ~3,200-3,700 t/s, flat from npl=512 (this run).
|
|
||||||
- Paged and contiguous land at the **same ceiling**; PR #22569's paged op was 12-13%
|
|
||||||
*slower* than the mature contiguous flash-attention path at equal concurrency on 32B.
|
|
||||||
- Pushing concurrency past the plateau is **actively harmful to UX**: per-sequence
|
|
||||||
throughput collapses (23 -> 1.9 tok/s) and TTFT explodes (0.6 s -> 4.3 s avg, **64 s
|
|
||||||
max**) while aggregate stays flat.
|
|
||||||
|
|
||||||
**vLLM's ~24k aggregate headline is unreachable on a single GB10 with these models
|
|
||||||
regardless of KV layout** - it needs aggregate memory bandwidth / compute that one GB10
|
|
||||||
does not have (i.e. many GPUs). Paged KV is a **memory-capacity / anti-fragmentation /
|
|
||||||
prefix-sharing** feature, not a single-node throughput-ceiling feature. The static
|
|
||||||
single-model benchmark deliberately does not create the memory-pressure regime where
|
|
||||||
paging pays off, which is exactly why no win appears.
|
|
||||||
|
|
||||||
## The numbers
|
|
||||||
|
|
||||||
### Aggregate decode vs concurrency, Qwen3-1.7B-Q8_0 (bandwidth-bound), `LLAMA_MAX_SEQ=2048`
|
|
||||||
|
|
||||||
| npl | contiguous `S_TG` (t/s) | paged `aggregate tps` (t/s) | paged per-seq tps | paged TTFT avg / max |
|
|
||||||
|----:|------------------------:|----------------------------:|------------------:|---------------------:|
|
|
||||||
| 128 | 2,643 | 2,887 | 23-25 | - |
|
|
||||||
| 256 | 2,925 | - | - | - |
|
|
||||||
| 512 | 3,215 | 3,637 | 7.2-7.8 | 0.57 s / 0.90 s |
|
|
||||||
| 1024 | 3,118 | 3,695 | 3.7-4.2 | 1.17 s / 2.37 s |
|
|
||||||
| 2048 | (not run) | 3,608 | 1.9-14.6 | 4.28 s / **63.8 s** |
|
|
||||||
|
|
||||||
Both paths flatten by npl~512. 8x more concurrency (128->1024) buys contiguous only
|
|
||||||
**+18%** and paged **+28%**, then both stop. (The two tools meter slightly differently -
|
|
||||||
`llama-paged` aggregate vs `batched-bench` decode-only `S_TG` - so the small paged-vs-
|
|
||||||
contiguous offset is not a real paged advantage; the prior apples-to-apples 32B eval had
|
|
||||||
paged 12-13% *behind*.)
|
|
||||||
|
|
||||||
### Why it plateaus (the hardware ceiling, not the KV layout)
|
|
||||||
|
|
||||||
Decode is memory-bandwidth-bound: each step reads the model weights once and shares that
|
|
||||||
read across the whole batch. Once concurrency is high enough that the shared weight-read
|
|
||||||
is amortized, the per-step cost is dominated by KV reads + attention + host work, none of
|
|
||||||
which paging makes cheaper. The GB10's ~273 GB/s sets the floor on per-step time; at the plateau the GPU
|
|
||||||
is ~saturated. Adding sequences past that point cannot raise aggregate - it only divides
|
|
||||||
the same throughput across more users (per-seq tps falls, TTFT rises). The 32B-dense case
|
|
||||||
plateaus even earlier (npl=128) because it saturates on **compute** (weight matmuls), not
|
|
||||||
bandwidth - the kernel decomposition is in `VLLM_DECOMPOSITION.md`.
|
|
||||||
|
|
||||||
## What paged KV is actually for (the honest, deliverable value)
|
|
||||||
|
|
||||||
Paging never helps a static, uniform-length, single-model benchmark on a GPU with memory
|
|
||||||
to spare - there is no fragmentation and no over-reservation to reclaim. Its real wins,
|
|
||||||
which require the regime this hardware+benchmark does not exercise, are:
|
|
||||||
|
|
||||||
1. **Concurrent-tenant capacity under memory pressure.** Block KV fits more *diverse*
|
|
||||||
in-flight sequences (variable, dynamically arriving/leaving contexts) without the
|
|
||||||
contiguous path's per-slot reservation/fragmentation. Pays off when KV memory, not
|
|
||||||
compute/bandwidth, is the binding constraint - i.e. at multi-GPU datacenter scale or
|
|
||||||
with very long/variable contexts.
|
|
||||||
2. **Cross-request prefix sharing.** A chained-hash block cache shares identical system
|
|
||||||
prompts / RAG preambles across requests (vLLM's `block_pool.py` + block-hash map). A
|
|
||||||
real token-budget win for shared-prefix workloads; PR #22569 defers this to a
|
|
||||||
non-existent Phase 2 (our from-scratch P0 has the machinery).
|
|
||||||
|
|
||||||
These are measured as **max concurrent distinct tenants** and **KV memory saved**, not as
|
|
||||||
aggregate tok/s on one model. They do not move the single-GB10 throughput ceiling.
|
|
||||||
|
|
||||||
## Recommendation
|
|
||||||
|
|
||||||
- **Do not pitch paged KV as a single-GB10 throughput lever** - it is measured flat to
|
|
||||||
the contiguous ceiling (and PR #22569 is slower). Doing so would not survive a
|
|
||||||
benchmark.
|
|
||||||
- **The single-GB10 throughput story is already strong without paging:** llama.cpp is
|
|
||||||
ahead of vLLM single-stream (MXFP4 1153 > 800) and at ~70-81% of vLLM aggregate at
|
|
||||||
npl<=128 with a near-identical batching multiplier (`VLLM_DECOMPOSITION.md`). Ship the
|
|
||||||
MXFP4/NVFP4-dense prefill win (`NVFP4_TEST.md`) - that is the cheap, real, defensible
|
|
||||||
Blackwell number.
|
|
||||||
- **If datacenter-scale (thousands of concurrent tenants) is the genuine target,** the
|
|
||||||
lever is **multiple GPUs** plus paged KV's **capacity + prefix-sharing** features -
|
|
||||||
framed and measured as concurrent-tenant capacity and KV memory saved, on a
|
|
||||||
variable-context / shared-prefix workload. A single GB10 cannot produce the ~24k
|
|
||||||
aggregate regardless of KV layout; that is a fleet-level result.
|
|
||||||
|
|
||||||
## Reproduction (DGX, `~/llama.cpp-pr22569`, `LLAMA_MAX_SEQ=2048`)
|
|
||||||
|
|
||||||
```sh
|
|
||||||
M=~/bench/draft17/Qwen3-1.7B-Q8_0.gguf
|
|
||||||
# contiguous
|
|
||||||
for NPL in 128 256 512 1024; do
|
|
||||||
./build/bin/llama-batched-bench -m $M -npp 16 -ntg 128 -npl $NPL -ngl 99 \
|
|
||||||
-b 2048 -ub 2048 -fa on -c $((NPL*160)); done
|
|
||||||
# paged
|
|
||||||
for NPL in 512 1024 2048; do
|
|
||||||
./build/bin/llama-paged -m $M -kvp --fit off -ngpub 32768 -ncpub 128 \
|
|
||||||
-np $NPL -ns $NPL -n 128 -b 2048 -ub 2048 -ngl 99; done
|
|
||||||
```
|
|
||||||
@@ -1,170 +0,0 @@
|
|||||||
# Paged KV: target-readiness (correctness, dynamic benchmark, 2xH200 projection)
|
|
||||||
|
|
||||||
Target hardware: **~2x H200** (281 GB HBM3e total, ~4.8 TB/s per GPU). The GB10 box is
|
|
||||||
the *test* rig, not the target - and several earlier "no win" findings are GB10-specific
|
|
||||||
artifacts (low bandwidth caps throughput before KV memory ever binds). This document
|
|
||||||
delivers the three things needed to push paged KV toward the real target:
|
|
||||||
|
|
||||||
1. **Correctness** of the paged path - verified (and a blocking bug found + fixed).
|
|
||||||
2. **A dynamic-load benchmark** that actually exercises where paging wins (`paged-loadgen.cpp`).
|
|
||||||
3. **A projection** of the paged-KV payoff on 2x H200, grounded in measured GB10 numbers.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 1. Correctness: PASS (after fixing the auto-fit OOM)
|
|
||||||
|
|
||||||
`test-paged-kv-e2e` checks the paged decode path against the contiguous reference
|
|
||||||
(greedy argmax + top-5 set overlap >= 4). On the box it was previously **unverified** -
|
|
||||||
it aborted at context creation. Root cause found:
|
|
||||||
|
|
||||||
- `common_fit_paged_kv_blocks` (`common/common.cpp:1144`) **unconditionally overrides**
|
|
||||||
`n_gpu_blocks` from `ggml_backend_dev_memory`, which **over-reports free VRAM on the
|
|
||||||
GB10 integrated/unified device** (it sized **~245 GB of KV on a 119 GB box** ->
|
|
||||||
`cudaMalloc` OOM -> `GGML_ASSERT` abort in `llama-kv-cache-paged.cpp:74`). The test's
|
|
||||||
explicit `n_gpu_blocks=64` was being clobbered because `params.fit_params` defaults on.
|
|
||||||
|
|
||||||
**Fix (item-1 patch, applied on the box):**
|
|
||||||
|
|
||||||
```diff
|
|
||||||
--- a/tests/test-paged-kv-e2e.cpp
|
|
||||||
+++ b/tests/test-paged-kv-e2e.cpp
|
|
||||||
@@ run_paged()
|
|
||||||
params.kv_paged = true;
|
|
||||||
+ params.fit_params = false; // honor explicit n_gpu_blocks; GB10 dev_memory over-reports free VRAM
|
|
||||||
params.n_gpu_blocks = 64;
|
|
||||||
```
|
|
||||||
|
|
||||||
**Result (Qwen3-0.6B-Q8_0, GB10):**
|
|
||||||
|
|
||||||
```
|
|
||||||
test-paged-kv-e2e: top-5 argmax match: ref=3743 paged=3743
|
|
||||||
test-paged-kv-e2e: top-5 set overlap: 5/5 (require >= 4)
|
|
||||||
test-paged-kv-e2e: PASSED
|
|
||||||
```
|
|
||||||
|
|
||||||
The paged op is **numerically greedy-equivalent to the contiguous path**. The reshape
|
|
||||||
fix from `PR22569_EVAL.md` (decoupled head_dim) is already applied in the checkout.
|
|
||||||
|
|
||||||
**Target-readiness caveat (the durable fix, not just the test):** the auto-fit itself is
|
|
||||||
brittle and must be hardened before it runs on a real serving box - even though
|
|
||||||
`ggml_backend_dev_memory` reports correctly on a discrete H200, the function should still
|
|
||||||
(a) early-return when `!params.fit_params`, (b) **clamp** the computed `n_gpu_blocks` so
|
|
||||||
`n_gpu_blocks * block_bytes <= free_vram - margin` using the *actual* KV element size, and
|
|
||||||
(c) not override an explicitly-set value. One-screen change in `common_fit_paged_kv_blocks`.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 2. Dynamic-load benchmark - `paged-loadgen.cpp`
|
|
||||||
|
|
||||||
**Why the existing tools show no paged win:** `llama-batched-bench` and the stock
|
|
||||||
`examples/paged/paged.cpp` both run **fixed-length, all-arrive-at-once, single-prompt**
|
|
||||||
load. That has no over-reservation and no fragmentation, so contiguous KV is already
|
|
||||||
memory-optimal and paging has nothing to reclaim (`PAGED_KV_HIGH_CONCURRENCY.md`). The
|
|
||||||
paged win only exists under **variable lengths + continuous arrival + shared prefixes** -
|
|
||||||
the real serving regime. No tool in the tree creates it.
|
|
||||||
|
|
||||||
`paged-loadgen.cpp` (committed here) does, via the confirmed `llama_paged_scheduler_*`
|
|
||||||
API:
|
|
||||||
|
|
||||||
- **shared system prefix** (`LG_PREFIX` tokens) prepended to every request -> exercises
|
|
||||||
cross-request prefix sharing,
|
|
||||||
- **variable prompt length** (`LG_SUFMIN..LG_SUFMAX` unique suffix),
|
|
||||||
- **bimodal generation length** (`LG_GENLONG` for `LG_LONGPCT`% of requests, else
|
|
||||||
`LG_GENSHORT`) - the over-reservation driver,
|
|
||||||
- **continuous arrival**: keeps `LG_INFLIGHT` requests live, admitting a new one each time
|
|
||||||
one finishes.
|
|
||||||
|
|
||||||
It reports the load-bearing number for the buy decision - the **capacity ratio**:
|
|
||||||
|
|
||||||
```
|
|
||||||
paged peak KV = sum over live seqs of ceil(used/block)*block * kv_bytes_per_token
|
|
||||||
contiguous reserve = peak_inflight * max_ctx * kv_bytes_per_token (worst-case per slot)
|
|
||||||
CAPACITY RATIO = contiguous_reserve / paged_peak (+ prefix sharing on top)
|
|
||||||
```
|
|
||||||
|
|
||||||
`kv_bytes_per_token = 2 * n_layer * n_head_kv * head_dim * sizeof(f16)` - confirmed against
|
|
||||||
`llama-kv-cache-paged.cpp` (e.g. Qwen3-32B: 2*64*8*128*2 = **256 KiB/token**).
|
|
||||||
|
|
||||||
**How to run (on the target):** drop into PR #22569's `examples/paged/`, add to its
|
|
||||||
CMakeLists next to `llama-paged`, build, then e.g.
|
|
||||||
`LG_INFLIGHT=2048 LG_LONGPCT=15 paged-loadgen -m <model> -kvp --fit off -ngpub <N> -ncpub <M> -ngl 99`.
|
|
||||||
Sweep `LG_INFLIGHT` to the throughput plateau and read the capacity ratio at that point.
|
|
||||||
It is written to run on the target (2x H200) where the regime exists; on GB10 it runs but
|
|
||||||
the ratio is uninteresting because throughput plateaus before memory binds (see below).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 3. Projection to 2x H200 (grounded in measured GB10 numbers)
|
|
||||||
|
|
||||||
### Measured on GB10 (this work)
|
|
||||||
|
|
||||||
| model | decode plateau (aggregate) | plateau concurrency | bound by |
|
|
||||||
|---|---|---|---|
|
|
||||||
| Qwen3-32B-Q4_K_M (dense) | ~540 t/s | npl ~128 | compute |
|
|
||||||
| Qwen3-1.7B-Q8_0 | ~3,200 t/s | npl ~512 | bandwidth |
|
|
||||||
|
|
||||||
### Hardware ratios (per GPU, then 2x TP at ~85% scaling)
|
|
||||||
|
|
||||||
| | GB10 | H200 | per-GPU x | 2x H200 (TP) x |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| mem bandwidth | 273 GB/s | ~4.8 TB/s | 17.6 | ~30 |
|
|
||||||
| BF16 compute | ~213 TFLOP | ~989 TFLOP | 4.6 | ~8 |
|
|
||||||
| HBM | 119 GB | 141 GB | 1.18 | 2.4 (281 GB) |
|
|
||||||
|
|
||||||
Decode is bandwidth-bound, so **both the aggregate ceiling and the concurrency at which it
|
|
||||||
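is reached scale with bandwidth (~30x on 2x H200)** (assumption to verify: the 32B-dense GB10 plateau above is listed as compute-bound; if it remains compute-bound on H200, the ~8x compute ratio applies instead):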
|
|
||||||
|
|
||||||
- **32B-dense aggregate decode ceiling:** 540 x 30 ~= **16,000 t/s**, reached at
|
|
||||||
~128 x 30 ~= **3,800 concurrent sequences**.
|
|
||||||
|
|
||||||
### Why paged KV becomes the binding lever on 2x H200 (and didn't on GB10)
|
|
||||||
|
|
||||||
To reach that ~16k t/s ceiling you must hold **~3,800 sequences** of KV. The memory math:
|
|
||||||
|
|
||||||
- 32B weights (FP8) ~= 32 GB, sharded over 2 GPUs -> ~250 GB HBM free for KV.
|
|
||||||
- 32B KV = 256 KiB/token. At an avg held context of 2,000 tokens, **per seq = 512 MiB**.
|
|
||||||
- Contiguous unified KV (reserve for the live set) fits ~250 GB / 512 MiB ~= **~490
|
|
||||||
sequences** - **8x short of the 3,800 needed to reach the throughput ceiling.**
|
|
||||||
|
|
||||||
So on 2x H200 **KV memory is the binding constraint at the throughput-optimal concurrency**,
|
|
||||||
and contiguous KV strands most of the bandwidth (you'd run at a fraction of 16k t/s). This
|
|
||||||
is the gap paged KV closes. On GB10 it never appeared because GB10's 30x-lower bandwidth
|
|
||||||
caps decode at npl ~128, whose KV fits in memory trivially - the constraint order is
|
|
||||||
inverted on the real target.
|
|
||||||
|
|
||||||
### Magnitude of the paged win
|
|
||||||
|
|
||||||
Paging recovers concurrency two ways, both multiplicative on achievable throughput:
|
|
||||||
|
|
||||||
1. **No over-reservation.** Contiguous must back `max_ctx` per slot; paging uses
|
|
||||||
`ceil(actual/block)`. For a realistic bimodal workload (most generations short, ~15%
|
|
||||||
long, prompts ~512) the average held context is several-fold below `max_ctx` ->
|
|
||||||
`paged-loadgen` capacity ratio typically **~4-10x** (it measures the exact number for
|
|
||||||
your workload's length distribution).
|
|
||||||
2. **Cross-request prefix sharing** of shared system prompts / RAG preambles - additional,
|
|
||||||
workload-dependent (chained-hash block cache; vLLM's `block_pool.py`).
|
|
||||||
|
|
||||||
Net: on 2x H200, paged KV is plausibly the difference between serving **~500 and ~3,800**
|
|
||||||
concurrent 32B sequences in HBM, i.e. between a fraction of and ~all of the **~16k t/s**
|
|
||||||
decode ceiling. **That is the datacenter payoff, and it is real on the target even though
|
|
||||||
GB10 cannot exhibit it.**
|
|
||||||
|
|
||||||
### Honest caveats for the buy case
|
|
||||||
|
|
||||||
- These are **projections** from GB10 + spec ratios; the capacity multiplier depends on the
|
|
||||||
workload's context-length distribution (more variable -> bigger paged win) and TP
|
|
||||||
efficiency. `paged-loadgen` measures it directly once you have target-GPU time.
|
|
||||||
- The **paged op itself still needs work**: PR #22569's `ggml_paged_attn` was 12-13%
|
|
||||||
*slower* than the mature contiguous flash-attention path at equal concurrency
|
|
||||||
(`PR22569_EVAL.md`), lacks prefix sharing (deferred to a non-existent Phase 2), and has
|
|
||||||
the fit-robustness bug above. Adopting paged KV for the target means either hardening
|
|
||||||
#22569 or finishing the from-scratch P4 - the capacity win above assumes a *correct,
|
|
||||||
competitive* op, which is the remaining engineering.
|
|
||||||
- Prefill on either KV layout is compute-capped, not a paged concern.
|
|
||||||
|
|
||||||
**Bottom line for the decision:** paged KV **is** the right lever for the 2x H200 target -
|
|
||||||
the GB10 "no win" result is a bandwidth artifact, not a verdict. The paged path is now
|
|
||||||
**correctness-verified**, the **benchmark to size the win exists**, and the projection
|
|
||||||
says the payoff is **~5-10x concurrent-tenant capacity -> several-fold higher aggregate
|
|
||||||
decode** on the target. The remaining work is hardening/finishing the paged op, not
|
|
||||||
proving the thesis.
|
|
||||||
@@ -1,55 +0,0 @@
|
|||||||
# Making llama.cpp/LocalAI a viable vLLM alternative — phased plan
|
|
||||||
|
|
||||||
Goal: close the practical gap to vLLM for both single-user *speed* and multi-user *throughput*, while keeping
|
|
||||||
quality (no lossy quant). Grounded in measured benchmarks + research (`BENCHMARKS.md`, `BLACKWELL_KERNEL_GAPS.md`,
|
|
||||||
`VLLM_THROUGHPUT_GAP.md`). The gap is NOT one thing — each phase targets a distinct, independent lever.
|
|
||||||
|
|
||||||
## Where vLLM actually leads (measured, GB10 / Qwen3-32B)
|
|
||||||
|
|
||||||
- **Single-user decode:** ~parity (10.2 vs 11.7) — bandwidth-bound. vLLM's edge is **spec-dec** (lossless).
|
|
||||||
- **Multi-user decode:** gap grows to ~2.2× at B=64 (kernel + scheduler).
|
|
||||||
- **Prefill aggregate:** llama plateaus ~765, vLLM scales to 24k — **paged KV + chunked prefill + kernel**.
|
|
||||||
- Note: on GB10 vLLM's FP4 trump card is *broken* (falls back to Marlin); llama.cpp runs reliably — a real
|
|
||||||
viability point. vLLM is structurally ahead mainly via **paged KV, chunked prefill, cross-request prefix cache**.
|
|
||||||
|
|
||||||
## Phases
|
|
||||||
|
|
||||||
### Phase 1 — Hardware-tuned config (PR #10411) — DONE
|
|
||||||
Folded into the hardware-defaults path (`core/config/hardware_defaults.go`):
|
|
||||||
- Blackwell physical batch (n_ubatch) = 2048.
|
|
||||||
- **VRAM-scaled `n_parallel` default** (>=32 GiB→8, >=8 GiB→4, >=4 GiB→2): turns on concurrency + continuous batching,
|
|
||||||
which the backend leaves OFF at its `n_parallel=1` default. Unified KV → slots share the budget (no extra
|
|
||||||
KV memory). Single-host (local GPU) + distributed router (per node). Already-good defaults confirmed:
|
|
||||||
flash-attn=auto, context=4096.
|
|
||||||
|
|
||||||
### Phase 2 — Paged / block KV cache ← biggest structural multi-user lever
|
|
||||||
vLLM's PagedAttention lifts KV utilization ~20-38% → ~96%. llama.cpp's own A10G data (draft PR #22569):
|
|
||||||
contiguous OOMs at 26 seqs / 496 t/s → paged 247 seqs / 1256 t/s (**~9.5× concurrency, 2.5× aggregate**).
|
|
||||||
- Build on / complete **upstream draft PR #22569** (`-kvp`, block manager + paged-attn ggml op, FCFS scheduler)
|
|
||||||
rather than the from-scratch series we prototyped (`paged/`). Our CPU-verified block manager + gather-read
|
|
||||||
design informs the review/port; the upstream momentum is the place to land it.
|
|
||||||
- Phase 2b: cross-request prefix sharing (block-hash dedup) — our `PagedKVManager` already implements it.
|
|
||||||
|
|
||||||
### Phase 3 — Prefill amortization (chunked prefill + n_batch/n_ubatch split)
|
|
||||||
llama aggregate prefill plateaus because (a) one prompt saturates compute, (b) the per-forward GEMM M-dim is
|
|
||||||
capped at `n_ubatch`=512, (c) no scheduler chunked prefill (draft #10718 abandoned).
|
|
||||||
- Split logical `n_batch` from physical `n_ubatch` (LocalAI ties them today) so concurrent prefills batch into
|
|
||||||
a larger logical batch while keeping ubatch at the Blackwell sweet spot (2048).
|
|
||||||
- Chunked prefill + prefill/decode co-batching in the server slot scheduler.
|
|
||||||
|
|
||||||
### Phase 4 — Batched-GEMM kernel tuning (the decode 2.2× + prefill height)
|
|
||||||
Per `BLACKWELL_KERNEL_GAPS.md`: dense int8-MMQ at ~21% of ceiling, MoE FP4-MMA at ~5%. Both untuned for
|
|
||||||
Blackwell. To MATCH: tune MMQ or a Marlin-style W4A16 BF16 GEMM (FP4 not required — GB10 is INT8==BF16). To
|
|
||||||
BEAT (2×): fix+tune the existing FP4-MMA on sm_121 (build-flag/`-O3`-miscompile, not greenfield).
|
|
||||||
|
|
||||||
### Phase 5 — Backend GPU sampling
|
|
||||||
CPU per-sequence sampling caps GPU util ~60% beyond n_parallel ~8-16 (upstream PR #17004). Track/adopt.
|
|
||||||
|
|
||||||
### Cross-cutting — Speculative decoding (single-user speed, quality-preserving)
|
|
||||||
Dense ≥14B: lossless ~1.8-3×. llama.cpp has `-md`/`--spec-draft-*`. Wire a draft-model field in the model
|
|
||||||
config + ship Qwen3 target+draft (1.7B) pairs in the gallery. NOT for MoE-A3B (nothing to amortize).
|
|
||||||
|
|
||||||
## Sequencing rationale
|
|
||||||
Phase 1 (config) ships now — biggest immediate multi-user win for zero kernel work (concurrency was OFF).
|
|
||||||
Phase 2 (paged KV) is the highest-leverage structural build and has upstream momentum. Phases 3-4 are deeper
|
|
||||||
(scheduler + kernel). Spec-dec is independent and can land any time for single-user speed.
|
|
||||||
@@ -1,90 +0,0 @@
|
|||||||
# PR #17004 (backend / GPU sampling) evaluation on DGX Spark (GB10, sm_121)
|
|
||||||
|
|
||||||
Date: 2026-06-21. Hardware: NVIDIA GB10 (DGX Spark, sm_121), CUDA 13.0, cmake 3.28.
|
|
||||||
Model: `Qwen3-32B-Q4_K_M.gguf`. LocalAI pin: `LLAMA_VERSION=f3e182816421c648188b5eab269853bf1531d950` (2026-06-17).
|
|
||||||
|
|
||||||
## TL;DR (clean negative)
|
|
||||||
|
|
||||||
1. **PR #17004 is MERGED and is ALREADY present in our pinned llama.cpp `f3e1828`.** There is nothing to apply / cherry-pick / patch. The `-bs/--backend-sampling` CLI arg, the `llama_set_sampler` / `llama_get_sampled_*` API, and the GPU argsort/top-k/cumsum/softmax kernels are all in the pin.
|
|
||||||
2. **The prescribed benchmark cannot test the fix.** `llama-batched-bench` does ZERO sampling - it feeds random tokens (`std::rand() % n_vocab`). Its ~540 t/s plateau is therefore **not** sampling-bound, and enabling backend sampling does nothing to it. The valid tool is `llama-batched` (examples/batched), which the PR updated to drive per-sequence sampler chains and which actually exercises `-bs`.
|
|
||||||
3. **In a controlled real-sampling A/B (same `llama-batched` harness, CPU vs GPU sampler), GPU sampling gave only +25% at np=32, +3% at np=64, and CRASHED (`GGML_ASSERT(obj_new)`, graph-context alloc) at np=128 and np=256** - exactly the multi-user regime the investigation cares about.
|
|
||||||
4. **nsys at np=64: GPU kernel profile and GPU-busy time are essentially identical with and without the fix** (CPU 392.5 t/s / GPU 404.2 t/s; total GPU kernel+memop time ~4.05 s in both). Sampling kernels do not even appear among the top GPU contributors. GPU utilization did **not** rise.
|
|
||||||
5. **Conclusion: PR #17004, in the state shipped by our pin, does NOT break the ~540 plateau and does not move decode aggregate toward the ~2700 GPU-bound ceiling or past vLLM's 667.** It is modest at low parallelism and unusable (crash) at the high parallelism in question. The PR's own guidance ("recommended `--parallel 1`", "will take time to mature") matches what we measured.
|
|
||||||
|
|
||||||
## 1. What PR #17004 does + state
|
|
||||||
|
|
||||||
- Title: "sampling : add support for backend sampling". **State: MERGED** into `master` (PR head branch `gpu-sampling`). 44 files, +4133/-296.
|
|
||||||
- `libllama`: new `llama_context_params.samplers` / `n_samplers`, `llama_set_sampler`, `llama_get_sampled_*`, `llama_sampler_seq_config`, updated `llama_sampler_i`. Sampler chain can now run inside the compute graph on the backend (GPU) instead of on the CPU after `llama_decode`.
|
|
||||||
- CUDA: optimized/new `argsort`, `top-k`, `cumsum`, `softmax` kernels; CMake option `-DGGML_CUDA_CUB_3DOT2=ON` (builds a CCCL v3.2 prerelease for faster top-k).
|
|
||||||
- Tools: new `-bs, --backend-sampling` arg in `common/arg.cpp` (line 1921); server (`server-context.cpp`) per-slot wiring; `examples/batched/batched.cpp` updated.
|
|
||||||
- Supported backend samplers: `top-k`, `top-p`, `min-p`, `temp` (+ dist). **Limitations (from the PR): not compatible with grammar sampling; single output per sequence per batch; no save/load of sampling state; recommended only with `--parallel 1` and CUB_3DOT2.** Open follow-ups: #18547 (avoid graph reallocations), #18550 (skip inactive samplers in parallel decode).
|
|
||||||
- It DOES target the CPU-side per-sequence sampling stall we hypothesised - the mechanism is correct. Maturity is the problem.
|
|
||||||
|
|
||||||
Note: the GitHub API reports `mergedAt: 2026-01-04`, but the PR contains June 2026 upstream-merge commits and the feature is verified present in our 2026-06-17 pin, so treat the date field as a metadata quirk. What matters: the code is in `f3e1828`.
|
|
||||||
|
|
||||||
## 2/3. Apply + build
|
|
||||||
|
|
||||||
No apply needed (already in pin). Built from a clean `git worktree` at `f3e1828` (`~/llama-pr17004`), to avoid disturbing the existing diffusion build:
|
|
||||||
|
|
||||||
```
|
|
||||||
cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON \
|
|
||||||
-DCMAKE_CUDA_ARCHITECTURES=121 -DLLAMA_MAX_SEQ=256 \
|
|
||||||
-DGGML_CUDA_CUB_3DOT2=ON -DLLAMA_CURL=OFF
|
|
||||||
cmake --build build --target llama-batched llama-batched-bench -j20
|
|
||||||
```
|
|
||||||
|
|
||||||
**Build: SUCCESS** (CUB_3DOT2=ON FetchContent fetched and compiled despite flaky net; sm_121; LLAMA_MAX_SEQ=256). `-bs/--backend-sampling` confirmed present in `llama-batched --help`.
|
|
||||||
|
|
||||||
## 4. Decode aggregate: fix vs baseline vs vLLM
|
|
||||||
|
|
||||||
### 4a. `llama-batched-bench` (NO sampling - reconfirms the plateau, unaffected by the fix)
|
|
||||||
`-npp 16 -ntg 128 -npl 32,64,128,256 -c 40960 -b 2048 -ub 2048`
|
|
||||||
|
|
||||||
| npl | S_TG t/s |
|
|
||||||
|-----|----------|
|
|
||||||
| 32 | 241.8 |
|
|
||||||
| 64 | 395.1 |
|
|
||||||
| 128 | 542.6 |
|
|
||||||
| 256 | 567.2 |
|
|
||||||
|
|
||||||
Reproduces the ~540 plateau. Because this tool never samples, `-bs` is irrelevant here - the plateau is decode/host-overhead-bound, not sampling-bound.
|
|
||||||
|
|
||||||
### 4b. `llama-batched` real-sampling A/B (CPU sampler vs `-bs` GPU sampler, identical harness)
|
|
||||||
`-kvu -n 128 -np {32,64,128,256} -c 40960 --seed 1` (samplers: top-k 40 / top-p 0.95 / temp 0.8)
|
|
||||||
|
|
||||||
| np | CPU sampling t/s | GPU `-bs` sampling t/s | delta |
|
|
||||||
|-----|------------------|------------------------|-------|
|
|
||||||
| 32 | 174.1 | 217.5 | +25% |
|
|
||||||
| 64 | 390.5 | 403.4 | +3.3% |
|
|
||||||
| 128 | 497.9 | **CRASH** `GGML_ASSERT(obj_new) ggml.c:1768` | - |
|
|
||||||
| 256 | 396.7 | **CRASH** `GGML_ASSERT(obj_new) ggml.c:1768` | - |
|
|
||||||
|
|
||||||
(`llama-batched` absolute t/s is lower than `batched-bench` because it does real sampling plus per-token detokenize/string/stream work; the A/B *within* this harness isolates the sampler cost.)
|
|
||||||
|
|
||||||
**Does the fix break the plateau? No.** GPU sampling helps only at low parallelism and the gain shrinks as np rises (+25% -> +3%), then the path crashes at np>=128 - i.e. it fails in exactly the multi-user regime where the plateau matters. It does not approach the ~2700 ceiling and does not pass vLLM's 667. The CPU-sampling curve itself peaks at np=128 (498) and *drops* at np=256 (397), confirming CPU sampling is a scaling wall - but PR #17004 as shipped does not lift it because the GPU path is unstable there.
|
|
||||||
|
|
||||||
## 5. GPU-utilization mechanism (nsys, np=64, the highest np where `-bs` survives)
|
|
||||||
|
|
||||||
`nsys profile -t cuda ... -n 96 -np 64`
|
|
||||||
|
|
||||||
| mode | decode t/s | total GPU kernel+memop time | top GPU contributors |
|
|
||||||
|------|-----------|------------------------------|----------------------|
|
|
||||||
| CPU sampling | 392.5 | ~4.07 s | mul_mat_q (55%+17%), flash_attn (5.7%), mul_mat_vec (2%) |
|
|
||||||
| GPU `-bs` | 404.2 | ~4.04 s | identical set; sampling kernels not in top contributors |
|
|
||||||
|
|
||||||
GPU-busy time and the kernel mix are **essentially unchanged** between modes. The argsort/top-k/cumsum/softmax sampling kernels are negligible in the timeline; the only visible difference is H2D memcpy *instances* rising 1,495 -> 7,076 (pinned-memory sampler transfers) at ~unchanged total memcpy time. **GPU utilization did not rise.** This directly refutes the idea that, at this workload, GPU idle time is dominated by CPU sampler arithmetic - moving the sampler onto the GPU barely changed throughput (+3%) and did not raise GPU-busy time. The ~80% idle measured elsewhere is dominated by something other than the sampler math (likely host-side batch construction / synchronization / detokenize), which PR #17004 does not address.
|
|
||||||
|
|
||||||
(np=256 nsys "with fix" could not be captured: `-bs` aborts there. Fixing the crash needs the unmerged follow-ups #18547/#18550, not in our pin.)
|
|
||||||
|
|
||||||
## LocalAI adoption path
|
|
||||||
|
|
||||||
**The code arrives transparently with a version bump; enabling it is not transparent.**
|
|
||||||
|
|
||||||
- `backend/cpp/llama-cpp/prepare.sh` copies all of upstream `llama.cpp/tools/server/*` (including the #17004-modified `server-context.cpp` / `server-task.cpp` / `server-common.cpp`) into `tools/grpc-server/`, and `grpc-server.cpp` `#include`s them. So once `LLAMA_VERSION` points at a commit containing #17004 (our pin `f3e1828` already does), the backend-sampling machinery compiles into `grpc-server` automatically. **No vendored patch in `patches/` is required for the code.**
|
|
||||||
- The vendored `server-context.cpp` already does the per-slot wiring (around line 1615): `backend_sampling &= task.params.sampling.backend_sampling`, also disabled for speculative decode and for pre-sampling logits (`n_probs>0`), then `llama_set_sampler(ctx_tgt, slot.id, common_sampler_get(slot.smpl))`.
|
|
||||||
- **But it is OFF unless `task.params.sampling.backend_sampling == true`.** LocalAI's `grpc-server` builds `params` itself from the gRPC request and never sets this flag (and does not pass the upstream `--backend-sampling` CLI arg). So as-is, LocalAI compiles the feature but never uses it. **A small grpc-server change is needed**: read a LocalAI model option / env and set `params.sampling.backend_sampling = true` (global or per-request).
|
|
||||||
- For performant CUDA top-k, add `-DGGML_CUDA_CUB_3DOT2=ON` to the llama-cpp CUDA `CMAKE_ARGS` in the Makefile (optional; a non-CUB fallback exists).
|
|
||||||
- **Caveats that blunt the benefit for LocalAI specifically:** grammar-constrained requests (JSON-schema / tool calls - a large share of LocalAI traffic), `logprobs`/`n_probs>0`, and speculative decoding all fall back to CPU sampling by the gating above; and the GPU path crashes at np>=128 in this pin. So even after wiring the flag, the multi-user throughput case would not benefit (and would crash) until the follow-up PRs (#18547/#18550) land and stabilise high-parallelism backend sampling.
|
|
||||||
|
|
||||||
### Recommendation
|
|
||||||
Do **not** adopt PR #17004 as the multi-user throughput fix yet. It is already in the tree but is immature at the parallelism that matters (crashes at np>=128, modest gains below). The measured bottleneck at this workload is not the sampler arithmetic (nsys shows GPU-busy unchanged when sampling moves to GPU). Re-evaluate after #18547/#18550 merge into a future pin; revisit the host-side decode/batch-construction overhead as the more likely real lever.
|
|
||||||
@@ -1,136 +0,0 @@
|
|||||||
# Evaluation: llama.cpp PR #22569 (paged KV cache, `-kvp`) on DGX Spark (GB10, sm_121)
|
|
||||||
|
|
||||||
Question: is upstream draft PR #22569 the right base to give LocalAI vLLM-class
|
|
||||||
high-concurrency GPU throughput, or should we finish our own from-scratch P4
|
|
||||||
(`backend/cpp/llama-cpp/paged/`)?
|
|
||||||
|
|
||||||
Date: 2026-06-21. Hardware: NVIDIA GB10 (compute 12.1 / sm_121), 122502 MiB unified
|
|
||||||
memory, CUDA 13.0, gcc 13.3. Models: `Qwen3-32B-Q4_K_M.gguf` (18.4 GB, 64 layers,
|
|
||||||
n_head 64 / n_head_kv 8 / head_dim 128 / n_embd 5120) and `Qwen3-0.6B-Q8_0.gguf` for
|
|
||||||
the correctness gate.
|
|
||||||
|
|
||||||
## TL;DR verdict: DO NOT adopt #22569. Finish our own P4.
|
|
||||||
|
|
||||||
On GB10 with a 32B dense model, PR #22569 delivers **no throughput win and no concurrency
|
|
||||||
win** - it is ~12% *slower* than the existing contiguous path and hits the *same*
|
|
||||||
256-sequence ceiling. The "scale to thousands of sequences like vLLM" premise does not
|
|
||||||
hold for this PR or this hardware/model. On top of that it is broken out of the box,
|
|
||||||
wired to the wrong integration surface, and a contested draft.
|
|
||||||
|
|
||||||
## 1. Builds? Correct?
|
|
||||||
|
|
||||||
- **Builds: YES.** Cloned `matiaslin/llama.cpp@paged_attention` (PR #22569, single commit
|
|
||||||
`0b0f7bd...`, base = current master). Clean CUDA build for sm_121
|
|
||||||
(`-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=121 -DCMAKE_BUILD_TYPE=Release`).
|
|
||||||
`llama-paged`, `llama-batched-bench`, `test-paged-kv`, `test-paged-kv-e2e` all link.
|
|
||||||
It is self-contained (ships its own CPU+CUDA `ggml_paged_attn` op) and does **not**
|
|
||||||
depend on the competing CUDA PR #17579 (ericcurtin, `--pagedattention`).
|
|
||||||
|
|
||||||
- **Runs out of the box: NO.** `llama-paged -kvp` on Qwen3-32B *and* Qwen3-0.6B crashes
|
|
||||||
at context creation:
|
|
||||||
`build_attn(llm_graph_input_attn_kv_paged*) -> ggml_reshape_2d ->`
|
|
||||||
`GGML_ASSERT(ggml_nelements(a) == ne0*ne1)` (src/llama-graph.cpp:2556). Same crash with
|
|
||||||
`--fit off` (so it is the real graph, not just the memory probe).
|
|
||||||
**Root cause:** the paged path hardcodes `ggml_reshape_2d(cur, hparams.n_embd, ...)`,
|
|
||||||
wrong for any model where `n_head*head_dim != n_embd`. Qwen3 decouples head_dim:
|
|
||||||
32B = 64*128 = **8192** vs n_embd 5120; 0.6B = 16*128 = **2048** vs 1024. The PR's
|
|
||||||
"qwen3 verified" claim does **not** hold against current Qwen3 GGUFs. Fix is ~1 line
|
|
||||||
(use the real attention width `cur->ne[0]*cur->ne[1]`); applied for the rest of the eval.
|
|
||||||
|
|
||||||
- **`fit_params` (`-ngpub` auto-sizing) also crashed on GB10** in the same reshape path
|
|
||||||
during the device-memory probe (before the fix). After the reshape fix, paged
|
|
||||||
auto-fit works (sized 96624 GPU blocks on the 0.6B from 85 GiB free).
|
|
||||||
|
|
||||||
- **Correctness after the reshape fix:** paged decode runs and produces **coherent**
|
|
||||||
output on Qwen3-32B (sensible mercury / miso-soup / Starry-Night answers across 128 and
|
|
||||||
256 concurrent sequences), indicating the `ggml_paged_attn` op is functionally roughly
|
|
||||||
correct. PR's own greedy/top-K equivalence test (`test-paged-kv-e2e`, top-K argmax +
|
|
||||||
top-5 overlap >= 4 + first-4-token greedy match vs non-paged) on Qwen3-0.6B did
|
|
||||||
**not** reach a PASS/FAIL verdict on GB10: its paged auto-fit grabs ~88 GiB
|
|
||||||
(96531 blocks) and the run then stalls at cache init (a third GB10 fit-robustness
|
|
||||||
issue, distinct from the reshape bug). So the formal greedy-equivalence gate is
|
|
||||||
**unverified on this box**, but the qualitative evidence (coherent multi-sequence 32B
|
|
||||||
output with explicit small `-ngpub`) indicates the fixed op is roughly correct. This
|
|
||||||
does not change the verdict, which is decided by throughput below.
|
|
||||||
|
|
||||||
## 2. Throughput: paged vs contiguous on GB10 (Qwen3-32B-Q4_K_M)
|
|
||||||
|
|
||||||
Contiguous = `llama-batched-bench` (unified KV, continuous batching), S_TG decode tok/s.
|
|
||||||
Paged = `llama-paged -kvp --fit off` (its scheduler-driven continuous-batching loop),
|
|
||||||
`aggregate tps`. Both `npp~16, ntg/n_predict=128, n_batch=n_ubatch=2048, -ngl 99`.
|
|
||||||
|
|
||||||
| npl | contiguous (S_TG t/s) | paged `-kvp` (agg t/s) | outcome |
|
|
||||||
|------|----------------------|------------------------|---------|
|
|
||||||
| 128 | **537** (S 553) | **477** | both run; paged ~12% slower |
|
|
||||||
| 256 | **541** (S 550) | **471** | both run; paged ~13% slower; neither gains over 128 |
|
|
||||||
| 512 | FAIL | FAIL | **both** die: `n_seq_max must be <= 256` |
|
|
||||||
| 1024 | FAIL | FAIL | **both** die: `n_seq_max must be <= 256` |
|
|
||||||
|
|
||||||
### The decisive facts
|
|
||||||
|
|
||||||
1. **PR #22569 does NOT lift the 256-sequence ceiling.** Both contiguous and paged fail
|
|
||||||
identically at npl 512/1024 with `n_seq_max must be <= 256` (llama.cpp's compile-time
|
|
||||||
`LLAMA_MAX_SEQ`). It is **not** an OOM - GB10 has 119 GiB and at npl=256 contiguous KV
|
|
||||||
is only 16 GiB. Paging gives **zero** concurrency headroom over contiguous here. The
|
|
||||||
"paged unlocks thousands of seqs" premise is false for this PR.
|
|
||||||
|
|
||||||
2. **Paged is slower, not faster.** The fresh `ggml_paged_attn` op (477/471 t/s) loses to
|
|
||||||
the mature CUDA flash-attention contiguous path (537/541 t/s) by ~12-13% at equal
|
|
||||||
concurrency. The PR's A10G "2.5x" came entirely from contiguous OOMing at 26 seqs on a
|
|
||||||
24 GiB card; that lever does not exist on GB10's 119 GiB.
|
|
||||||
|
|
||||||
3. **The 32B dense model is compute-bound and plateaus by npl=128 on GB10.** Aggregate is
|
|
||||||
flat from 128->256 (contiguous 537->541; paged 477->471). Doubling concurrency buys
|
|
||||||
nothing because the GPU is already saturated on the 32B weight matmuls. Even if we
|
|
||||||
recompiled with a larger `LLAMA_MAX_SEQ`, aggregate would not climb - so vLLM-class
|
|
||||||
~24k aggregate is **unreachable for 32B-dense on a single GB10 regardless of KV
|
|
||||||
layout**. The throughput gap to vLLM at this model/hardware is a compute/bandwidth
|
|
||||||
problem, not a KV-fragmentation problem.
|
|
||||||
|
|
||||||
## 3. Verdict and reasoning: finish our own P4
|
|
||||||
|
|
||||||
**Do not adopt #22569 as the base.** Reasons:
|
|
||||||
|
|
||||||
- **No win on target hardware.** Even fully completed, on GB10 + 32B it is slower than
|
|
||||||
what we already have and capped at the same 256 seqs. There is no throughput or
|
|
||||||
concurrency dividend to harvest here.
|
|
||||||
- **Wrong integration surface.** Paged is driven only by a brand-new parallel C API
|
|
||||||
(`llama_paged_scheduler_init/add_request/prepare_batch/get_batch_info/update/...`) and a
|
|
||||||
bespoke `examples/paged` loop. `-kvp`/`--kv-paged` is gated to `LLAMA_EXAMPLE_PAGED`
|
|
||||||
only - it is NOT wired into `llama-server`/`batched-bench`/`parallel`, i.e. NOT the path
|
|
||||||
LocalAI's grpc-server derives from. Adopting it means rewriting LocalAI's serving loop
|
|
||||||
around the new scheduler API.
|
|
||||||
- **Broken / restricted.** Crashes out of the box on all current Qwen3 (and any
|
|
||||||
decoupled-head-dim model); fit_params crashed; Phase-1 restrictions enforced at context
|
|
||||||
creation: single CUDA device, full offload only, `n_batch == n_ubatch`, no SWA
|
|
||||||
(gemma3/llama4/etc. unsupported), no CoW / prefix-caching, no
|
|
||||||
`seq_cp`/`seq_keep`/`seq_div`/`seq_add`, no state save/load.
|
|
||||||
- **Contested draft.** Unmerged; the author is openly asking maintainers whether the C
|
|
||||||
API is even the right design; maintainers are skeptical of paged for single-node use.
|
|
||||||
|
|
||||||
**What P4 should actually target (re-scoped by this data).** The aggregate-throughput
|
|
||||||
gap to vLLM on a compute-bound dense model on one GB10 is not addressable by paged KV.
|
|
||||||
The durable, real LocalAI wins from paging are the ones our from-scratch P0 already
|
|
||||||
implements the machinery for and that #22569 explicitly omits:
|
|
||||||
- **on-demand KV sizing** (fit more *diverse* concurrent tenants without per-seq
|
|
||||||
over-reservation), and
|
|
||||||
- **automatic cross-tenant prefix sharing** (chained-hash block cache - shared system
|
|
||||||
prompts / RAG preambles), which #22569 defers to a non-existent Phase 2.
|
|
||||||
|
|
||||||
Finish our own P4 (CPU gather-read + a CUDA gather-read) against these capacity/
|
|
||||||
prefix-sharing objectives - measured as max concurrent *distinct* tenants and KV memory
|
|
||||||
saved, not single-model aggregate tok/s. To chase raw aggregate, the levers are lifting
|
|
||||||
`LLAMA_MAX_SEQ` and smaller/MoE models in memory-bandwidth-bound regimes - orthogonal to
|
|
||||||
paged attention. The ~1-line reshape fix found here (and a report of the GB10 fit_params crash) are
|
|
||||||
worth upstreaming to #22569 regardless, but the PR is not our base.
|
|
||||||
|
|
||||||
### Reproduction (DGX, `~/llama.cpp-pr22569`)
|
|
||||||
```sh
|
|
||||||
export PATH=/usr/local/cuda/bin:$PATH
|
|
||||||
# contiguous
|
|
||||||
./build/bin/llama-batched-bench -m Qwen3-32B-Q4_K_M.gguf -ngl 99 -npp 16 -ntg 128 \
|
|
||||||
-npl 128 -c 20480 -b 2048 -ub 2048 # 256 also runs; 512/1024 -> n_seq_max must be <= 256
|
|
||||||
# paged (needs the src/llama-graph.cpp:2556 reshape fix: hparams.n_embd -> cur->ne[0]*cur->ne[1])
|
|
||||||
./build/bin/llama-paged -m Qwen3-32B-Q4_K_M.gguf -kvp --fit off -ngpub 2048 -ncpub 128 \
|
|
||||||
-np 128 -ns 128 -n 128 -b 2048 -ub 2048 -ngl 99 # 512/1024 -> n_seq_max must be <= 256
|
|
||||||
```
|
|
||||||
@@ -1,95 +0,0 @@
|
|||||||
# Paged Attention for llama.cpp (vLLM-parity), CPU-first
|
|
||||||
|
|
||||||
A from-scratch port of vLLM V1's paged KV-cache model into the llama.cpp / ggml
|
|
||||||
world, built CPU-first and verified incrementally. The host-side block manager is
|
|
||||||
a faithful port of vLLM; the compute stays in ggml (no new op — the read path
|
|
||||||
gathers blocks with `ggml_get_rows` and feeds the existing attention ops).
|
|
||||||
|
|
||||||
Design: `docs/superpowers/specs/2026-06-19-paged-attention-llamacpp-design.md`
|
|
||||||
Plan: `docs/superpowers/plans/2026-06-19-paged-attention-llamacpp.md`
|
|
||||||
|
|
||||||
## Status
|
|
||||||
|
|
||||||
| Phase | What | State |
|
|
||||||
|------|------|-------|
|
|
||||||
| P0 | vLLM-parity host block manager (`FreeBlockQueue`, `BlockPool`, `PagedKVManager`, chained-hash prefix cache) | ✅ verified — `make check`, 4/4 suites |
|
|
||||||
| P1 | ggml paged write/gather mechanism (`set_rows` by slot_mapping → `get_rows` gather) | ✅ verified — `make ggml-check`, non-contiguous blocks `[2,1,5]` round-trip + isolation |
|
|
||||||
| P2 (core) | attention over gathered paged KV matches independent host reference | ✅ verified — max abs err **7.5e-08** |
|
|
||||||
| P3 (partial) | capacity & prefix-sharing wins | ✅ measured — `make bench`: **9.2×** more concurrent seqs, **11.3×** less KV memory |
|
|
||||||
| **P3 (in-model placement)** | **paged, non-contiguous block KV placement in the real model** | ✅ **Gate 0 PASSED** — Qwen3-0.6B token-identical (`patches/0001-paged-kv-block-placement.patch`) |
|
|
||||||
| P4 (in-model compute) | gather-read (`build_attn_paged`, read only a seq's blocks) + win-2 throughput + multi-seq | ⛔ remaining |
|
|
||||||
|
|
||||||
The design's central risk — *does paged (non-contiguous) KV produce correct attention?* —
|
|
||||||
is **retired at two levels**: (1) at the ggml-op level (P2, 7.5e-08 vs reference) and
|
|
||||||
(2) **in a real model** (P3): with KV physically scattered across permuted, non-contiguous
|
|
||||||
blocks (cells `0-15, 144-159, 32-47, …`), Qwen3-0.6B greedy generation is **token-for-token
|
|
||||||
identical** to the contiguous cache. Reproduce:
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# from backend/cpp/llama-cpp-fallback-build/llama.cpp (patch applied, CPU build)
|
|
||||||
B=build-cpu/bin/llama-simple; M=<Qwen3-0.6B.Q4_K_M.gguf>; P="...long prompt..."
|
|
||||||
"$B" -m "$M" -n 40 "$P" > base.txt
|
|
||||||
LLAMA_KV_PAGED=1 "$B" -m "$M" -n 40 "$P" > paged.txt
|
|
||||||
diff base.txt paged.txt && echo TOKEN-IDENTICAL
|
|
||||||
# LLAMA_KV_PAGED_DEBUG=1 prints the permuted physical cells per step
|
|
||||||
```
|
|
||||||
|
|
||||||
This proves the **storage/placement** layer of paged attention in-model. What remains (P4)
|
|
||||||
is the **compute** optimization that yields the throughput win: a gather-read that attends
|
|
||||||
only a sequence's own blocks (instead of scanning `[0,n_kv)` with a mask), plus the
|
|
||||||
multi-sequence driver to measure tok/s vs concurrency. The patch is single-sequence scope.
|
|
||||||
|
|
||||||
## Build & test
|
|
||||||
|
|
||||||
```sh
|
|
||||||
make check # P0 host-manager unit suites (pure C++, no deps)
|
|
||||||
make ggml-check GGML_SRC=<llama.cpp>/ggml GGML_BUILD=<ggml-build> # P1/P2 ggml tests
|
|
||||||
make bench # P3 capacity + prefix-sharing numbers
|
|
||||||
```
|
|
||||||
|
|
||||||
`ggml-check` needs a built ggml. To build one CPU-only from a llama.cpp checkout:
|
|
||||||
`cmake -S <llama.cpp>/ggml -B /tmp/ggml-build -DGGML_CUDA=OFF -DCMAKE_BUILD_TYPE=Release && cmake --build /tmp/ggml-build -j`
|
|
||||||
(if it complains about a missing `ggml.pc.in`, add a minimal pkg-config stub).
|
|
||||||
|
|
||||||
## Files
|
|
||||||
|
|
||||||
- `paged_kv_manager.{h,cpp}` — the vLLM-parity block manager (no ggml/llama dep).
|
|
||||||
- `tests/test_free_block_queue.cpp` — intrusive LRU free list.
|
|
||||||
- `tests/test_block_pool.cpp` — alloc/touch/free/evict/cache.
|
|
||||||
- `tests/test_paged_kv_manager.cpp` — allocate/block_table/slot_mapping/free.
|
|
||||||
- `tests/test_prefix_cache.cpp` — chained block hashing + first-miss cache hit.
|
|
||||||
- `tests/test_ggml_paged_rw.cpp` — paged write/gather through real ggml ops.
|
|
||||||
- `tests/test_ggml_paged_attn.cpp` — attention over paged KV vs host reference.
|
|
||||||
- `paged-bench.cpp` — capacity (win 1) + prefix-sharing (win 3) measurements.
|
|
||||||
|
|
||||||
## Remaining work — integration map (for the next session)
|
|
||||||
|
|
||||||
Target: a paged read path active behind a flag, producing **token-identical** greedy
|
|
||||||
output vs the contiguous cache on a real model (Gate 0), then `paged-bench` win 2.
|
|
||||||
|
|
||||||
Exact seams in the vendored llama.cpp (`backend/cpp/llama-cpp-fallback-build/llama.cpp`,
|
|
||||||
the pinned build fetches `LLAMA_VERSION=f3e182816421…`):
|
|
||||||
|
|
||||||
1. **Memory type** — `src/llama-model.cpp:2070` `create_memory()` constructs `llama_kv_cache`.
|
|
||||||
Add a paged variant (or a flag on the existing cache) implementing `llama_memory_i`
|
|
||||||
(`src/llama-memory.h`), backed by `PagedKVManager`.
|
|
||||||
2. **Allocation** — `src/llama-kv-cache.cpp:818` `find_slot()` produces `slot_info.idxs`.
|
|
||||||
Replace the ring-buffer scan with block-aligned allocation from `PagedKVManager`.
|
|
||||||
3. **Read path** — `src/llama-kv-cache.cpp:1145/1165` `get_k`/`get_v` return a contiguous
|
|
||||||
`[0,n_kv)` view. For paged, gather the sequence's blocks (`ggml_get_rows`) into scratch.
|
|
||||||
The new branch lives alongside `build_attn` in `src/llama-graph.cpp` (`build_attn_mha`).
|
|
||||||
4. **Mask** — `src/llama-graph.cpp` `build_attn_inp_kq_mask` sizes the mask to the gathered
|
|
||||||
length per sequence.
|
|
||||||
5. **Gate 0 driver** — `build-cpu/bin/llama-simple` (greedy argmax) on
|
|
||||||
`Qwen3-0.6B.Q4_K_M.gguf`; assert paged output == contiguous output token-for-token.
|
|
||||||
|
|
||||||
### Honest caveats (from the maintainer discussion + reading `find_slot`)
|
|
||||||
|
|
||||||
- llama.cpp's **unified cache already shares one KV pool** across sequences and already
|
|
||||||
tolerates non-contiguous slots. So win-1 vs *unified* is smaller than vs per-seq
|
|
||||||
reservation (stream mode). The durable LocalAI wins are **on-demand sizing** and
|
|
||||||
**automatic cross-tenant prefix sharing** (P0 implements the block-hash machinery).
|
|
||||||
- vLLM's classic `paged_attention_v1/v2` CUDA kernel is **deprecated**; the live path is
|
|
||||||
FlashAttention/FlashInfer over a block table. The port targets that pattern, not the
|
|
||||||
old kernel. Upstream draft PRs #22569 (new `ggml_paged_attn` op) and #17579 (CUDA) are
|
|
||||||
unmerged; maintainers are skeptical for single-user use.
|
|
||||||
@@ -1,78 +0,0 @@
|
|||||||
# Upstream ggml issue draft: MXFP4 MoE prefill underutilizes Blackwell (GB10) — ~22 TFLOP/s, ~27× behind vLLM
|
|
||||||
|
|
||||||
**Title:** CUDA: MXFP4 MoE prefill runs the Ampere-class warp `mma.sync`, far below Blackwell FP4 peak (GB10 / sm_121)
|
|
||||||
|
|
||||||
## Summary
|
|
||||||
|
|
||||||
On a GB10 (DGX Spark, sm_121), MXFP4 MoE prefill for Qwen3-Coder-30B-A3B is bottlenecked by
|
|
||||||
`mul_mat_q<MXFP4>` (the per-expert grouped MMQ), which runs at only **~22 effective TFLOP/s** — a small
|
|
||||||
fraction of the GPU's FP4 capability. Batched prefill plateaus at ~3.65k tok/s (B=32) vs vLLM FP8 ~99k
|
|
||||||
on the same box (~27×). The native FP4 block-scaled `mma.sync` path (PR #17906 et al.) *is* engaged — the
|
|
||||||
limit is that it's a warp-level MMA kernel, not a tcgen05/CUTLASS-class grouped GEMM.
|
|
||||||
|
|
||||||
## Hardware / build
|
|
||||||
|
|
||||||
- NVIDIA GB10, compute capability 12.1, 119 GiB unified LPDDR5X.
|
|
||||||
- llama.cpp built `-DCMAKE_CUDA_ARCHITECTURES=121` (sm_121a/compute_121a confirmed in cubins).
|
|
||||||
- Model: Qwen3-Coder-30B-A3B-Instruct, `MXFP4_MOE` (15.9 GiB, 4.47 BPW).
|
|
||||||
|
|
||||||
## Measurements
|
|
||||||
|
|
||||||
Single-stream (`llama-bench`, ub2048):
|
|
||||||
|
|
||||||
| metric | Q8_0 | MXFP4 | vLLM FP8 |
|
|
||||||
|---|---|---|---|
|
|
||||||
| prefill pp2048 | ~2200 | 3441 | — |
|
|
||||||
| decode tg128 | 62 | 86 | 52 |
|
|
||||||
|
|
||||||
Batched (decode-phase aggregate `S_TG`; prefill aggregate `S_PP`):
|
|
||||||
|
|
||||||
| B | llama MXFP4 prefill | vLLM FP8 prefill | llama MXFP4 decode | vLLM FP8 decode |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| 1 | 1625 | 9644 | 83 | 48 |
|
|
||||||
| 8 | 3634 | 33373 | 267 | 312 |
|
|
||||||
| 32 | 3651 | 99398 | 551 | 1171 |
|
|
||||||
| 64 | 3648 | 151990 | 770 | 2064 |
|
|
||||||
|
|
||||||
Decode is competitive (we win at B=1). **Prefill plateaus and is the gap.**
|
|
||||||
|
|
||||||
## Profiling (nsys, MXFP4 pp2048 kernel time)
|
|
||||||
|
|
||||||
| kernel | % |
|
|
||||||
|---|---|
|
|
||||||
| `mul_mat_q<(ggml_type)39>` (MXFP4 MoE GEMM) | **37.2** |
|
|
||||||
| `mul_mat_q<(ggml_type)8>` (dense/attn, still Q8) | 10.1 |
|
|
||||||
| `flash_attn_ext_f16` | 8.8 |
|
|
||||||
| `quantize_mmq_mxfp4` (activation quant) | 8.0 |
|
|
||||||
|
|
||||||
Only cutlass kernel present is `cutlass_80_tensorop` (Ampere). No tcgen05 / wgmma anywhere.
|
|
||||||
|
|
||||||
## What we ruled out (so it's the kernel, not config)
|
|
||||||
|
|
||||||
- **ubatch**: saturates at 2048 (pp4096: ub512 2994 → ub2048 3316 → ub8192 3180).
|
|
||||||
- **tile width**: `mmq_x` already selects the full 128-wide tile at ub2048 (~128 tokens/expert).
|
|
||||||
- **cuBLAS fallback**: `GGML_CUDA_FORCE_CUBLAS` is a no-op (3419 ↔ 3423 t/s) — dequant→cuBLAS-FP16 neither
|
|
||||||
helps nor hurts, i.e. the FP4 MMQ kernel isn't worse than FP16 cuBLAS, both hit a common ceiling.
|
|
||||||
- prefill does **not** scale with bigger single prompts (attention O(N²) confounds): pp2048 3295, pp8192
|
|
||||||
1524, pp16384 2051 — so it's the many-sequence batched MoE GEMM, not batch size.
|
|
||||||
|
|
||||||
## Proposal
|
|
||||||
|
|
||||||
A tcgen05 / CUTLASS-3.x grouped-GEMM path for FP4 (MXFP4 + NVFP4) MoE on sm_120/121:
|
|
||||||
- One grouped GEMM over all experts with per-group token offsets (full tiles regardless of tokens/expert),
|
|
||||||
vs today's per-expert MMQ scheduler.
|
|
||||||
- Block-scaled `e2m1` operands via tcgen05 tensor-memory MMA (`mma.sync.aligned.kind::mxf4…` is the
|
|
||||||
warp-level form; the collective-mainloop/tcgen05 form is what extracts Blackwell throughput at prefill
|
|
||||||
tile sizes).
|
|
||||||
- Fuse activation quantization (`quantize_mmq_mxfp4`, ~8%) into the permute/gather.
|
|
||||||
- Optionally extend to dense layers (qkv/o_proj/lm_head) so full-model prefill is FP4/FP8.
|
|
||||||
|
|
||||||
This mirrors what vLLM/FlashInfer/TensorRT-LLM do for Blackwell MoE. Happy to test iterations on the GB10.
|
|
||||||
|
|
||||||
## Repro
|
|
||||||
|
|
||||||
```sh
|
|
||||||
llama-quantize qwen3coder-f16.gguf qwen3coder-mxfp4.gguf MXFP4_MOE
|
|
||||||
llama-bench -m qwen3coder-mxfp4.gguf -ngl 99 -p 2048 -n 0 -ub 2048
|
|
||||||
llama-batched-bench -m qwen3coder-mxfp4.gguf -ngl 99 -c 45056 -b 2048 -ub 2048 -npp 512 -ntg 128 -npl 1,8,32,64
|
|
||||||
```
|
|
||||||
@@ -1,83 +0,0 @@
|
|||||||
# What makes vLLM fast on GB10 — kernel vs scheduler (code-grounded, measured)
|
|
||||||
|
|
||||||
Decisive analysis (vLLM v0.23.0, torch 2.11+cu130, sm_121, model `RedHatAI/Qwen3-32B-NVFP4A16`, source at tag
|
|
||||||
`v0.23.0`). **Answer: it's the scheduler, not the kernel.** This closes the kernel track and opens the
|
|
||||||
scheduler track.
|
|
||||||
|
|
||||||
## The decomposition (measured on the DGX, prefix-cache OFF, unique prompts)
|
|
||||||
|
|
||||||
| | vLLM W4A16 Marlin | llama.cpp | verdict |
|
|
||||||
|---|---|---|---|
|
|
||||||
| **single-stream prefill** | ~800 t/s (~52 TFLOPS) | 718 MMQ / **1153 MXFP4** | **tied; llama.cpp MXFP4 wins** |
|
|
||||||
| decode batch-1 | 11.8 t/s | ~similar | bandwidth-bound (≈190/273 GB/s); no kernel helps |
|
|
||||||
| **aggregate decode** | 328 (N32) / 569 (N64) / **667 (N128)** | the gap | **~56× multiplier = scheduler** |
|
|
||||||
|
|
||||||
vLLM's single-stream Marlin is **not** at the roofline — it's in the same ~4×-under regime as MMQ. The 24k
|
|
||||||
headline is entirely the aggregate decode multiplier.
|
|
||||||
|
|
||||||
## The kernel vLLM actually runs on sm_121 (W4A16, forced)
|
|
||||||
|
|
||||||
Dispatch (vLLM v0.23.0): `compressed_tensors.py:704` (NVFP4 + no input-quant → `W4A4Fp4(use_a16=True)`) →
|
|
||||||
`compressed_tensors_w4a4_nvfp4.py:28` → `kernels/linear/__init__.py:894` (`if use_a16: force_kernel =
|
|
||||||
MarlinNvFp4LinearKernel`, **unconditional, no cc gate**) → `nvfp4/marlin.py` → `marlin_utils_fp4.py:182`
|
|
||||||
`ops.marlin_gemm(b_q_type=float4_e2m1f)`, activations FP16/BF16. csrc: `csrc/quantization/marlin/marlin.cu`
|
|
||||||
+ `marlin_template.h` + `marlin.cuh`.
|
|
||||||
|
|
||||||
Techniques = **exactly the playbook we proved loses on GB10**: XOR shared swizzle (`marlin_template.h:722
|
|
||||||
^ (row%8)`), 4-stage cp.async pipeline (`marlin.cu:396 stages=4`, `cp_async_wait<stages-2>`), ldmatrix+mma,
|
|
||||||
FP16/BF16 acts. Native FP4 (`FlashInferB12xNvFp4LinearKernel`) needs `Sm120BlockScaledDenseGemm` cubins absent
|
|
||||||
on GB10 → W4A4 hangs → forced W4A16 Marlin fallback. **Nothing to port; vLLM's kernel is occupancy-blocked too.**
|
|
||||||
|
|
||||||
## The scheduler (the real multiplier) — what llama.cpp lacks
|
|
||||||
|
|
||||||
- **Paged KV cache** (`vllm/v1/core/kv_cache_manager.py`, `block_pool.py`): block KV, no fragmentation → very
|
|
||||||
high concurrent batch. **llama.cpp: NO** (contiguous per-slot KV → fragmentation caps real concurrency).
|
|
||||||
- **Chunked prefill** (`config/scheduler.py:84 enable_chunked_prefill=True`, default ON): interleaves prefill
|
|
||||||
chunks with decode so decode batches stay full. **llama.cpp: NO** (a long prefill stalls the decode batch).
|
|
||||||
- **Continuous batching** (`v1/core/sched/scheduler.py`): per-step admit/evict. **llama.cpp: YES** (`n_parallel`,
|
|
||||||
rudimentary — we enabled VRAM-scaled slots in #10411).
|
|
||||||
|
|
||||||
## Sizing the scheduler gap — MEASURED (llama.cpp aggregate, the surprise)
|
|
||||||
|
|
||||||
`llama-batched-bench` Qwen3-32B-Q4_K_M, npp=128 ntg=128, npl scaling (DGX):
|
|
||||||
|
|
||||||
| npl | S_PP (agg prefill) | **S_TG (agg decode)** | vLLM decode | llama % of vLLM |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| 1 | 628 | 10.2 | 11.8 | 86% |
|
|
||||||
| 8 | 773 | 59.8 | - | - |
|
|
||||||
| 32 | 763 | **235** | **328** | **72%** |
|
|
||||||
| 64 | 761 | **391** | **569** | **69%** |
|
|
||||||
| 128 | 762 | **540** | **667** | **81%** |
|
|
||||||
|
|
||||||
**The "30x gap" headline is wrong for realistic concurrency.** llama.cpp's continuous batching already
|
|
||||||
captures **~70-81% of vLLM's aggregate decode** at npl<=128, with a near-identical multiplier (10.2 -> 540 =
|
|
||||||
**53x**, vs vLLM's 56x). And it is still climbing linearly at 128 (not plateaued). Combined with llama.cpp being
|
|
||||||
*ahead* single-stream (MXFP4 1153 > vLLM 800), **llama.cpp is already broadly competitive with vLLM on GB10 at
|
|
||||||
self-hosted concurrency.**
|
|
||||||
|
|
||||||
Two real findings remain:
|
|
||||||
1. **Aggregate prefill is flat ~760** regardless of npl - but that is the **GB10 compute roofline** (vLLM single-
|
|
||||||
stream is ~800; neither can prefill faster aggregate, it is compute-bound). So prefill is **not a throughput
|
|
||||||
gap**; chunked prefill is a **latency/TTFT** win (stop a long prefill stalling the decode batch), not a
|
|
||||||
throughput one.
|
|
||||||
2. **vLLM's ~24k headline lives at thousands-of-sequences concurrency**, which **paged KV** unlocks (block KV,
|
|
||||||
no fragmentation). llama.cpp's contiguous KV caps how far npl can scale before memory/fragmentation bite. So
|
|
||||||
paged KV is the **high-concurrency (datacenter) lever**, not a moderate-concurrency one.
|
|
||||||
|
|
||||||
## Recommendation
|
|
||||||
|
|
||||||
**Pivot to the scheduler; treat the GEMM kernel as good-enough / roofline-blocked on GB10.**
|
|
||||||
Now that the gap is measured, ROI-ordered:
|
|
||||||
1. **Ship the MXFP4-dense win** — 1153 t/s single-stream beats vLLM's 800; a Blackwell dense-quant
|
|
||||||
recommendation (requantize, no kernel work). Already documented in `BLACKWELL_KERNEL_GAPS.md` §6. Cheapest.
|
|
||||||
2. **Chunked prefill** — the tractable scheduler win: interleave prefill chunks with decode so a long prompt
|
|
||||||
doesn't stall the decode batch. Payoff is **latency/TTFT under mixed load** (and steadier decode batches),
|
|
||||||
not aggregate prefill throughput (that's GB10-compute-capped at ~760-800 for both engines). A grpc-server
|
|
||||||
scheduler change; no KV-layout rewrite.
|
|
||||||
3. **Paged KV** — the **high-concurrency (thousands-of-seqs) lever** that unlocks vLLM's 24k regime. Heavy
|
|
||||||
(block KV manager; contested upstream PR #22569 / vendored `patches/`). Worth it only if datacenter-scale
|
|
||||||
concurrency is a target; at self-hosted concurrency (npl<=128) llama.cpp is already ~70-81% of vLLM.
|
|
||||||
|
|
||||||
**Reframed expectation:** llama.cpp on GB10 is NOT 30x behind vLLM. It is ahead single-stream (MXFP4) and
|
|
||||||
~70-81% of vLLM aggregate at npl<=128. The genuine differentiator vLLM still has is **scaling to very high
|
|
||||||
concurrency via paged KV**. Kernel tracks (W4A16 178 t/s; FP4-MMA) stay **banked** - not the lever.
|
|
||||||
@@ -1,59 +0,0 @@
|
|||||||
# Where vLLM beats llama.cpp on a DGX Spark (GB10), and how to close it — keeping quality
|
|
||||||
|
|
||||||
The question: "vLLM is faster at the end — what do we improve, while keeping good quality?" Answer: the
|
|
||||||
gap is **three independent things**, and the biggest *per-user, quality-preserving* one is **speculative
|
|
||||||
decoding**, which llama.cpp already supports.
|
|
||||||
|
|
||||||
## Decomposition (measured + researched)
|
|
||||||
|
|
||||||
| vLLM advantage | helps single user? | llama.cpp answer | quality cost | status |
|
|
||||||
|---|---|---|---|---|
|
|
||||||
| **Per-user decode speed** | **yes** | **speculative decoding** (Qwen3 draft / EAGLE3) | **none** (target-verified, lossless) | mature in llama.cpp; **the main lever** |
|
|
||||||
| Prefill / TTFT | no (it's first-token latency) | tune FP4-MMA / Marlin W4A16 kernel | none | hard; `BLACKWELL_KERNEL_GAPS.md` |
|
|
||||||
| Aggregate throughput @ concurrency | no (per-user = 0) | continuous batching (paged engine) | none | also kernel-bound |
|
|
||||||
|
|
||||||
Key measured fact: **single-user decode is already at parity** (Qwen3-32B: llama 10.2 vs vLLM 11.7 t/s) —
|
|
||||||
both hit GB10's ~273 GB/s bandwidth wall (~15 t/s ceiling) **without** spec-dec. So vLLM's real per-user
|
|
||||||
speed edge is spec-dec, not architecture.
|
|
||||||
|
|
||||||
## Why spec-dec is THE lever here (and quality-safe)
|
|
||||||
|
|
||||||
- **Lossless:** the 32B target verifies every drafted token (accept/reject) — output distribution is
|
|
||||||
identical to no-drafting. So you keep **Q4_K_M quality** (no lossy MXFP4 needed) *and* get speed.
|
|
||||||
- **GB10 is best-case for it:** decode is bandwidth-bound (one ~17 GB weight-read per token) with huge idle
|
|
||||||
compute. Spec-dec verifies K drafted tokens in **one** weight-read → converts the loop to compute-bound,
|
|
||||||
where GB10 has headroom. Realized speedup ≈ mean accepted length.
|
|
||||||
- **Measured (others, same model class):** llama.cpp Qwen2.5-32B dense + 0.5B draft = **2.9×** (13→38 t/s);
|
|
||||||
vLLM EAGLE3 on Qwen3-32B = ~1.8–2.5× general, up to ~3× code/structured. **Competitive.**
|
|
||||||
- **Regime caveat:** spec-dec gives **~nothing for MoE-A3B** models (only ~3B active → not bandwidth-bound,
|
|
||||||
nothing to amortize). It shines for **dense** 27–32B — the opposite regime. So this lever is *dense-model*
|
|
||||||
specific.
|
|
||||||
|
|
||||||
## Qwen3-32B specifics
|
|
||||||
|
|
||||||
- **No native MTP head** (MTP is a Qwen3-*Next*/MoE feature). Options: a **same-family draft**
|
|
||||||
(Qwen3-0.6B or **1.7B** — same tokenizer, llama.cpp vocab check passes) or an external **EAGLE3 head**
|
|
||||||
(RedHatAI/AngelSlim Qwen3-32B-eagle3, accept length 2.15–2.49).
|
|
||||||
- Draft pick: **lean Qwen3-1.7B** (0.6B had ~60% lower acceptance in AWS's test; on a bandwidth-bound box the
|
|
||||||
32B weight-read dwarfs the draft cost, so maximize acceptance). `--spec-draft-n-max 5–8`.
|
|
||||||
|
|
||||||
## Recommended LocalAI actions (quality-preserving, ranked)
|
|
||||||
|
|
||||||
1. **Make speculative decoding easy/recommended for dense ≥14B models on Blackwell** — a draft-model field in
|
|
||||||
the model config (`-md` / `--spec-draft-*`), with a suggested Qwen3-1.7B draft for the Qwen3 family. This
|
|
||||||
is the biggest per-user speed win, lossless, available **now** (no kernel). Gallery: ship target+draft pairs.
|
|
||||||
2. Kernel work (FP4-MMA tuning / Marlin W4A16) — improves **prefill/TTFT**, separate metric.
|
|
||||||
3. Continuous batching (paged engine) — **aggregate** concurrency only; per-user = 0.
|
|
||||||
|
|
||||||
## Honesty / status
|
|
||||||
|
|
||||||
The research conclusion is solid (sources below). **Our own empirical spec-dec run on the DGX is pending** —
|
|
||||||
the box rebooted mid-session and `llama-cli` now hangs at 0% GPU (while `llama-bench` works), plus the network
|
|
||||||
is dropping ssh mid-command. Drafts (Qwen3-0.6B/1.7B Q8) are downloaded and the spec-dec flags are confirmed;
|
|
||||||
re-run `llama-cli -m Qwen3-32B-Q4_K_M -md Qwen3-1.7B-Q8_0 -ngl 99 -ngld 99 --spec-draft-n-max 8` when the box
|
|
||||||
is stable to confirm the ~2× locally. The conclusion does not depend on it (it has been measured and reproduced by
|
|
||||||
others on this exact model class), but we should bank our own number.
|
|
||||||
|
|
||||||
Sources: llama.cpp Discussion #10466 (Qwen2.5-32B+0.5B = 2.9×), #16578 (DGX Spark), DandinPower/llama.cpp_bench
|
|
||||||
(32B = 10.7 t/s, bandwidth-bound); vLLM MTP docs + Red Hat EAGLE3 article (lossless, up to 2.5×); AWS spec-dec
|
|
||||||
blog (Qwen3-32B+1.7B up to 3×, 0.6B ~60% lower accept); RedHatAI/AngelSlim Qwen3-32B-eagle3 heads.
|
|
||||||
@@ -1,176 +0,0 @@
|
|||||||
# W4A16 Marlin-style GEMM for ggml-cuda on Blackwell (sm_120/121) — implementation plan
|
|
||||||
|
|
||||||
> **STOPPED (2026-06-21): the kernel is NOT the lever — validated by a code-grounded vLLM analysis.**
|
|
||||||
> Measured on the DGX: vLLM's single-stream W4A16 prefill on GB10 = **~800 t/s (~52 TFLOPS), statistically TIED
|
|
||||||
> with llama.cpp MMQ (718/47)** — and vLLM uses the *exact* XOR-swizzle + 4-stage cp.async Marlin we proved
|
|
||||||
> collapses GB10 occupancy (vLLM even warns at load that Marlin "may degrade performance for compute-heavy
|
|
||||||
> workloads"). There is no kernel trick to port. Moreover llama.cpp's **MXFP4 path (1153 t/s) already BEATS
|
|
||||||
> vLLM single-stream (800)** — vLLM has no FP4 cubins on sm_121 and falls back to slower W4A16 Marlin, so
|
|
||||||
> llama.cpp is *ahead* on the kernel. **vLLM's entire 24k headline is the aggregate decode multiplier (~56×)
|
|
||||||
> from paged KV + chunked prefill + continuous batching — a SCHEDULER win.** llama.cpp lacks paged KV +
|
|
||||||
> chunked prefill. **Effort pivots to the scheduler** (see the paged-attention work). This kernel work is
|
|
||||||
> banked + resumable (178 t/s, P0/P1/P2/P3/P3b committed) but is not the throughput lever on GB10. Detail:
|
|
||||||
> `VLLM_DECOMPOSITION.md`.
|
|
||||||
|
|
||||||
The committed multi-week kernel. Goal: get 4-bit-weight dense matmul to the GB10 **BF16 ceiling (~213
|
|
||||||
TFLOP/s ≈ ~3,300 t/s prefill on Qwen3-32B)**, ~4.3× over today's 765. This is the *match-vLLM* path; vLLM's
|
|
||||||
own GB10 dense throughput runs on W4A16 Marlin (its FP4 path is broken on sm_121).
|
|
||||||
|
|
||||||
## Why a custom kernel (validated, not assumed)
|
|
||||||
|
|
||||||
On GB10 (sm_121), measured: **both** llama-MMQ (int8, Ampere-tuned) **and** cuBLAS-FP16 sit at ~46 TFLOP/s
|
|
||||||
(~21% of peak). cuBLAS falls back to an Ampere `cutlass_80_tensorop` kernel (CUDA-13 has no sm_121 GEMM for
|
|
||||||
these shapes); rebuilt with `-DGGML_CUDA_FORCE_CUBLAS=ON` it's *slower* than MMQ (690 vs 750). **No library
|
|
||||||
path reaches the ceiling on consumer Blackwell** — a hand-tuned sm_120a kernel is required. `mmapeak` measures
|
|
||||||
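the 213 BF16 peak as reachable, and vLLM's Marlin was assumed to hit it (later measured at only ~52 TFLOPS on GB10 — see the STOPPED note at the top of this plan), so the ceiling is real; the work is reaching it.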
|
|
||||||
|
|
||||||
## What Marlin does (the design we mirror)
|
|
||||||
|
|
||||||
Weights stored 4-bit, **dequantized in-register/shared-mem** in-flight; GEMM math on **FP16/BF16 tensor
|
|
||||||
cores** (`mma.sync m16n8k16`). Speed comes from: `cp.async` global→shared with a **multi-stage double-buffered
|
|
||||||
pipeline**, **offline weight reshuffle** into the MMA-friendly layout, activations kept resident in registers,
|
|
||||||
and **Stream-K** partitioning. Sources: IST-DASLab/marlin, arXiv 2408.11743, vLLM machete (Hopper successor).
|
|
||||||
|
|
||||||
## Phases (each ends with: numerical parity vs MMQ + a prefill benchmark)
|
|
||||||
|
|
||||||
### P0 — Harness + baseline — DONE
|
|
||||||
- **Correctness gate (GREEN):** `test-backend-ops test -o MUL_MAT -b CUDA0` → **1103/1103 passed** (CUDA vs CPU
|
|
||||||
reference, covers Q4_0/Q4_K at the real FFN shapes m=4096,k=14336,n=1..512). This is *the* parity check the
|
|
||||||
W4A16 kernel must keep green at every phase — it tests the CUDA MUL_MAT path the kernel will hook. The
|
|
||||||
`not supported` lines are `type_b=f16` combos (irrelevant; prefill uses f32 activations).
|
|
||||||
- **Perf baseline:** `llama-bench` dense Q4_K prefill = **~750 t/s (pp512 718 / pp2048 750) ≈ 46 TFLOP/s ≈ 21%
|
|
||||||
of the 213 BF16 ceiling**. The kernel must beat this toward ~3,300. (`test-backend-ops perf -o MUL_MAT` gives
|
|
||||||
per-shape GFLOPS too; build it once with the harness.)
|
|
||||||
- **Op-level baseline (the canonical kernel target), `test-backend-ops perf -o MUL_MAT`, m=4096 k=14336 (FFN):**
|
|
||||||
| n (tokens) | q4_0 | q4_K | regime |
|
|
||||||
|---|---|---|---|
|
|
||||||
| 1 | 817 GFLOPS | 761 GFLOPS | decode / mat-vec (memory-bound) |
|
|
||||||
| 8 | 5.77 TFLOPS | 4.11 TFLOPS | small-batch |
|
|
||||||
| **512** | **49.5 TFLOPS** | **47.1 TFLOPS** | **prefill GEMM — ~22% of the 213 ceiling** |
|
|
||||||
|
|
||||||
So the prefill GEMM target: lift q4_K n=512 from **47 → toward ~213 TFLOPS** (~4.5×). This per-shape number
|
|
||||||
is cleaner than end-to-end for kernel iteration.
|
|
||||||
- **Harness script:** `~/p0harness.sh` on the DGX (build test-backend-ops + correctness + perf). Reusable each
|
|
||||||
phase: `test-backend-ops test -o MUL_MAT -b CUDA0` must stay 1103/1103; the q4_K n=512 perf must climb from 47.
|
|
||||||
- test-backend-ops needed `-DLLAMA_BUILD_TESTS=ON`; now built in `~/llama.cpp-pr24423/build`.
|
|
||||||
|
|
||||||
### P1 — Dispatch seam (no behavior change) — DONE
|
|
||||||
- `marlin-w4a16.{cuh,cu}` + a gated hook in `ggml_cuda_mul_mat` (dense, non-ids path), behind
|
|
||||||
`GGML_CUDA_W4A16` + sm_120/121 (`cc >= GGML_CUDA_CC_BLACKWELL`) + type∈{Q4_0,Q4_K} + f32 activations.
|
|
||||||
Returns false → falls back to MMQ. Source + apply instructions: `kernel/w4a16/` (`HOOK.md`).
|
|
||||||
- **Verified on GB10:** clean build; `test-backend-ops MUL_MAT` = **1103/1103** (byte-identical default);
|
|
||||||
`llama-bench` dense Q4 pp512 unchanged (717.77 default / 718.26 with flag); `GGML_CUDA_W4A16=1` reaches the
|
|
||||||
seam (stderr `[w4a16] ... P1 seam - using MMQ`) and falls back. The empty frame P2/P3 fills.
|
|
||||||
|
|
||||||
### P2 — Correctness-first kernel (slow OK) — DONE
|
|
||||||
- **Kernel:** `marlin-w4a16.cu` replaces the P1 TODO with a real W4A16 GEMM. In-kernel dequant Q4→BF16 into
|
|
||||||
shared mem, `mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32` via ggml's `mma.cuh` tile abstractions
|
|
||||||
(`tile<16,8,nv_bfloat162>` A, `tile<8,8,nv_bfloat162>` B, `tile<16,8,float>` C), F32 accumulate, F32 write.
|
|
||||||
One warp per 16(M)x8(N) output tile, K looped in steps of 16. Both src0 (weights, row m) and src1 (acts,
|
|
||||||
row n) are row-major `[row][k]`, so A and B load symmetrically via `load_generic`; the mma does the dot over k.
|
|
||||||
- **Types handled:** Q4_0 and Q4_K. Q4_0 dequant `w=d*(q-8)` inline; Q4_K via the superblock decode mirrored
|
|
||||||
from `convert.cu` (`get_scale_min_k4`, 8x32 sub-blocks, `d*q-m`).
|
|
||||||
- **Shape classes handled:** contiguous 2D GEMM (the prefill path), `ne2==ne3==1`, f32 activations, K%16==0
|
|
||||||
(always true: Q4_0 K%32, Q4_K K%256). **Falls back to MMQ (returns false)** for batched (bs!=[1,1]),
|
|
||||||
broadcast (nr!=[1,1]), permuted / non-contiguous (per!=[0,1,2,3]), and any non-f32 activation (e.g. f16) -
|
|
||||||
keeps the gate green. M / N boundaries are zero-padded in-kernel (handles M not %16, N not %8).
|
|
||||||
- **Parity (the gate):** `GGML_CUDA_W4A16=1 test-backend-ops test -o MUL_MAT -b CUDA0` = **1103/1103 passed**
|
|
||||||
(the Q4_0/Q4_K f32 contiguous shapes run the kernel and match the CPU reference; batched/permuted/f16 fall
|
|
||||||
back). Default (flag-unset) build still **1103/1103** (byte-identical, seam returns false).
|
|
||||||
- **Model sanity / P2 perf:** `GGML_CUDA_W4A16=1 llama-bench -m Qwen3-32B-Q4_K_M.gguf -ngl 99 -p 512 -n 16
|
|
||||||
-ub 2048` runs clean: **pp512 = 31.75 t/s**, tg16 = 6.28 t/s. Slow as expected (naive 1-warp/tile, weights
|
|
||||||
re-dequantized per n-tile, no pipeline) - this is the correctness checkpoint; P3 brings the speedup. The real
|
|
||||||
Q4_K model matmul path engages the kernel without error.
|
|
||||||
|
|
||||||
### P3 — The Marlin pipeline (the speedup) — STEP 1 + SKEW-PAD/TILING LANDED; PREPACK + PIPELINE + STREAM-K DEFERRED
|
|
||||||
Goal: `cp.async` double/triple-buffered global->shared; offline weight reshuffle (a one-time repack of the Q4
|
|
||||||
tensor into the mma+pipeline layout); register-resident activation tiles; Stream-K split for the prefill M.
|
|
||||||
Target: >=150 TFLOP/s (>=~2,300 t/s), then ~213. **MMQ baseline to beat: 47.1 TFLOPS (q4_K n=512) / pp512 718.**
|
|
||||||
|
|
||||||
**Kernel structure now (committed P3b):** block-tiled multi-warp GEMM with a CONFLICT-FREE shared feed via skew
|
|
||||||
padding. `blockDim=(32, WM*WN)` so `threadIdx.x` is the warp lane (required by `mma.cuh` get_i/get_j) and
|
|
||||||
`threadIdx.y` is the warp index; the original 1-warp P2 launch put 128 threads on `threadIdx.x` and exploded
|
|
||||||
`get_j` into an out-of-bounds shared read (found via compute-sanitizer). `WM*WN` warps compute a
|
|
||||||
`BM(=WM*FM*16) x BN(=WN*FN*8)` output tile; each warp owns an `FM x FN` grid of m16n8k16 mma fragments
|
|
||||||
accumulated in F32. Per k-step (16-deep): all warps cooperatively dequant the `BM x 16` Q4 weight strip + load
|
|
||||||
the `BN x 16` f32->bf16 activation strip into shared, one `__syncthreads`, then `ldmatrix.x4` (A) / `ldmatrix.x2`
|
|
||||||
(B) fragments + `FM*FN` mmas. The shared rows hold 8 bf162 of data but are stored at a PADDED stride of 12 bf162
|
|
||||||
(`W4A16_SPAD`): ldmatrix's per-lane address is `row*stride`, and the natural stride 8 (a divisor of the
|
|
||||||
32-bank / 128-byte cycle) collides rows 0,4,8,12 into a 2-way bank conflict; skewing to 12 (48 bytes, a multiple of 16 bytes, so
|
|
||||||
ldmatrix's 16-byte alignment holds) makes `{r*12 mod 32}` hit 8 distinct bank-quads for r in 0..7, so both
|
|
||||||
halves of ldmatrix are conflict-free at only +50% on the small staged tile (~12 KB at the shipping tile).
|
|
||||||
Shipping config `WM=4,WN=4,FM=2,FN=4` -> `BM=128, BN=128`, 16 warps, 8 m16n8 C-tiles per warp (keeping
|
|
||||||
register pressure low is what lets BN grow without an occupancy cliff). M/N tails zero-padded in-kernel; still
|
|
||||||
gated to contiguous 2D Q4_0/Q4_K f32 prefill, else falls back to MMQ.
|
|
||||||
|
|
||||||
**Per-step results (q4_K n=512 via `test-backend-ops perf`; pp512/pp2048 via llama-bench Qwen3-32B-Q4_K_M):**
|
|
||||||
|
|
||||||
| step | q4_K n=512 | q4_0 n=512 | pp512 | pp2048 | vs MMQ 47 / 718 | notes |
|
|
||||||
|---|---|---|---|---|---|---|
|
|
||||||
| P2 (1 warp/tile) | ~2 TFLOPS | - | 31.75 | - | 0.04x | correctness checkpoint |
|
|
||||||
| Step 1: block tiling (load_generic, BM64/4w) | 6.63 (cold) | 7.53 | 119 | 123 | 0.14x | original committed kernel |
|
|
||||||
| P3b-1: skew-pad ldmatrix + BM128/8w | 8.50 (cold) | 10.56 | 148.5 | 153.9 | 0.18x | +28% q4_K, +40% q4_0 over step 1 |
|
|
||||||
| **P3b-2: + BN128/16w (current)** | **9.92 (cold)** | **11.68** | **177.6** | **185.0** | **0.21x** | +17% q4_K, +20% pp512 over P3b-1 (+49% pp512 over step 1) |
|
|
||||||
|
|
||||||
Parity gate **1103/1103** at every step, flag set and unset (byte-identical when unset). All P3b numbers above
|
|
||||||
are from thermally-bracketed cold A/B sessions (committed measured immediately before AND after each candidate,
|
|
||||||
identical both times -> the deltas are real, not thermal). P3b-1 cold A/B: 6.63/7.53 vs 8.52/10.49. P3b-2 cold
|
|
||||||
A/B (pairs here are q4_0/q4_K, the reverse of the P3b-1 ordering): BN64/8w 10.56/8.50 then 10.51/8.45 (bracket) vs BN128/16w 11.68/9.92.
|
|
||||||
|
|
||||||
**What landed / what was tried (honest):**
|
|
||||||
- **P3b - LANDED (committed).** Two combined changes lift the prior committed kernel: (1) **skew-pad
|
|
||||||
conflict-free ldmatrix** (shared row stride 8->12 bf162; makes `ldmatrix.x4`/`.x2` bank-conflict-free at near
|
|
||||||
zero occupancy cost) and (2) **bigger tile / more warps** (`BM=128, BN=64`, 8 warps). Cold A/B: q4_K
|
|
||||||
6.63->8.52 (+28%), q4_0 7.53->10.49 (+40%), pp512 119->148.5 (+25%). **Still ~5.5x under MMQ (47) per-op and
|
|
||||||
~4.8x under pp512 718 - does NOT beat MMQ.** This is forward progress, not the finish line.
|
|
||||||
- **The XOR-swizzle-FIRST plan was tested and is WRONG for this GPU - documented so it is not re-tried.** A
|
|
||||||
wide-row (BK=64, 128-byte rows) XOR swizzle `seg ^ (row&7)` IS conflict-free, but the 16 KB shared it needs
|
|
||||||
collapsed occupancy and dropped q4_K n=512 to **2.84 TFLOPS** (worse than the unswizzled 6.63) - the same
|
|
||||||
occupancy cliff P3 hit with a 32 KB pipeline. The conflict-free feed must be bought WITHOUT widening shared:
|
|
||||||
skew padding (above) does exactly that (6 KB), which is why it is the committed form. Lesson: on GB10 occupancy
|
|
||||||
dominates bank-conflict latency; never trade occupancy for a conflict-free layout.
|
|
||||||
- **Conflict-free feed alone did NOT beat the unswizzled kernel - the limiter moved.** At the SAME BM64/4w tile,
|
|
||||||
skew-pad ldmatrix (6.70) ~= load_generic (6.63): removing bank conflicts bought ~nothing. The win came only
|
|
||||||
when the tile grew (BM128/8w). A 5-config tile sweep then split the two quant types:
|
|
||||||
- **q4_0 SCALES with warps/tiles** (7.7 -> 10.5 -> **15.8 TFLOPS at BM128/16w**): feed/global-traffic bound,
|
|
||||||
helped by cutting redundant activation re-reads (more BM = fewer M-blocks each re-reading the act column).
|
|
||||||
- **q4_K is largely DEQUANT-COMPUTE bound** (the BM64/16w tile gives q4_0=15.8 but q4_K=6.8 - they diverge
|
|
||||||
hard). This **refines P3's "within 12%" finding**: that held only in the low-throughput memory-bound regime;
|
|
||||||
once the feed is unblocked, q4_K's per-element 6-bit superblock decode (`get_scale_min_k4` + superblock
|
|
||||||
indexing, redone every k-step AND re-done by every N-block) becomes the wall. BM256 regressed both (too few
|
|
||||||
blocks / register pressure).
|
|
||||||
- **Growing BN partly relieves the q4_K dequant wall (P3b-2).** Because every N-block re-decodes the same
|
|
||||||
weight strip, halving the N-block count (BN 64->128) halves that redundant q4_K decode - but only when BN is
|
|
||||||
spread across MORE WARPS (16w, 8 C-tiles/warp), not more fragments-per-warp: the FN=8 / FM=4 variants (16
|
|
||||||
C-tiles/warp) regressed to ~6.6 on register pressure, while WM=4,WN=4,FM=2,FN=4 (16w, 8 tiles/warp) lifted
|
|
||||||
q4_K 8.5->9.9 and q4_0 10.6->11.7 cold. BN=256 was no better and costs more shared. **BN128/16w is the
|
|
||||||
shipping tile.**
|
|
||||||
- **Next blocker (the remaining q4_K unlock) = offline prepack.** BN growth only divides the redundant decode by
|
|
||||||
the N-block count; it cannot remove the per-k-step decode itself. The full fix is the **one-time offline
|
|
||||||
repack** - decode the Q4 tensor ONCE into a cached device buffer keyed off the tensor data pointer, in a layout
|
|
||||||
with the scale/min pre-applied (store reshuffled 4-bit + per-subblock bf16 d,m, ~1.25x the q4 size, NOT a full
|
|
||||||
bf16 blow-up which would be ~4x), so the in-kernel path becomes a cheap `q*d - m` with coalesced loads. Then
|
|
||||||
`cp.async` multi-stage (sized to NOT widen shared past the occupancy cliff) and **Stream-K** over M. These
|
|
||||||
remain the multi-week core; **prepack is the highest-value next step for q4_K specifically** (it should let
|
|
||||||
q4_K join q4_0 on the feed-bound scaling curve instead of plateauing at ~10).
|
|
||||||
- **Methodology note (unchanged):** the box thermally throttles under sustained perf+bench runs (identical code
|
|
||||||
~8.8 cold vs ~6.6 hot earlier), so only same-session A/Bs are trustworthy. The P3b deltas above were taken in
|
|
||||||
one bracketed cold session for exactly this reason.
|
|
||||||
|
|
||||||
### P4 — Tune
|
|
||||||
- Tile (mmq_x/y analogues), warps, pipeline depth, occupancy. We have nsys (throughput) but **not ncu** on the
|
|
||||||
DGX — tuning is empirical (sweep configs, measure t/s). Note ncu would need sudo/driver perms we lack.
|
|
||||||
|
|
||||||
### P5 — Enable
|
|
||||||
- Default on for sm_120/121 + Q4_0/Q4_K dense when parity holds + faster; keep the flag as an escape hatch.
|
|
||||||
Ship as a LocalAI llama.cpp patch (the patches/ series) and/or upstream (ggml has no Marlin-equivalent —
|
|
||||||
issue #1519 — so it's net-new upstream value; float it with maintainers first).
|
|
||||||
|
|
||||||
## Risks / notes
|
|
||||||
- **Multi-week, expert-CUDA, DGX-only** (GB10 is the only sm_121). The session's network flakiness +
|
|
||||||
`llama-cli` hang make `llama-bench`/`test-backend-ops` the reliable verification tools (both work).
|
|
||||||
- Quantization correctness: Q4_K's superblock structure (256-elem, 6-bit scales) is more complex to dequant
|
|
||||||
in-kernel than Q4_0; consider landing Q4_0 first, then Q4_K.
|
|
||||||
- **Beat-path follow-on:** the FP4-MMA path (`mul_mat_q<MXFP4>`, currently ~5% of FP4 peak), once tuned/fixed on sm_121, could reach
|
|
||||||
~6,600 (2× BF16). Separate track; this W4A16 kernel is the match-path foundation.
|
|
||||||
- Reuse ggml's `mma.cuh` tile abstractions (MMQ already uses them) rather than raw PTX where possible.
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
# W4A16 seam — how to apply to a llama.cpp / ggml-cuda checkout
|
|
||||||
|
|
||||||
Two source files + two one-line edits to `ggml/src/ggml-cuda/ggml-cuda.cu`. The build picks up the
|
|
||||||
new `.cu` via the existing `file(GLOB)` after a `cmake -S . -B build` reconfigure (no CMakeLists edit).
|
|
||||||
|
|
||||||
## Files (copy into `ggml/src/ggml-cuda/`)
|
|
||||||
- `marlin-w4a16.cuh`
|
|
||||||
- `marlin-w4a16.cu`
|
|
||||||
|
|
||||||
## Edit `ggml/src/ggml-cuda/ggml-cuda.cu`
|
|
||||||
|
|
||||||
1. **Include** — after the existing `#include "ggml-cuda/fp4-grouped-moe.cuh"` (sibling-header style):
|
|
||||||
```cpp
|
|
||||||
#include "ggml-cuda/marlin-w4a16.cuh"
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Dispatch hook** — immediately before the dense dispatch chain, i.e. before
|
|
||||||
`if (!split && use_mul_mat_vec_f) {` in `ggml_cuda_mul_mat(...)` (after `const int cc = ...`):
|
|
||||||
```cpp
|
|
||||||
if (!split && ggml_cuda_w4a16_mul_mat(ctx, src0, src1, dst)) { return; }
|
|
||||||
```
|
|
||||||
|
|
||||||
## Verify (P1 acceptance — met)
|
|
||||||
- `cmake --build build --target test-backend-ops llama-bench` → builds clean.
|
|
||||||
- `test-backend-ops test -o MUL_MAT -b CUDA0` → **1103/1103** (byte-identical default).
|
|
||||||
- `llama-bench` dense Q4 pp512 → unchanged (~718, MMQ).
|
|
||||||
- `GGML_CUDA_W4A16=1 llama-bench` → unchanged + stderr `[w4a16] ... P1 seam - using MMQ` (seam reached,
|
|
||||||
gating passes on sm_121, falls back).
|
|
||||||
|
|
||||||
The kernel body (P2 correctness → P3 Marlin pipeline) replaces the `TODO(P2/P3)` block in `marlin-w4a16.cu`
|
|
||||||
and returns `true` once parity holds.
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
# W4A16 kernel - subagent dispatch briefs (P3, P4, P5)
|
|
||||||
|
|
||||||
**Dispatch strategy.** Each phase = one fresh **Opus-4.8** subagent handed a complete zero-context brief.
|
|
||||||
Phases are **sequential** (P3 needs P2's correct kernel; P4 needs P3's pipeline; P5 needs P4's tuned kernel),
|
|
||||||
so dispatch phase N+1 only after phase N's commit lands, and before dispatching, splice phase N's *actual*
|
|
||||||
deliverable (final kernel shape, configs, fallback set) into the next brief. P2's brief (already dispatched)
|
|
||||||
is the template; reuse the COMMON section below verbatim in every dispatch.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## COMMON (paste into every phase brief)
|
|
||||||
|
|
||||||
- **Kernel dev is on the remote DGX** (GB10, sm_121): `ssh -o ConnectTimeout=25 -o ServerAliveInterval=10 -o ServerAliveCountMax=10 dgx.casa '<cmd>'`. Network is FLAKY (re-poll on drop; nohup jobs survive). `llama-cli` HANGS - never use it. Only `llama-bench` + `test-backend-ops` work.
|
|
||||||
- Checkout `~/llama.cpp-pr24423`, build `~/llama.cpp-pr24423/build` (sm_121, `-DLLAMA_BUILD_TESTS=ON`). Kernel file `ggml/src/ggml-cuda/marlin-w4a16.cu`. Build auto-GLOBs it; no CMakeLists edits. Hook already in `ggml-cuda.cu`, gated behind env `GGML_CUDA_W4A16`.
|
|
||||||
- Dense test model: `~/bench/q3-32b-gguf/Qwen3-32B-Q4_K_M.gguf`.
|
|
||||||
- **Builds run detached + poll** (never blocking foreground): write a `~/pN.sh` that builds `--target test-backend-ops llama-bench`, echoes `RC=$?`, runs the gate, echoes `PN_DONE`; `nohup` it; poll `for i in $(seq 1 90); do grep -q PN_DONE ~/pN.out && break; sleep 20; done; tail ~/pN.out`.
|
|
||||||
- **GPU hygiene:** check `docker ps | grep local-ai` + `nvidia-smi`; `docker stop` a running localai worker if present (authorized); never pkill native procs; never start model servers.
|
|
||||||
- **Parity gate (must stay green every step):** `GGML_CUDA_W4A16=1 CUDA_VISIBLE_DEVICES=0 ./build/bin/test-backend-ops test -o MUL_MAT -b CUDA0` = **1103/1103**; and flag-unset stays 1103/1103 (byte-identical). A wrong result is worse than a fallback - return false for any shape you can't do correctly.
|
|
||||||
- **Perf measurement:** `test-backend-ops perf -o MUL_MAT -b CUDA0` (per-shape GFLOPS; the canonical target is q4_K m=4096 k=14336 **n=512**, baseline **47.1 TFLOPS**, ceiling ~213) + `llama-bench -m <model> -ngl 99 -p 512,2048 -n 0 -ub 2048` (baseline pp512 ~718).
|
|
||||||
- **LocalAI repo (commit here; you do NOT inherit cwd - `cd` explicitly):** `/home/mudler/_git/LocalAI/.claude/worktrees/feat+paged-attention`. Plan: `backend/cpp/llama-cpp/paged/W4A16_MARLIN_KERNEL_PLAN.md`. Source mirror: `backend/cpp/llama-cpp/paged/kernel/w4a16/`. After a phase passes: fetch the final `marlin-w4a16.cu` from the DGX (`ssh ... 'cat ...'`), overwrite the mirror, update the plan (mark the phase DONE with numbers), `git commit -s` (DCO sign-off; user is Ettore Di Giacinto <mudler@localai.io>). **No `Co-Authored-By`. No em-dashes anywhere. Trailer `Assisted-by: Claude:opus-4.8 [Claude Code]`. Do NOT push.**
|
|
||||||
- Final message = the result (gate ?/1103, the perf delta, blockers + resolutions, commit hash). A precise partial result beats a vague success claim.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## P3 brief - the Marlin pipeline (the speedup)
|
|
||||||
|
|
||||||
**Goal.** Take P2's correct-but-slow kernel from ~47 toward ~150+ TFLOPS (then ~213) on the q4_K n=512 prefill GEMM, **without ever breaking parity**. This is the Marlin design: the math is the same BF16 mma; the speed comes from feeding the tensor cores without stalling.
|
|
||||||
|
|
||||||
**Implement, incrementally (re-run the parity gate after each):**
|
|
||||||
1. **`cp.async` multi-stage pipeline** - double/triple-buffer global->shared loads of both the Q4 weight tiles and the activation tiles so dequant+mma on stage k overlaps the load of stage k+1. (Study `mma.cuh` + how `mmq.cu`/`mmf.cu` stage shared memory; ggml already uses `cp.async`/`__pipeline_*`.)
|
|
||||||
2. **Offline weight reshuffle** - repack the Q4 weights once into the mma+pipeline-friendly layout (Marlin's interleave) so loads are coalesced and the mma fragment maps directly. Do this as a load-time transform of src0 (a new prepacked buffer keyed off the tensor) - NOT per-call. Document where the repack lives + its memory cost.
|
|
||||||
3. **Register-resident activation tiles + Stream-K** split of the M dimension across blocks for the prefill (large-M) case so all SMs stay busy.
|
|
||||||
|
|
||||||
**Acceptance.** Parity gate stays **1103/1103** at every commit; `test-backend-ops perf` q4_K n=512 climbs materially above 47 TFLOPS (target >=150) and `llama-bench` pp512 climbs above ~718. Report the TFLOPS + t/s after each of the 3 steps so the contribution of each is visible. If a step regresses parity, revert it and report why.
|
|
||||||
|
|
||||||
**Reference.** IST-DASLab/marlin (github), arXiv 2408.11743, vLLM machete. Mirror `mmf.cu`'s BF16 GEMM structure; Marlin = that + Q4 dequant-on-load + the pipeline/reshuffle.
|
|
||||||
|
|
||||||
**Splice before dispatch:** P2's final kernel structure (tile sizes, which types/shapes it handles vs falls back, helper functions it defined).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## P4 brief - tune to the ceiling
|
|
||||||
|
|
||||||
**Goal.** Drive the P3 kernel as close to the ~213 TFLOPS ceiling as empirical tuning allows. **No `ncu` on this box** (no driver perms) - tune by throughput: `test-backend-ops perf` + `llama-bench` + `nsys` (throughput only).
|
|
||||||
|
|
||||||
**Do.** Parametrize the kernel (template params / constants) over: tile M/N/K, warps per block, pipeline depth (stages), and occupancy (regs, shared-mem budget). Sweep systematically (a script that rebuilds + benches each config, logs q4_K n=512 TFLOPS + pp512/pp2048 t/s), pick the best, hard-set it (with a short comment on the sweep). Check both prefill shapes (n=512 and n=2048) and confirm decode (n=1) didn't regress (it should still route to mat-vec, not this kernel - verify the gating).
|
|
||||||
|
|
||||||
**Acceptance.** Best config maximizes q4_K n=512 TFLOPS (stretch ~150-213) with parity **1103/1103** intact; the sweep table (config -> TFLOPS/t-s) is recorded in the plan's P4 section. Report the chosen config + the final pp512/pp2048 t/s vs the 718/750 baseline and vs vLLM's ~3300 single-stream target.
|
|
||||||
|
|
||||||
**Splice before dispatch:** P3's pipeline structure + the perf it reached + which knobs are already fixed vs free.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## P5 brief - enable + package + (maybe) upstream
|
|
||||||
|
|
||||||
**Goal.** Make W4A16 the default dense-Q4 path on Blackwell and ship it through LocalAI.
|
|
||||||
|
|
||||||
**Do.**
|
|
||||||
1. **Flip the gate:** default-ON for sm_120/121 + Q4_0/Q4_K dense when faster, keep an opt-out env (e.g. `GGML_CUDA_W4A16=0`) as an escape hatch. The existing return-false-on-unhandled-shape path is the correctness safety net; keep it. Verify the default (no env) build now runs W4A16 for dense Q4, gate green, faster than the old MMQ baseline.
|
|
||||||
2. **Package as a LocalAI llama.cpp patch:** produce `backend/cpp/llama-cpp/paged/patches/kernel/0002-w4a16-marlin.patch` (the new files + the `ggml-cuda.cu` hook + the gate flip) that applies cleanly to the pinned llama.cpp, mirroring the existing `patches/kernel/0001-fp4-grouped-moe-scaffold.patch`. Confirm LocalAI's `make backends/llama-cpp` build path can consume it (read `.agents/llama-cpp-backend.md` + the build memory: `make -C backend/cpp/llama-cpp clean` before rebuilds).
|
|
||||||
3. **Docs:** update `BLACKWELL_KERNEL_GAPS.md` + the plan with the shipped result; add a short note to the LocalAI docs if there's a Blackwell/performance page.
|
|
||||||
4. **Upstream decision (do NOT open without surfacing first):** ggml has no Marlin-equivalent (issue #1519) so this is net-new upstream value. Draft (do not submit) an upstream PR description + note the sm_121 build-flag caveats; report it for the user to decide.
|
|
||||||
|
|
||||||
**Acceptance.** Default Blackwell build uses W4A16 for dense Q4, parity 1103/1103, measurably faster than MMQ; the patch applies + the LocalAI llama-cpp backend builds with it (verify or, if the full backend build is too heavy, document the exact build command + that the patch applies cleanly). Report the end-to-end LocalAI dense-Q4 prefill number vs the start-of-project 765 t/s.
|
|
||||||
|
|
||||||
**Splice before dispatch:** P4's final kernel + config + the measured ceiling reached; the exact enable condition decided.
|
|
||||||
@@ -1,258 +0,0 @@
|
|||||||
#include "marlin-w4a16.cuh"
|
|
||||||
#include "mma.cuh"
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cuda_bf16.h>
|
|
||||||
|
|
||||||
// W4A16 Marlin-style GEMM.
|
|
||||||
//
|
|
||||||
// In-kernel dequantize Q4 weights -> BF16, multiply against BF16-converted F32
|
|
||||||
// activations using mma.sync m16n8k16 BF16 tensor-core ops, accumulate in F32,
|
|
||||||
// write F32 output. Handles only the contiguous 2D GEMM (prefill) case for
|
|
||||||
// Q4_0 / Q4_K; everything else returns false and falls back to MMQ.
|
|
||||||
//
|
|
||||||
// ggml MUL_MAT convention: dst[m,n] = sum_k src0[k,m] * src1[k,n].
|
|
||||||
// src0 (weights): ne0=K (contiguous), ne1=M -> row m is K contiguous quants.
|
|
||||||
// src1 (acts,f32): ne0=K (contiguous), ne1=N -> row n is K contiguous floats.
|
|
||||||
// dst (f32): ne0=M (contiguous), ne1=N -> element (m,n) at m + n*M.
|
|
||||||
// Both operands are row-major [row][k]; m16n8k16 computes C[m,n] += sum_k A[m,k]*B[n,k].
|
|
||||||
//
|
|
||||||
// Thread layout: blockDim = (32, WM*WN). threadIdx.x is the warp lane (0..31,
|
|
||||||
// required by mma.cuh get_i/get_j), threadIdx.y is the warp index.
|
|
||||||
//
|
|
||||||
// P3b step 1 - conflict-free shared layout via SKEW PADDING:
|
|
||||||
// - WM*WN warps compute a BM(=WM*FM*16) x BN(=WN*FN*8) output tile; each warp
|
|
||||||
// owns an FM x FN grid of m16n8k16 mma fragments accumulated in F32.
|
|
||||||
// - Per 16-deep k-step the warps cooperatively dequant the BM x 16 Q4 weight
|
|
||||||
// strip + load the BN x 16 f32->bf16 activation strip into shared, then feed
|
|
||||||
// the tensor cores with ldmatrix.x4 (A) / ldmatrix.x2 (B).
|
|
||||||
// - The shared rows are PADDED to SPAD(=12) bf162 instead of the natural 8.
|
|
||||||
// ldmatrix's per-lane address is row*stride; with the natural stride 8 (a
|
|
||||||
// divisor of the 32-bank / 128-byte cycle) rows 0,4,8,12 collide -> 2-way
|
|
||||||
// bank conflict on every fragment load (this is why P3 measured a plain
|
|
||||||
// ldmatrix swap as neutral). Skewing the stride to 12 bf162 (48 bytes, a multiple of 16, so
|
|
||||||
// ldmatrix's 16-byte alignment holds) makes {r*12 mod 32} hit 8 distinct
|
|
||||||
// bank-quads for r in 0..7, so both halves of ldmatrix.x4 and ldmatrix.x2 are
|
|
||||||
// conflict-free. The pad costs only +50% on the small (~4 KB) staged tile, so
|
|
||||||
// unlike a 128-byte-row XOR swizzle it does NOT collapse occupancy on GB10
|
|
||||||
// (a wide-row swizzle pushed shared to 16 KB and dropped this to ~2.8 TFLOPS).
|
|
||||||
//
|
|
||||||
// Dead-ends already proven (do not re-try): a double-buffered KSTAGE=64 cp.async
|
|
||||||
// pipeline collapsed occupancy (32 KB shared -> 2.7 TFLOPS); a plain ldmatrix on
|
|
||||||
// the UNpadded layout was neutral (bank conflicts); a wide-row (BK=64) XOR swizzle
|
|
||||||
// was conflict-free but occupancy-starved (16 KB shared -> 2.8 TFLOPS). Skew
|
|
||||||
// padding gets the conflict-free feed at near-zero occupancy cost.
|
|
||||||
|
|
||||||
using namespace ggml_cuda_mma;
|
|
||||||
|
|
||||||
typedef tile<16, 8, nv_bfloat162> tile_A; // 16(M) x 16(K)
|
|
||||||
typedef tile< 8, 8, nv_bfloat162> tile_B; // 8(N) x 16(K)
|
|
||||||
typedef tile<16, 8, float> tile_C; // 16(M) x 8(N)
|
|
||||||
|
|
||||||
// bf162 columns actually live per shared row (16 k-values = 8 bf162) ...
|
|
||||||
#define W4A16_KP 8
|
|
||||||
// ... padded to this stride to bank-skew the ldmatrix row addresses.
|
|
||||||
#define W4A16_SPAD 12
|
|
||||||
|
|
||||||
// Returns whether the experimental W4A16 path is switched on through the
// GGML_CUDA_W4A16 environment variable. The variable is read once; the result
// is cached in a function-local static for the lifetime of the process.
//
// The path is enabled when the variable is set to anything other than the
// empty string or exactly "0". Previously any value, including "0", enabled
// the kernel, so GGML_CUDA_W4A16=0 could not be used as an opt-out.
static bool w4a16_enabled() {
    static const bool en = [] {
        const char * v = std::getenv("GGML_CUDA_W4A16");
        if (v == nullptr || v[0] == '\0') {
            return false; // unset or empty -> disabled
        }
        if (v[0] == '0' && v[1] == '\0') {
            return false; // explicit "0" -> disabled
        }
        return true;
    }();
    return en;
}
|
|
||||||
|
|
||||||
// Unpack the j-th 6-bit (scale, min) pair from a Q4_K packed scale array
// (same bit layout as convert.cu get_scale_min_k4).
//   j < 4 : both values are the low 6 bits of q[j] and q[j + 4].
//   j >= 4: the low 4 bits come from the two nibbles of q[j + 4]; the high
//           2 bits come from the top two bits of q[j - 4] (scale) and q[j] (min).
// Results are written through the d (scale) and m (min) references.
static __device__ __forceinline__ void w4a16_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
    if (j >= 4) {
        const uint8_t nibbles = q[j + 4];
        d = (nibbles & 0xF) | ((q[j - 4] >> 6) << 4);
        m = (nibbles >> 4)  | ((q[j]     >> 6) << 4);
        return;
    }
    d = q[j]     & 63;
    m = q[j + 4] & 63;
}
|
|
||||||
|
|
||||||
// Dequantize one Q4_0 weight: element k of the quantized row starting at `row`.
// Within a block, the first QK4_0/2 elements are the low nibbles of qs[] and
// the remaining QK4_0/2 elements are the high nibbles. The value is
// (nibble - 8) * d, where d is the block's half-precision scale.
static __device__ __forceinline__ float w4a16_dq_q4_0(const char * row, int k) {
    constexpr int HALF_BLOCK = QK4_0/2;

    const block_q4_0 * blocks = (const block_q4_0 *) row;
    const block_q4_0 & b      = blocks[k / QK4_0];
    const int          j      = k % QK4_0;

    int nibble;
    if (j < HALF_BLOCK) {
        nibble = b.qs[j] & 0xF;
    } else {
        nibble = b.qs[j - HALF_BLOCK] >> 4;
    }

    const float scale = __half2float(b.d);
    return (nibble - 8) * scale;
}
|
|
||||||
|
|
||||||
// Dequantize one Q4_K weight: element k of the quantized row starting at `row`.
// Each super-block is walked in groups of 64 elements. A group uses 32
// consecutive qs[] bytes: the low nibbles hold the first 32 elements and the
// high nibbles hold the last 32. Every 32-element half has its own 6-bit
// (scale, min) pair, decoded by w4a16_scale_min_k4. The value is
// (dall * scale) * q - (dmin * min), with dall/dmin the two halves of blk->dm.
static __device__ __forceinline__ float w4a16_dq_q4_K(const char * row, int k) {
    const block_q4_K * blk = (const block_q4_K *) row + (k / QK_K);

    const int  e         = k % QK_K;     // element index inside the super-block
    const int  group     = e / 64;       // 64-element group
    const int  in_group  = e % 64;
    const bool upper     = in_group >= 32; // second half of the group -> high nibble
    const int  byte_idx  = in_group % 32;  // byte within the group's 32 qs bytes
    const int  sub_block = 2*group + (upper ? 1 : 0);

    uint8_t sc, mn;
    w4a16_scale_min_k4(sub_block, blk->scales, sc, mn);

    const float dall = __low2half (blk->dm);
    const float dmin = __high2half(blk->dm);
    const float d    = dall * sc;
    const float m    = dmin * mn;

    const uint8_t qb = blk->qs[32*group + byte_idx];
    const int     q  = upper ? (qb >> 4) : (qb & 0xF);
    return d * q - m;
}
|
|
||||||
|
|
||||||
// W4A16 GEMM tile kernel: dst[m,n] = sum_k dequant(src0)[m,k] * src1[n,k].
//
// Template parameters:
//   IS_Q4_K - true: src0 rows are block_q4_K; false: block_q4_0.
//   WM, WN  - warps per block along M and N (blockDim = (32, WM*WN)).
//   FM, FN  - m16n8k16 fragments owned by each warp along M and N.
// One block produces a BM(=WM*FM*16) x BN(=WN*FN*8) tile of dst.
//
// Arguments:
//   src0    - quantized weights; row m starts at src0 + m*nb01.
//   src1    - f32 activations;   row n starts at src1 + n*nb11.
//   dst     - f32 output; element (m,n) is stored at dst[n*dst_ne0 + m].
//   M, N, K - problem size. The k loop steps by 16 and reads k0..k0+15 with no
//             tail handling, so the caller must guarantee K % 16 == 0.
//
// Rows beyond M / N are staged as zeros and never written back, so partial
// edge tiles are safe.
template <bool IS_Q4_K, int WM, int WN, int FM, int FN>
static __global__ void __launch_bounds__(WM*WN*32, 1)
w4a16_gemm_kernel(
        const char * __restrict__ src0,
        const char * __restrict__ src1,
        float * __restrict__ dst,
        const int M, const int N, const int K,
        const int64_t nb01, const int64_t nb11, const int64_t dst_ne0) {
    constexpr int KP   = W4A16_KP;   // 8 bf162 = 16 k per row
    constexpr int SPAD = W4A16_SPAD; // padded row stride (bank skew)
    constexpr int BM   = WM*FM*16;
    constexpr int BN   = WN*FN*8;
    constexpr int NTH  = WM*WN*32;

    // ldmatrix reads 16-byte rows: the padded stride must keep every row start
    // 16-byte aligned and must not be narrower than the live data.
    static_assert(SPAD >= KP, "padded stride must cover the live columns");
    static_assert((SPAD * sizeof(nv_bfloat162)) % 16 == 0, "padded row stride must be a multiple of 16 bytes for ldmatrix");

    const int m0 = blockIdx.x * BM;
    const int n0 = blockIdx.y * BN;

    const int warp_id = threadIdx.y; // 0 .. WM*WN-1
    const int warp_n  = warp_id % WN;
    const int warp_m  = warp_id / WN;
    const int tid     = threadIdx.y*32 + threadIdx.x;

    // Explicit 16-byte alignment: nv_bfloat162 alone only guarantees 4 bytes,
    // which is not enough for the ldmatrix row addresses derived from these bases.
    __shared__ __align__(16) nv_bfloat162 sW[BM*SPAD]; // [m][kpair], padded row stride SPAD
    __shared__ __align__(16) nv_bfloat162 sB[BN*SPAD]; // [n][kpair], padded row stride SPAD

    tile_C C[FM][FN]; // zero-initialized accumulators

    for (int k0 = 0; k0 < K; k0 += 16) {
        // Dequantize the BM x 16 weight strip once; reused across the block's BN span.
#pragma unroll
        for (int idx = tid; idx < BM*KP; idx += NTH) {
            const int m  = idx / KP;
            const int kk = idx % KP;
            const int k  = k0 + 2*kk;
            float w0 = 0.0f, w1 = 0.0f;
            if (m0 + m < M) {
                const char * row = src0 + (int64_t)(m0 + m) * nb01;
                if (IS_Q4_K) { w0 = w4a16_dq_q4_K(row, k); w1 = w4a16_dq_q4_K(row, k + 1); }
                else         { w0 = w4a16_dq_q4_0(row, k); w1 = w4a16_dq_q4_0(row, k + 1); }
            }
            sW[m*SPAD + kk] = __floats2bfloat162_rn(w0, w1);
        }
        // Load the BN x 16 activation strip (f32 -> bf16).
#pragma unroll
        for (int idx = tid; idx < BN*KP; idx += NTH) {
            const int n  = idx / KP;
            const int kk = idx % KP;
            const int k  = k0 + 2*kk;
            float a0 = 0.0f, a1 = 0.0f;
            if (n0 + n < N) {
                const float * arow = (const float *)(src1 + (int64_t)(n0 + n) * nb11);
                a0 = arow[k]; a1 = arow[k + 1];
            }
            sB[n*SPAD + kk] = __floats2bfloat162_rn(a0, a1);
        }
        // Both strips must be fully staged before any warp loads fragments.
        __syncthreads();

        tile_A Af[FM];
        tile_B Bf[FN];
#pragma unroll
        for (int fm = 0; fm < FM; ++fm) {
            const int mrow = (warp_m*FM + fm) * 16;
            load_ldmatrix(Af[fm], sW + mrow*SPAD, SPAD);
        }
#pragma unroll
        for (int fn = 0; fn < FN; ++fn) {
            const int ncol = (warp_n*FN + fn) * 8;
            load_ldmatrix(Bf[fn], sB + ncol*SPAD, SPAD);
        }
#pragma unroll
        for (int fm = 0; fm < FM; ++fm) {
#pragma unroll
            for (int fn = 0; fn < FN; ++fn) {
                mma(C[fm][fn], Af[fm], Bf[fn]);
            }
        }
        // Keep the next iteration from overwriting shared memory still being read.
        __syncthreads();
    }

    // Write the accumulators back, skipping rows/columns outside the problem.
#pragma unroll
    for (int fm = 0; fm < FM; ++fm) {
#pragma unroll
        for (int fn = 0; fn < FN; ++fn) {
            const int mbase = m0 + (warp_m*FM + fm) * 16;
            const int nbase = n0 + (warp_n*FN + fn) * 8;
#pragma unroll
            for (int l = 0; l < tile_C::ne; ++l) {
                const int m = mbase + tile_C::get_i(l);
                const int n = nbase + tile_C::get_j(l);
                if (m < M && n < N) {
                    dst[(int64_t)n * dst_ne0 + m] = C[fm][fn].x[l];
                }
            }
        }
    }
}
|
|
||||||
|
|
||||||
// Host entry point for the W4A16 GEMM.
//
// Returns true when the multiplication was launched on ctx.stream(); returns
// false, without touching dst, for anything this path does not handle so the
// caller can fall back to its regular dispatch. Handled case: env gate on,
// src0 is Q4_0 or Q4_K, src1 and dst are F32, device cc passes the Blackwell
// check, all tensors are contiguous 2D with matching shapes, K % 16 == 0, and
// the sizes fit the kernel's int indices and the launch grid.
bool ggml_cuda_w4a16_mul_mat(
        ggml_backend_cuda_context & ctx,
        const ggml_tensor * src0,
        const ggml_tensor * src1,
        ggml_tensor * dst) {
    if (!w4a16_enabled()) {
        return false;
    }
    if (src0->type != GGML_TYPE_Q4_0 && src0->type != GGML_TYPE_Q4_K) {
        return false;
    }
    if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
        return false;
    }
    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    if (!GGML_CUDA_CC_IS_NVIDIA(cc) || cc < GGML_CUDA_CC_BLACKWELL) {
        return false; // consumer Blackwell (sm_120/121) only
    }

    // Only the plain 2D case: no batch / broadcast dimensions.
    if (src0->ne[2] != 1 || src0->ne[3] != 1 ||
        src1->ne[2] != 1 || src1->ne[3] != 1 ||
        dst->ne[2]  != 1 || dst->ne[3]  != 1) {
        return false;
    }
    if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1) || !ggml_is_contiguous(dst)) {
        return false;
    }

    const int64_t K = src0->ne[0];
    const int64_t M = src0->ne[1];
    const int64_t N = src1->ne[1];
    if (src1->ne[0] != K || dst->ne[0] != M || dst->ne[1] != N) {
        return false;
    }
    if (K % 16 != 0) {
        return false; // the kernel consumes k in steps of 16 with no tail handling
    }

    // Block tile config: WM*WN warps compute BM(=WM*FM*16) x BN(=WN*FN*8).
    constexpr int WM = 4, WN = 4, FM = 2, FN = 4; // BM=128, BN=128, 16 warps
    constexpr int BM = WM*FM*16;
    constexpr int BN = WN*FN*8;

    // The kernel indexes with 32-bit ints (including m0 + m / n0 + n, which can
    // run up to one tile past M / N), and the grid's y dimension is limited to
    // 65535 blocks. Empty M / N would give a zero-sized grid. Decline all of
    // these instead of truncating or issuing an invalid launch.
    constexpr int64_t INT32_LIMIT = 2147483647;
    constexpr int64_t GRID_Y_MAX  = 65535;
    if (M <= 0 || N <= 0) {
        return false;
    }
    if (M > INT32_LIMIT - BM || N > INT32_LIMIT - BN || K > INT32_LIMIT) {
        return false;
    }
    const int64_t grid_x = (M + BM - 1) / BM;
    const int64_t grid_y = (N + BN - 1) / BN;
    if (grid_y > GRID_Y_MAX) {
        return false;
    }

    cudaStream_t stream = ctx.stream();

    const dim3 grid((unsigned) grid_x, (unsigned) grid_y, 1);
    const dim3 block(32, WM*WN, 1);

    const char * w = (const char *) src0->data;
    const char * a = (const char *) src1->data;
    float      * o = (float *) dst->data;

    if (src0->type == GGML_TYPE_Q4_K) {
        w4a16_gemm_kernel<true, WM, WN, FM, FN><<<grid, block, 0, stream>>>(
            w, a, o, (int) M, (int) N, (int) K, src0->nb[1], src1->nb[1], dst->ne[0]);
    } else {
        w4a16_gemm_kernel<false, WM, WN, FM, FN><<<grid, block, 0, stream>>>(
            w, a, o, (int) M, (int) N, (int) K, src0->nb[1], src1->nb[1], dst->ne[0]);
    }
    return true;
}
|
|
||||||
@@ -1,14 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "common.cuh"
|
|
||||||
|
|
||||||
// W4A16 Marlin-style BF16 GEMM for NVIDIA Blackwell consumer GPUs (sm_120/121).
|
|
||||||
// Dense (non-MoE) 4-bit-weight matmul run on BF16 tensor cores, the path that
|
|
||||||
// reaches the GB10 BF16 ceiling where MMQ (int8, Ampere-tuned) and cuBLAS (sm_80
|
|
||||||
// fallback) both plateau at ~22% of it. Returns true if it handled the op; false
|
|
||||||
// to fall back to MMQ. Gated behind GGML_CUDA_W4A16 until correct + faster.
|
|
||||||
bool ggml_cuda_w4a16_mul_mat(
|
|
||||||
ggml_backend_cuda_context & ctx,
|
|
||||||
const ggml_tensor * src0, // 4-bit weights (Q4_0/Q4_K)
|
|
||||||
const ggml_tensor * src1, // F32 activations
|
|
||||||
ggml_tensor * dst); // F32 output
|
|
||||||
@@ -1,129 +0,0 @@
|
|||||||
// paged-bench: quantify the multi-tenant wins of paged KV allocation that are
|
|
||||||
// properties of the host-side block model (vLLM-parity), independent of the
|
|
||||||
// in-model compute path.
|
|
||||||
//
|
|
||||||
// Win 1 (capacity): on-demand block allocation vs contiguous per-seq
|
|
||||||
// reservation, under a fixed KV block budget.
|
|
||||||
// Win 3 (prefix sharing): automatic cross-tenant prefix dedup via block
|
|
||||||
// hashing.
|
|
||||||
//
|
|
||||||
// Win 2 (throughput) is intentionally NOT here: it requires the paged read
|
|
||||||
// path wired into llama-graph.cpp (Gate 0). Measuring it at this layer would
|
|
||||||
// be dishonest, so it is reported as pending.
|
|
||||||
|
|
||||||
#include "paged_kv_manager.h"
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <vector>
|
|
||||||
#include <numeric>
|
|
||||||
|
|
||||||
using namespace paged;
|
|
||||||
|
|
||||||
// Small deterministic linear congruential generator, so the benchmark draws
// the same sequence lengths on every run.
struct Lcg {
    uint64_t s; // current 64-bit state

    explicit Lcg(uint64_t seed) : s(seed) {}

    // Advance the state one step and return its upper bits as a 32-bit value.
    uint32_t next() {
        s = s * 6364136223846793005ULL + 1442695040888963407ULL;
        return (uint32_t)(s >> 33);
    }

    // Draw an integer in the closed interval [lo, hi] (consumes one next()).
    int range(int lo, int hi) {
        const uint32_t span = (uint32_t)(hi - lo + 1);
        return lo + (int)(next() % span);
    }
};
|
|
||||||
|
|
||||||
// Ceiling division: the number of b-sized chunks needed to cover a.
static size_t cdiv(size_t a, size_t b) {
    const size_t rounded_up = a + b - 1;
    return rounded_up / b;
}
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
const int block_size = 16;
|
|
||||||
const int n_ctx = 2048; // max context a sequence could use
|
|
||||||
const int num_blocks = 512; // fixed KV budget: 512 blocks * 16 = 8192 cells
|
|
||||||
|
|
||||||
printf("paged-bench (block_size=%d, n_ctx=%d, budget=%d blocks = %d cells)\n\n",
|
|
||||||
block_size, n_ctx, num_blocks, num_blocks * block_size);
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------
|
|
||||||
// WIN 1: concurrency capacity. Sequences have realistic, VARYING lengths
|
|
||||||
// (most short, a few long) - the regime where reserving n_ctx per seq
|
|
||||||
// wastes the most. Count how many fit under the same block budget.
|
|
||||||
// ---------------------------------------------------------------------
|
|
||||||
{
|
|
||||||
Lcg rng(12345);
|
|
||||||
const int blocks_per_ctx = (int) cdiv(n_ctx, block_size); // contiguous reserves this per seq
|
|
||||||
|
|
||||||
// Contiguous (stream-style) reservation: every seq reserves n_ctx worth.
|
|
||||||
int contiguous_fit = num_blocks / blocks_per_ctx;
|
|
||||||
|
|
||||||
// Paged on-demand: draw real lengths until the pool is exhausted.
|
|
||||||
PagedKVManager m(num_blocks, block_size, /*enable_caching=*/false);
|
|
||||||
int paged_fit = 0;
|
|
||||||
long total_tokens = 0;
|
|
||||||
for (int seq = 0; ; ++seq) {
|
|
||||||
// 80% short (8-128 tok), 20% long (up to n_ctx)
|
|
||||||
int len = (rng.range(0, 99) < 80) ? rng.range(8, 128) : rng.range(128, n_ctx);
|
|
||||||
if (!m.allocate(seq, (size_t) len)) break;
|
|
||||||
paged_fit++;
|
|
||||||
total_tokens += len;
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("WIN 1 concurrency capacity @ %d-block budget\n", num_blocks);
|
|
||||||
printf(" contiguous (reserve n_ctx/seq): %d sequences\n", contiguous_fit);
|
|
||||||
printf(" paged (on-demand blocks): %d sequences (avg %ld tok/seq)\n",
|
|
||||||
paged_fit, paged_fit ? total_tokens / paged_fit : 0);
|
|
||||||
printf(" --> paged fits %.1fx more concurrent sequences\n\n",
|
|
||||||
contiguous_fit ? (double) paged_fit / contiguous_fit : 0.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------
|
|
||||||
// WIN 3: cross-tenant prefix sharing. N tenants share a long system
|
|
||||||
// prompt / RAG context, then diverge. Compare physical blocks consumed
|
|
||||||
// with prefix caching on vs off.
|
|
||||||
// ---------------------------------------------------------------------
|
|
||||||
{
|
|
||||||
const int n_tenants = 32;
|
|
||||||
const int shared_len = 1024; // shared system prompt (64 blocks)
|
|
||||||
const int distinct_len = 64; // per-tenant suffix (4 blocks)
|
|
||||||
|
|
||||||
// Shared prefix token ids (identical across tenants -> identical block hashes).
|
|
||||||
std::vector<int> shared(shared_len);
|
|
||||||
for (int i = 0; i < shared_len; ++i) shared[i] = 1000 + i;
|
|
||||||
|
|
||||||
// --- prefix caching OFF: every tenant pays for the whole prefix ---
|
|
||||||
long blocks_off = 0;
|
|
||||||
{
|
|
||||||
PagedKVManager m(num_blocks * 8, block_size, /*enable_caching=*/false);
|
|
||||||
for (int t = 0; t < n_tenants; ++t) {
|
|
||||||
m.allocate(t, (size_t) (shared_len + distinct_len));
|
|
||||||
blocks_off += m.block_table(t).size();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// --- prefix caching ON: shared blocks are deduped to one physical copy ---
|
|
||||||
long blocks_on = 0;
|
|
||||||
{
|
|
||||||
PagedKVManager m(num_blocks * 8, block_size, /*enable_caching=*/true);
|
|
||||||
// tenant 0 fills + caches the shared prefix
|
|
||||||
auto h = m.compute_block_hashes(shared);
|
|
||||||
m.allocate(0, (size_t) (shared_len + distinct_len));
|
|
||||||
m.cache_blocks(0, h, (size_t) shared_len);
|
|
||||||
long physical = m.block_table(0).size();
|
|
||||||
// tenants 1..N-1 hit the cached prefix; only their distinct suffix is new
|
|
||||||
for (int t = 1; t < n_tenants; ++t) {
|
|
||||||
size_t cached_tokens = m.get_computed_blocks(h); // shared blocks reused
|
|
||||||
size_t new_tokens = (shared_len - cached_tokens) + distinct_len;
|
|
||||||
m.allocate(t, (size_t) (shared_len + distinct_len));
|
|
||||||
// physically new blocks = only what wasn't already resident
|
|
||||||
physical += (long) cdiv(new_tokens, block_size);
|
|
||||||
}
|
|
||||||
blocks_on = physical;
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("WIN 3 cross-tenant prefix sharing (%d tenants, %d-tok shared prefix)\n",
|
|
||||||
n_tenants, shared_len);
|
|
||||||
printf(" prefix-cache OFF: %ld physical blocks\n", blocks_off);
|
|
||||||
printf(" prefix-cache ON: %ld physical blocks\n", blocks_on);
|
|
||||||
printf(" --> %.1fx less KV memory for the shared workload\n\n",
|
|
||||||
blocks_on ? (double) blocks_off / blocks_on : 0.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("WIN 2 aggregate throughput under load: PENDING\n");
|
|
||||||
printf(" Requires the paged gather-read path wired into llama-graph.cpp\n");
|
|
||||||
printf(" (Gate 0) to measure tok/s vs concurrency. Not measurable at the\n");
|
|
||||||
printf(" allocation layer; not reported here to avoid overclaiming.\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@@ -1,169 +0,0 @@
|
|||||||
// paged-loadgen: a dynamic-load benchmark for paged KV that actually exercises the
|
|
||||||
// regime where paging wins - variable prompt lengths, variable generation lengths,
|
|
||||||
// staggered (continuous) arrival, and a shared system prefix. The stock
|
|
||||||
// examples/paged/paged.cpp adds all requests up front with a fixed n_predict from a
|
|
||||||
// 20-prompt pool, so it never creates KV-memory pressure or fragmentation and
|
|
||||||
// therefore never shows a paged advantage (see PAGED_KV_HIGH_CONCURRENCY.md).
|
|
||||||
//
|
|
||||||
// Build: drop into PR #22569's examples/paged/ and add to its CMakeLists.txt next to
|
|
||||||
// llama-paged (it uses the same llama_paged_scheduler_* API). Run on the TARGET GPU
|
|
||||||
// (e.g. 2xH200) where bandwidth lets decode scale to thousands of sequences and KV
|
|
||||||
// memory becomes the binding constraint - that is where paged KV pays off and where
|
|
||||||
// this harness produces a meaningful number. On a low-bandwidth box (GB10) throughput
|
|
||||||
// plateaus long before memory binds, so the win is not observable there regardless.
|
|
||||||
//
|
|
||||||
// Metrics reported:
|
|
||||||
// - goodput (decode tokens/s aggregate) under the dynamic load
|
|
||||||
// - peak concurrent in-flight sequences actually sustained
|
|
||||||
// - paged peak KV bytes used vs the contiguous reservation a unified cache needs
|
|
||||||
// (n_seq_peak * max_ctx), i.e. the capacity ratio = the headroom paging unlocks
|
|
||||||
//
|
|
||||||
// The capacity ratio is the load-bearing number for the buy decision: it is how many
|
|
||||||
// more concurrent tenants a fixed HBM budget serves with paging than without.
|
|
||||||
|
|
||||||
#include "common.h"
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
#include <cmath>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cstring>
|
|
||||||
#include <random>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
// ---- workload knobs (env-overridable so the harness is sweepable without rebuilds) ----
|
|
||||||
// Read integer knob `k` from the environment; return `dflt` when it is unset.
// A set value is converted with atoi.
static int env_int(const char * k, int dflt) {
    const char * raw = getenv(k);
    if (!raw) {
        return dflt;
    }
    return atoi(raw);
}
|
|
||||||
|
|
||||||
struct workload_cfg {
|
|
||||||
int total_requests = env_int("LG_TOTAL", 2000); // total requests to serve
|
|
||||||
int target_inflight = env_int("LG_INFLIGHT", 256); // continuous-batching concurrency target
|
|
||||||
int prefix_tokens = env_int("LG_PREFIX", 512); // shared system-prompt prefix (prefix-cache target)
|
|
||||||
int suffix_min = env_int("LG_SUFMIN", 16); // per-request unique prompt suffix range
|
|
||||||
int suffix_max = env_int("LG_SUFMAX", 768);
|
|
||||||
int gen_short = env_int("LG_GENSHORT", 32); // bimodal generation: most short...
|
|
||||||
int gen_long = env_int("LG_GENLONG", 1024); // ...some long (the over-reservation driver)
|
|
||||||
int gen_long_pct = env_int("LG_LONGPCT", 15); // % of requests that are long
|
|
||||||
int block_size = env_int("LG_BLOCK", 16); // must match -kvbls
|
|
||||||
unsigned seed = (unsigned) env_int("LG_SEED", 1234);
|
|
||||||
};
|
|
||||||
|
|
||||||
// One request's plan, drawn from the workload distribution.
struct req_plan {
    int prompt_len; // prompt length in tokens
    int gen_len;    // number of tokens to generate
};
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
|
||||||
common_params params;
|
|
||||||
params.n_predict = -1; // per-request, controlled by the plan below
|
|
||||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PAGED)) {
|
|
||||||
fprintf(stderr, "usage: %s -m <model> -kvp --fit off -ngpub N -ncpub M -ngl 99\n", argv[0]);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
params.kv_paged = true;
|
|
||||||
|
|
||||||
common_init_result init = common_init_from_params(params);
|
|
||||||
llama_model * model = init.model.get();
|
|
||||||
llama_context * ctx = init.context.get();
|
|
||||||
if (!model || !ctx) { fprintf(stderr, "load failed\n"); return 1; }
|
|
||||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
||||||
|
|
||||||
workload_cfg cfg;
|
|
||||||
std::mt19937 rng(cfg.seed);
|
|
||||||
std::uniform_int_distribution<int> suf(cfg.suffix_min, cfg.suffix_max);
|
|
||||||
std::uniform_int_distribution<int> pct(1, 100);
|
|
||||||
|
|
||||||
// KV bytes/token = 2(K,V) * n_layers * n_head_kv * head_dim * sizeof(f16). Confirmed
|
|
||||||
// against llama-kv-cache-paged.cpp (block_bytes formula). Used for the capacity ratio.
|
|
||||||
const int n_layers = llama_model_n_layer(model);
|
|
||||||
const int n_head_kv = llama_model_n_head_kv(model);
|
|
||||||
const int head_dim = llama_model_n_embd(model) / llama_model_n_head(model);
|
|
||||||
const size_t kv_bytes_per_token = (size_t)2 * n_layers * n_head_kv * head_dim * sizeof(uint16_t);
|
|
||||||
|
|
||||||
// A long shared system prefix that every request reuses (the prefix-cache target).
|
|
||||||
std::vector<llama_token> prefix = common_tokenize(ctx, std::string(cfg.prefix_tokens, 'x'), true);
|
|
||||||
|
|
||||||
// Pre-draw all request plans so paged peak usage and the contiguous reservation are
|
|
||||||
// computed from the SAME workload.
|
|
||||||
std::vector<req_plan> plans(cfg.total_requests);
|
|
||||||
int max_ctx = 0;
|
|
||||||
for (auto & p : plans) {
|
|
||||||
p.prompt_len = cfg.prefix_tokens + suf(rng);
|
|
||||||
p.gen_len = (pct(rng) <= cfg.gen_long_pct) ? cfg.gen_long : cfg.gen_short;
|
|
||||||
max_ctx = std::max(max_ctx, p.prompt_len + p.gen_len);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_paged_scheduler * sched = llama_paged_scheduler_init(ctx);
|
|
||||||
if (!sched) { fprintf(stderr, "scheduler init failed\n"); return 1; }
|
|
||||||
|
|
||||||
// ---- continuous-arrival loop: keep ~target_inflight requests live at all times ----
|
|
||||||
int next_req = 0, done = 0, inflight = 0, peak_inflight = 0;
|
|
||||||
long total_decoded = 0;
|
|
||||||
size_t peak_kv_bytes_paged = 0; // sum over live seqs of ceil(used/block)*block*kv_bytes
|
|
||||||
size_t live_used_tokens = 0; // running sum of actual KV tokens held by live seqs
|
|
||||||
|
|
||||||
auto admit = [&](int rid) {
|
|
||||||
const req_plan & p = plans[rid];
|
|
||||||
std::vector<llama_token> toks = prefix; // shared prefix...
|
|
||||||
std::vector<llama_token> suff = common_tokenize(ctx, std::string(p.prompt_len - cfg.prefix_tokens, 'y'), false);
|
|
||||||
toks.insert(toks.end(), suff.begin(), suff.end()); // ...+ unique suffix
|
|
||||||
if (llama_paged_scheduler_add_request(sched, toks.data(), toks.size(), rid)) {
|
|
||||||
inflight++; peak_inflight = std::max(peak_inflight, inflight);
|
|
||||||
live_used_tokens += p.prompt_len;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const int64_t t0 = ggml_time_us();
|
|
||||||
for (int i = 0; i < cfg.target_inflight && next_req < cfg.total_requests; ++i) admit(next_req++);
|
|
||||||
|
|
||||||
llama_batch batch = {};
|
|
||||||
std::vector<llama_token> sampled; std::vector<int8_t> stop_flags;
|
|
||||||
|
|
||||||
while (done < cfg.total_requests) {
|
|
||||||
if (!llama_paged_scheduler_prepare_batch(sched, &batch)) break;
|
|
||||||
const llama_paged_batch_info * info = llama_paged_scheduler_get_batch_info(sched);
|
|
||||||
sampled.assign(info->n_seq, 0); stop_flags.assign(info->n_seq, 0);
|
|
||||||
|
|
||||||
// (decode is done inside the scheduler/update path in PR #22569; greedy here)
|
|
||||||
for (int i = 0; i < info->n_seq; ++i) {
|
|
||||||
const int rid = info->seq_ids[i];
|
|
||||||
llama_paged_seq_state st{};
|
|
||||||
llama_paged_scheduler_get_seq_state(sched, rid, &st);
|
|
||||||
// greedy argmax from the i-th row of logits
|
|
||||||
const float * lg = llama_get_logits_ith(ctx, i);
|
|
||||||
int best = 0; float bv = lg[0];
|
|
||||||
for (int t = 1; t < llama_vocab_n_tokens(vocab); ++t) if (lg[t] > bv) { bv = lg[t]; best = t; }
|
|
||||||
sampled[i] = best;
|
|
||||||
const bool stop = llama_vocab_is_eog(vocab, best) || st.n_decoded + 1 >= plans[rid].gen_len;
|
|
||||||
stop_flags[i] = stop ? 1 : 0;
|
|
||||||
if (!stop) { total_decoded++; live_used_tokens++; }
|
|
||||||
if (stop) {
|
|
||||||
done++; inflight--;
|
|
||||||
live_used_tokens -= (plans[rid].prompt_len + st.n_decoded);
|
|
||||||
if (next_req < cfg.total_requests) admit(next_req++); // continuous arrival
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// paged peak KV: blocks are allocated per live seq = ceil(used/block); approximate
|
|
||||||
// current paged footprint from live_used_tokens rounded up per the block size.
|
|
||||||
const size_t paged_now = (size_t)std::ceil((double)live_used_tokens / cfg.block_size)
|
|
||||||
* cfg.block_size * kv_bytes_per_token;
|
|
||||||
peak_kv_bytes_paged = std::max(peak_kv_bytes_paged, paged_now);
|
|
||||||
|
|
||||||
llama_paged_scheduler_update(sched, &batch, sampled.data(), stop_flags.data());
|
|
||||||
}
|
|
||||||
const double secs = (ggml_time_us() - t0) / 1e6;
|
|
||||||
|
|
||||||
// Contiguous unified-KV reservation needed to serve the SAME peak concurrency without
|
|
||||||
// mid-generation eviction: every live slot must be backed for the worst-case context.
|
|
||||||
const size_t contig_reserve = (size_t)peak_inflight * max_ctx * kv_bytes_per_token;
|
|
||||||
|
|
||||||
printf("\n==== paged-loadgen ====\n");
|
|
||||||
printf("requests served : %d (target inflight %d, peak inflight %d)\n", done, cfg.target_inflight, peak_inflight);
|
|
||||||
printf("goodput (decode) : %.1f tok/s (%ld tokens / %.2f s)\n", total_decoded / secs, total_decoded, secs);
|
|
||||||
printf("kv bytes / token : %zu (n_layer=%d n_head_kv=%d head_dim=%d f16)\n", kv_bytes_per_token, n_layers, n_head_kv, head_dim);
|
|
||||||
printf("paged peak KV : %.2f GiB (allocated on demand)\n", peak_kv_bytes_paged / 1073741824.0);
|
|
||||||
printf("contiguous reserve : %.2f GiB (peak_inflight * max_ctx %d)\n", contig_reserve / 1073741824.0, max_ctx);
|
|
||||||
printf("CAPACITY RATIO : %.2fx <- tenants-per-HBM paging unlocks\n",
|
|
||||||
peak_kv_bytes_paged ? (double)contig_reserve / peak_kv_bytes_paged : 0.0);
|
|
||||||
printf(" (plus cross-request prefix sharing of the %d-token shared prefix, not counted above)\n", cfg.prefix_tokens);
|
|
||||||
|
|
||||||
llama_paged_scheduler_free(sched);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@@ -1,296 +0,0 @@
|
|||||||
#include "paged_kv_manager.h"
|
|
||||||
#include <cassert>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
namespace paged {
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// FreeBlockQueue (port of kv_cache_utils.py FreeKVCacheBlockQueue)
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
// Builds the free list from `blocks`, preserving their order, and threads it between
// the fake_head and fake_tail sentinels. An empty input links the sentinels directly.
FreeBlockQueue::FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks) {
    num_free_blocks = blocks.size();

    // Walk once, hanging each block off the previously linked node.
    KVCacheBlock* tail = &fake_head;
    for (KVCacheBlock* blk : blocks) {
        tail->next_free = blk;
        blk->prev_free  = tail;
        tail = blk;
    }

    // Close the chain on the tail sentinel.
    tail->next_free     = &fake_tail;
    fake_tail.prev_free = tail;
}
|
|
||||||
|
|
||||||
// Detaches and returns the block at the front of the free list.
// Throws std::runtime_error when the list holds no blocks.
KVCacheBlock* FreeBlockQueue::popleft() {
    KVCacheBlock* const head = fake_head.next_free;
    const bool is_empty = (head == nullptr) || (head == &fake_tail);
    if (is_empty) {
        assert(num_free_blocks == 0);
        throw std::runtime_error("No free blocks available");
    }

    // Splice the head sentinel onto the successor.
    KVCacheBlock* const successor = head->next_free;
    fake_head.next_free  = successor;
    successor->prev_free = &fake_head;

    // A detached block carries no list links.
    head->next_free = nullptr;
    head->prev_free = nullptr;

    --num_free_blocks;
    return head;
}
|
|
||||||
|
|
||||||
std::vector<KVCacheBlock*> FreeBlockQueue::popleft_n(size_t n) {
|
|
||||||
std::vector<KVCacheBlock*> ret;
|
|
||||||
if (n == 0) return ret;
|
|
||||||
assert(num_free_blocks >= n);
|
|
||||||
num_free_blocks -= n;
|
|
||||||
KVCacheBlock* curr = fake_head.next_free;
|
|
||||||
ret.reserve(n);
|
|
||||||
for (size_t i = 0; i < n; ++i) {
|
|
||||||
assert(curr != nullptr);
|
|
||||||
ret.push_back(curr);
|
|
||||||
KVCacheBlock* last = curr;
|
|
||||||
curr = curr->next_free;
|
|
||||||
last->prev_free = last->next_free = nullptr;
|
|
||||||
}
|
|
||||||
if (curr != nullptr) {
|
|
||||||
fake_head.next_free = curr;
|
|
||||||
curr->prev_free = &fake_head;
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Unlinks `block` from wherever it sits in the free list.
// Throws std::runtime_error when the block has a missing neighbour link.
void FreeBlockQueue::remove(KVCacheBlock* block) {
    KVCacheBlock* const before = block->prev_free;
    KVCacheBlock* const after  = block->next_free;
    if (before == nullptr || after == nullptr) {
        throw std::runtime_error("remove() called on an invalid block");
    }

    // Bridge the two neighbours over the removed node.
    before->next_free = after;
    after->prev_free  = before;

    block->prev_free = nullptr;
    block->next_free = nullptr;
    num_free_blocks -= 1;
}
|
|
||||||
|
|
||||||
// Links `block` in at the back of the free list, just before the tail sentinel.
void FreeBlockQueue::append(KVCacheBlock* block) {
    KVCacheBlock* const old_back = fake_tail.prev_free;

    block->prev_free = old_back;
    block->next_free = &fake_tail;

    old_back->next_free = block;
    fake_tail.prev_free = block;

    ++num_free_blocks;
}
|
|
||||||
|
|
||||||
// Links every block of `blocks`, in order, at the back of the free list.
// Does nothing for an empty vector.
void FreeBlockQueue::append_n(const std::vector<KVCacheBlock*>& blocks) {
    const size_t count = blocks.size();
    if (count == 0) {
        return;
    }

    KVCacheBlock* back = fake_tail.prev_free;
    for (size_t i = 0; i < count; ++i) {
        KVCacheBlock* const blk = blocks[i];
        back->next_free = blk;
        blk->prev_free  = back;
        back = blk;
    }

    // The last appended block becomes the new neighbour of the tail sentinel.
    back->next_free     = &fake_tail;
    fake_tail.prev_free = back;
    num_free_blocks += count;
}
|
|
||||||
|
|
||||||
// Links every block of `blocks`, in order, at the front of the free list, ahead of
// the blocks that were already there. Does nothing for an empty vector.
void FreeBlockQueue::prepend_n(const std::vector<KVCacheBlock*>& blocks) {
    const size_t count = blocks.size();
    if (count == 0) {
        return;
    }

    KVCacheBlock* const old_front = fake_head.next_free;
    KVCacheBlock* cursor = &fake_head;
    for (size_t i = 0; i < count; ++i) {
        KVCacheBlock* const blk = blocks[i];
        cursor->next_free = blk;
        blk->prev_free    = cursor;
        cursor = blk;
    }

    // Join the inserted run to what used to be the front.
    cursor->next_free    = old_front;
    old_front->prev_free = cursor;
    num_free_blocks += count;
}
|
|
||||||
|
|
||||||
std::vector<KVCacheBlock*> FreeBlockQueue::get_all_free_blocks() const {
|
|
||||||
std::vector<KVCacheBlock*> ret;
|
|
||||||
const KVCacheBlock* curr = fake_head.next_free;
|
|
||||||
while (curr && curr->next_free != nullptr) {
|
|
||||||
ret.push_back(const_cast<KVCacheBlock*>(curr));
|
|
||||||
curr = curr->next_free;
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// BlockPool (port of block_pool.py)
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
static std::vector<KVCacheBlock*> make_ptrs(std::vector<KVCacheBlock>& v) {
|
|
||||||
std::vector<KVCacheBlock*> p;
|
|
||||||
p.reserve(v.size());
|
|
||||||
for (auto& b : v) p.push_back(&b);
|
|
||||||
return p;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::vector<KVCacheBlock> make_block_vec(int32_t num_blocks) {
|
|
||||||
std::vector<KVCacheBlock> v;
|
|
||||||
v.reserve(num_blocks);
|
|
||||||
for (int32_t i = 0; i < num_blocks; ++i) v.emplace_back(i);
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Creates the pool: owns `num_blocks` descriptors, puts them all on the free list,
// then pops the first one and marks it as the null block.
BlockPool::BlockPool(int32_t num_blocks, bool enable_caching)
    : enable_caching_(enable_caching)
    , blocks_(make_block_vec(num_blocks))
    , ptrs_(make_ptrs(blocks_))
    , free_queue_(ptrs_)
{
    // vLLM reserves block_id 0 as the null block (never cached).
    KVCacheBlock* const reserved = free_queue_.popleft();
    reserved->is_null = true;
    null_block = reserved;
}
|
|
||||||
|
|
||||||
// Drops the cached-hash state of `block` because it is about to be reused.
//
// Returns true when the hash table entry for the block's hash pointed at this block
// and was erased, and false otherwise (no hash, or the table entry belongs to a
// different block).
//
// The block's own hash is cleared in both cases. The table keeps only the last writer
// per hash, so a block can carry a hash whose table entry was overwritten by a later
// block. Leaving has_hash set on such a block would hand it out with a stale hash, and
// cache_full_blocks() would then skip it and never register its new contents.
bool BlockPool::maybe_evict_cached_block(KVCacheBlock* block) {
    if (!block->has_hash) return false;

    auto it = cached_block_hash_to_block_.find(block->block_hash);
    const bool owns_entry = (it != cached_block_hash_to_block_.end()) && (it->second == block);
    if (owns_entry) {
        cached_block_hash_to_block_.erase(it);
    }

    block->reset_hash();
    return owns_entry;
}
|
|
||||||
|
|
||||||
std::vector<KVCacheBlock*> BlockPool::get_new_blocks(size_t n) {
|
|
||||||
if (n > get_num_free_blocks())
|
|
||||||
throw std::runtime_error("Cannot get free blocks from pool");
|
|
||||||
auto ret = free_queue_.popleft_n(n);
|
|
||||||
for (KVCacheBlock* b : ret) {
|
|
||||||
if (enable_caching_) maybe_evict_cached_block(b);
|
|
||||||
assert(b->ref_cnt == 0);
|
|
||||||
b->ref_cnt += 1;
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Looks up the block registered under `block_hash`; returns nullptr on a miss.
KVCacheBlock* BlockPool::get_cached_block(uint64_t block_hash) {
    const auto found = cached_block_hash_to_block_.find(block_hash);
    if (found == cached_block_hash_to_block_.end()) {
        return nullptr;
    }
    return found->second;
}
|
|
||||||
|
|
||||||
// Adds one reference to every block in `blocks`. A non-null block whose count was 0
// is first removed from the free list.
void BlockPool::touch(const std::vector<KVCacheBlock*>& blocks) {
    for (size_t i = 0; i < blocks.size(); ++i) {
        KVCacheBlock* const blk = blocks[i];
        // ref_cnt==0 means the block is a free-list eviction candidate; pull it out.
        const bool on_free_list = (blk->ref_cnt == 0) && !blk->is_null;
        if (on_free_list) {
            free_queue_.remove(blk);
        }
        ++blk->ref_cnt;
    }
}
|
|
||||||
|
|
||||||
// Drops one reference from every non-null block in `ordered_blocks`. Blocks whose
// count reaches 0 return to the free list: those without a hash go to the front,
// those with a hash go to the back.
void BlockPool::free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks) {
    std::vector<KVCacheBlock*> hashed;
    std::vector<KVCacheBlock*> plain;

    for (KVCacheBlock* blk : ordered_blocks) {
        if (blk->is_null) {
            continue;
        }
        --blk->ref_cnt;
        if (blk->ref_cnt != 0) {
            continue; // still referenced elsewhere
        }
        if (blk->has_hash) {
            hashed.push_back(blk);
        } else {
            plain.push_back(blk);
        }
    }

    free_queue_.prepend_n(plain);  // un-hashed: evicted first (front)
    free_queue_.append_n(hashed);  // hashed: kept warm (tail)
}
|
|
||||||
|
|
||||||
// Registers the blocks req_blocks[num_cached_blocks .. num_full_blocks) in the hash
// table under the matching entries of `block_hashes`. Blocks that already carry a
// hash are left untouched; a hash already present in the table is overwritten (last
// writer wins).
//
// The upper bound is clamped to the sizes of `req_blocks` and `block_hashes`, so a
// caller passing a larger `num_full_blocks` registers only the blocks that exist
// instead of reading past the end of either vector.
void BlockPool::cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
                                  size_t num_cached_blocks, size_t num_full_blocks,
                                  const std::vector<uint64_t>& block_hashes) {
    size_t end = num_full_blocks;
    if (end > req_blocks.size())   end = req_blocks.size();
    if (end > block_hashes.size()) end = block_hashes.size();

    for (size_t i = num_cached_blocks; i < end; ++i) {
        KVCacheBlock* blk = req_blocks[i];
        if (blk->has_hash) continue;
        blk->has_hash = true;
        blk->block_hash = block_hashes[i];
        cached_block_hash_to_block_[blk->block_hash] = blk;
    }
}
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// PagedKVManager (port of SingleTypeKVCacheManager / FullAttentionManager)
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
// Ceiling division: the number of size-`b` chunks needed to hold `a` items.
static inline size_t cdiv(size_t a, size_t b) {
    const size_t rounded_up = a + b - 1;
    return rounded_up / b;
}
|
|
||||||
|
|
||||||
// Creates a manager over a pool of `num_blocks` blocks of `block_size` tokens each.
// Throws std::invalid_argument when `block_size` is not positive: every block
// computation in this class divides by it, so a zero or negative size cannot work.
PagedKVManager::PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching)
    : block_size_(block_size > 0
                      ? block_size
                      : throw std::invalid_argument("PagedKVManager: block_size must be positive")),
      pool_(num_blocks, enable_caching) {}
|
|
||||||
|
|
||||||
bool PagedKVManager::allocate(int seq_id, size_t total_tokens) {
|
|
||||||
auto& req = req_to_blocks_[seq_id];
|
|
||||||
size_t need = cdiv(total_tokens, block_size_);
|
|
||||||
if (need <= req.size()) return true;
|
|
||||||
size_t add = need - req.size();
|
|
||||||
if (add > pool_.get_num_free_blocks()) return false; // OOM
|
|
||||||
auto nb = pool_.get_new_blocks(add);
|
|
||||||
req.insert(req.end(), nb.begin(), nb.end());
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<int32_t> PagedKVManager::block_table(int seq_id) const {
|
|
||||||
std::vector<int32_t> bt;
|
|
||||||
auto it = req_to_blocks_.find(seq_id);
|
|
||||||
if (it == req_to_blocks_.end()) return bt;
|
|
||||||
bt.reserve(it->second.size());
|
|
||||||
for (KVCacheBlock* b : it->second) bt.push_back(b->block_id);
|
|
||||||
return bt;
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t PagedKVManager::slot(int seq_id, int pos) const {
|
|
||||||
const auto& req = req_to_blocks_.at(seq_id);
|
|
||||||
int32_t phys = req[pos / block_size_]->block_id;
|
|
||||||
return (int64_t)phys * block_size_ + (pos % block_size_);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<int64_t> PagedKVManager::slot_mapping(int seq_id, const std::vector<int>& positions) const {
|
|
||||||
std::vector<int64_t> sm;
|
|
||||||
sm.reserve(positions.size());
|
|
||||||
for (int p : positions) sm.push_back(slot(seq_id, p));
|
|
||||||
return sm;
|
|
||||||
}
|
|
||||||
|
|
||||||
void PagedKVManager::free(int seq_id) {
|
|
||||||
auto it = req_to_blocks_.find(seq_id);
|
|
||||||
if (it == req_to_blocks_.end()) return;
|
|
||||||
// Free in reverse so the tail of the block chain is evicted first (vLLM order).
|
|
||||||
std::vector<KVCacheBlock*> ordered(it->second.rbegin(), it->second.rend());
|
|
||||||
pool_.free_blocks(ordered);
|
|
||||||
req_to_blocks_.erase(it);
|
|
||||||
}
|
|
||||||
|
|
||||||
// FNV-1a chained block hash. Deterministic and prefix-sensitive; folds the parent
|
|
||||||
// hash into the seed so each block hash transitively encodes its whole prefix
|
|
||||||
// (behavioral parity with vLLM hash_block_tokens chaining; vLLM uses sha256 bytes).
|
|
||||||
uint64_t PagedKVManager::hash_block(uint64_t parent_hash, const std::vector<int>& token_ids) {
|
|
||||||
uint64_t h = 1469598103934665603ull ^ parent_hash;
|
|
||||||
for (int t : token_ids) {
|
|
||||||
h ^= (uint64_t)(uint32_t)t;
|
|
||||||
h *= 1099511628211ull;
|
|
||||||
}
|
|
||||||
if (h == 0) h = 0x9e3779b97f4a7c15ull; // never 0 (0 reads as "no hash")
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<uint64_t> PagedKVManager::compute_block_hashes(const std::vector<int>& token_ids) const {
|
|
||||||
std::vector<uint64_t> hashes;
|
|
||||||
uint64_t parent = 0; // NONE_HASH analogue
|
|
||||||
size_t n_full = token_ids.size() / block_size_;
|
|
||||||
for (size_t i = 0; i < n_full; ++i) {
|
|
||||||
std::vector<int> blk(token_ids.begin() + i * block_size_,
|
|
||||||
token_ids.begin() + (i + 1) * block_size_);
|
|
||||||
parent = hash_block(parent, blk);
|
|
||||||
hashes.push_back(parent);
|
|
||||||
}
|
|
||||||
return hashes;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Finds the longest cached prefix of `block_hashes`, takes a reference on each hit
// block, and returns the number of tokens the hits cover (hits * block_size).
// NOTE(review): the hit blocks are not recorded against any sequence here, so nothing
// visible in this file releases the references taken by touch() - confirm that the
// caller accounts for them.
size_t PagedKVManager::get_computed_blocks(const std::vector<uint64_t>& block_hashes) {
    std::vector<KVCacheBlock*> prefix_hits;

    // stop at first miss (prefix property)
    for (auto it = block_hashes.begin(); it != block_hashes.end(); ++it) {
        KVCacheBlock* const cached = pool_.get_cached_block(*it);
        if (cached == nullptr) {
            break;
        }
        prefix_hits.push_back(cached);
    }

    pool_.touch(prefix_hits); // ++ref_cnt, pull from free list
    const size_t tokens_per_block = (size_t)block_size_;
    return prefix_hits.size() * tokens_per_block;
}
|
|
||||||
|
|
||||||
void PagedKVManager::cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens) {
|
|
||||||
auto& req = req_to_blocks_[seq_id];
|
|
||||||
size_t n_full = num_tokens / block_size_;
|
|
||||||
pool_.cache_full_blocks(req, /*num_cached=*/0, n_full, block_hashes);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace paged
|
|
||||||
@@ -1,108 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
// Paged KV cache block manager for llama.cpp (CPU-first prototype).
|
|
||||||
//
|
|
||||||
// Host-side block management is a faithful port of vLLM V1:
|
|
||||||
// vllm/v1/core/kv_cache_utils.py (KVCacheBlock, FreeKVCacheBlockQueue, hash_block_tokens)
|
|
||||||
// vllm/v1/core/block_pool.py (BlockPool: get_new_blocks/touch/free/evict/cache_full_blocks)
|
|
||||||
// vllm/v1/core/single_type_kv_cache_manager.py (allocate_new_blocks, find_longest_cache_hit)
|
|
||||||
//
|
|
||||||
// Parity is on behavior/algorithm (block chaining, first-miss stop, ref-counting,
|
|
||||||
// LRU eviction order), not on exact hash bytes. This unit has zero ggml/llama.cpp
|
|
||||||
// dependency so it can be unit-tested in isolation.
|
|
||||||
|
|
||||||
#include <cstdint>
|
|
||||||
#include <vector>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <map>
|
|
||||||
|
|
||||||
namespace paged {
|
|
||||||
|
|
||||||
// Descriptor for one KV-cache block (counterpart of vLLM's KVCacheBlock in
// kv_cache_utils.py).
struct KVCacheBlock {
    int32_t       block_id{0};         // id of the block; set by the constructor
    int           ref_cnt{0};          // number of references currently held
    bool          has_hash{false};     // vLLM: _block_hash is set only when full+cached
    uint64_t      block_hash{0};       // hash value; meaningful only while has_hash is set
    bool          is_null{false};      // set on the pool's reserved null block
    KVCacheBlock* prev_free{nullptr};  // previous node in the free list, or null
    KVCacheBlock* next_free{nullptr};  // next node in the free list, or null

    // Creates a descriptor with the given id and all other fields at their defaults.
    explicit KVCacheBlock(int32_t id = 0) : block_id(id) {}

    // Clears the cached-hash state.
    void reset_hash() {
        block_hash = 0;
        has_hash   = false;
    }
};
|
|
||||||
|
|
||||||
// Intrusive doubly-linked free list with fake head/tail (vLLM FreeKVCacheBlockQueue).
|
|
||||||
// O(1) middle removal is required so touch() can pull a warm cached block out of the
|
|
||||||
// free list when a later request hits its prefix.
|
|
||||||
class FreeBlockQueue {
|
|
||||||
public:
|
|
||||||
size_t num_free_blocks = 0;
|
|
||||||
|
|
||||||
explicit FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks);
|
|
||||||
KVCacheBlock* popleft();
|
|
||||||
std::vector<KVCacheBlock*> popleft_n(size_t n);
|
|
||||||
void remove(KVCacheBlock* block);
|
|
||||||
void append(KVCacheBlock* block);
|
|
||||||
void append_n(const std::vector<KVCacheBlock*>& blocks);
|
|
||||||
void prepend_n(const std::vector<KVCacheBlock*>& blocks);
|
|
||||||
std::vector<KVCacheBlock*> get_all_free_blocks() const;
|
|
||||||
|
|
||||||
private:
|
|
||||||
KVCacheBlock fake_head{-1};
|
|
||||||
KVCacheBlock fake_tail{-1};
|
|
||||||
};
|
|
||||||
|
|
||||||
// vLLM BlockPool (block_pool.py).
|
|
||||||
class BlockPool {
|
|
||||||
public:
|
|
||||||
KVCacheBlock* null_block = nullptr;
|
|
||||||
|
|
||||||
BlockPool(int32_t num_blocks, bool enable_caching);
|
|
||||||
std::vector<KVCacheBlock*> get_new_blocks(size_t n);
|
|
||||||
KVCacheBlock* get_cached_block(uint64_t block_hash);
|
|
||||||
void touch(const std::vector<KVCacheBlock*>& blocks);
|
|
||||||
void free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks);
|
|
||||||
void cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
|
|
||||||
size_t num_cached_blocks, size_t num_full_blocks,
|
|
||||||
const std::vector<uint64_t>& block_hashes);
|
|
||||||
size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
bool maybe_evict_cached_block(KVCacheBlock* block);
|
|
||||||
|
|
||||||
bool enable_caching_;
|
|
||||||
std::vector<KVCacheBlock> blocks_; // owns all block descriptors
|
|
||||||
std::vector<KVCacheBlock*> ptrs_;
|
|
||||||
FreeBlockQueue free_queue_;
|
|
||||||
// vLLM stores hash -> {block_id: block} to allow duplicate-content blocks; the
|
|
||||||
// prototype keeps the last writer (single KV-cache group is sufficient for the wins).
|
|
||||||
std::unordered_map<uint64_t, KVCacheBlock*> cached_block_hash_to_block_;
|
|
||||||
};
|
|
||||||
|
|
||||||
// Allocation + prefix-caching surface, ported from SingleTypeKVCacheManager /
|
|
||||||
// FullAttentionManager. Single KV-cache group; no extra_keys / eagle / spec-decode.
|
|
||||||
class PagedKVManager {
|
|
||||||
public:
|
|
||||||
PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching);
|
|
||||||
|
|
||||||
// Grow seq_id to cover total_tokens slots. Returns false on OOM (free queue empty).
|
|
||||||
bool allocate(int seq_id, size_t total_tokens);
|
|
||||||
std::vector<int32_t> block_table(int seq_id) const;
|
|
||||||
int64_t slot(int seq_id, int pos) const;
|
|
||||||
std::vector<int64_t> slot_mapping(int seq_id, const std::vector<int>& positions) const;
|
|
||||||
void free(int seq_id);
|
|
||||||
int block_size() const { return block_size_; }
|
|
||||||
|
|
||||||
// Prefix caching (win 3).
|
|
||||||
static uint64_t hash_block(uint64_t parent_hash, const std::vector<int>& token_ids);
|
|
||||||
std::vector<uint64_t> compute_block_hashes(const std::vector<int>& token_ids) const;
|
|
||||||
size_t get_computed_blocks(const std::vector<uint64_t>& block_hashes); // returns num cached tokens
|
|
||||||
void cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens);
|
|
||||||
|
|
||||||
protected:
|
|
||||||
int block_size_;
|
|
||||||
BlockPool pool_;
|
|
||||||
std::map<int, std::vector<KVCacheBlock*>> req_to_blocks_;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace paged
|
|
||||||
@@ -1,59 +0,0 @@
|
|||||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
|
||||||
index a49a055a6..d95102bbd 100644
|
|
||||||
--- a/src/llama-kv-cache.cpp
|
|
||||||
+++ b/src/llama-kv-cache.cpp
|
|
||||||
@@ -11,6 +11,8 @@
|
|
||||||
#include <cstring>
|
|
||||||
#include <limits>
|
|
||||||
#include <map>
|
|
||||||
+#include <numeric>
|
|
||||||
+#include <cstdlib>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
static bool ggml_is_power_of_2(int n) {
|
|
||||||
@@ -931,6 +933,45 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
|
||||||
return { };
|
|
||||||
}
|
|
||||||
|
|
||||||
+ // [paged, experimental] Place this sequence's tokens at permuted,
|
|
||||||
+ // non-contiguous fixed-size BLOCK positions instead of a contiguous run.
|
|
||||||
+ // This validates that attention is invariant to physical KV placement -
|
|
||||||
+ // the correctness premise of paged attention. Enabled via LLAMA_KV_PAGED.
|
|
||||||
+ // Single-sequence scope (uses get_used() as the logical base); falls back
|
|
||||||
+ // to the normal allocator if the permuted cells aren't available.
|
|
||||||
+ static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
|
||||||
+ if (paged_mode) {
|
|
||||||
+ const uint32_t bs = 16; // block size (tokens/block)
|
|
||||||
+ const uint32_t nblk = cells.size() / bs; // blocks in this stream's pool
|
|
||||||
+ if (nblk >= 2) {
|
|
||||||
+ // stride coprime to nblk => block-index permutation is a bijection
|
|
||||||
+ uint32_t k = 1;
|
|
||||||
+ for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) {
|
|
||||||
+ if (std::gcd(cand, nblk) == 1u) { k = cand; break; }
|
|
||||||
+ }
|
|
||||||
+ const uint32_t base = cells.get_used();
|
|
||||||
+ bool ok = true;
|
|
||||||
+ for (uint32_t i = 0; i < n_tokens; ++i) {
|
|
||||||
+ const uint32_t L = base + i;
|
|
||||||
+ const uint32_t b = L / bs;
|
|
||||||
+ const uint32_t off = L % bs;
|
|
||||||
+ if (b >= nblk) { ok = false; break; }
|
|
||||||
+ const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block
|
|
||||||
+ if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; }
|
|
||||||
+ res.idxs[s].push_back(phys);
|
|
||||||
+ }
|
|
||||||
+ if (ok && res.idxs[s].size() == n_tokens) {
|
|
||||||
+ if (std::getenv("LLAMA_KV_PAGED_DEBUG")) {
|
|
||||||
+ fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens);
|
|
||||||
+ for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]);
|
|
||||||
+ fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base);
|
|
||||||
+ }
|
|
||||||
+ continue; // paged placement succeeded for this sequence
|
|
||||||
+ }
|
|
||||||
+ res.idxs[s].clear(); // fall back to the normal allocator
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
uint32_t n_tested = 0;
|
|
||||||
|
|
||||||
// for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
diff --git a/tests/test-paged-kv-e2e.cpp b/tests/test-paged-kv-e2e.cpp
|
|
||||||
index 5a352e3..06ead50 100644
|
|
||||||
--- a/tests/test-paged-kv-e2e.cpp
|
|
||||||
+++ b/tests/test-paged-kv-e2e.cpp
|
|
||||||
@@ -115,6 +115,7 @@ static path_result run_paged(const std::string & model_path) {
|
|
||||||
params.sampling.temp = 0.0f; // greedy
|
|
||||||
params.warmup = false;
|
|
||||||
params.kv_paged = true;
|
|
||||||
+ params.fit_params = false; // honor explicit n_gpu_blocks; GB10 dev_memory over-reports free VRAM
|
|
||||||
params.n_gpu_blocks = 64;
|
|
||||||
params.n_cpu_blocks = 16;
|
|
||||||
params.n_sequences = 1;
|
|
||||||
@@ -1,42 +0,0 @@
|
|||||||
#include "../paged_kv_manager.h"
|
|
||||||
#include <cassert>
|
|
||||||
#include <cstdio>
|
|
||||||
using namespace paged;
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
BlockPool pool(/*num_blocks=*/8, /*enable_caching=*/true);
|
|
||||||
// block 0 is reserved as null_block (vLLM pops one at init)
|
|
||||||
assert(pool.null_block != nullptr && pool.null_block->block_id == 0);
|
|
||||||
assert(pool.get_num_free_blocks() == 7);
|
|
||||||
|
|
||||||
// get_new_blocks sets ref_cnt=1 and removes from free list
|
|
||||||
auto b = pool.get_new_blocks(2);
|
|
||||||
assert(b.size() == 2 && b[0]->ref_cnt == 1 && b[1]->ref_cnt == 1);
|
|
||||||
assert(pool.get_num_free_blocks() == 5);
|
|
||||||
|
|
||||||
// cache two full blocks with chained hashes, then look them up
|
|
||||||
std::vector<uint64_t> hashes = {1111, 2222};
|
|
||||||
pool.cache_full_blocks(b, /*num_cached=*/0, /*num_full=*/2, hashes);
|
|
||||||
assert(b[0]->has_hash && b[0]->block_hash == 1111);
|
|
||||||
assert(pool.get_cached_block(1111) == b[0]);
|
|
||||||
assert(pool.get_cached_block(2222) == b[1]);
|
|
||||||
assert(pool.get_cached_block(9999) == nullptr);
|
|
||||||
|
|
||||||
// free: hashed blocks go to tail (kept warm), so they remain queryable.
|
|
||||||
pool.free_blocks(b);
|
|
||||||
assert(b[0]->ref_cnt == 0);
|
|
||||||
assert(pool.get_num_free_blocks() == 7);
|
|
||||||
assert(pool.get_cached_block(1111) == b[0]); // still cached/warm
|
|
||||||
|
|
||||||
// touch a warm cached block: pulls it out of free list, ++ref_cnt
|
|
||||||
pool.touch({b[0]});
|
|
||||||
assert(b[0]->ref_cnt == 1);
|
|
||||||
assert(pool.get_num_free_blocks() == 6);
|
|
||||||
|
|
||||||
// exhausting the pool then allocating evicts a warm cached hash
|
|
||||||
auto rest = pool.get_new_blocks(pool.get_num_free_blocks());
|
|
||||||
(void) rest;
|
|
||||||
assert(pool.get_cached_block(2222) == nullptr); // evicted on reuse
|
|
||||||
printf("test_block_pool: OK\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@@ -1,44 +0,0 @@
|
|||||||
#include "../paged_kv_manager.h"
|
|
||||||
#include <cassert>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
using namespace paged;
|
|
||||||
|
|
||||||
static std::vector<KVCacheBlock> make_blocks(int n) {
|
|
||||||
std::vector<KVCacheBlock> v;
|
|
||||||
v.reserve(n);
|
|
||||||
for (int i = 0; i < n; ++i) v.push_back(KVCacheBlock{i});
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
// ordered 0..9 at init; popleft yields ascending block_ids
|
|
||||||
auto blocks = make_blocks(10);
|
|
||||||
std::vector<KVCacheBlock*> ptrs;
|
|
||||||
for (auto& b : blocks) ptrs.push_back(&b);
|
|
||||||
FreeBlockQueue q(ptrs);
|
|
||||||
assert(q.num_free_blocks == 10);
|
|
||||||
|
|
||||||
KVCacheBlock* b0 = q.popleft();
|
|
||||||
assert(b0->block_id == 0);
|
|
||||||
assert(q.num_free_blocks == 9);
|
|
||||||
|
|
||||||
auto two = q.popleft_n(2); // {1,2}
|
|
||||||
assert(two.size() == 2 && two[0]->block_id == 1 && two[1]->block_id == 2);
|
|
||||||
assert(q.num_free_blocks == 7);
|
|
||||||
|
|
||||||
// O(1) middle removal: remove block 5 (currently free), count drops
|
|
||||||
q.remove(ptrs[5]);
|
|
||||||
assert(q.num_free_blocks == 6); // free: 3,4,6,7,8,9
|
|
||||||
|
|
||||||
// append puts a block at the tail; it comes back out only after the rest
|
|
||||||
q.append(b0); // free order now: 3,4,6,7,8,9,0
|
|
||||||
assert(q.num_free_blocks == 7);
|
|
||||||
auto all = q.get_all_free_blocks();
|
|
||||||
assert(all.front()->block_id == 3);
|
|
||||||
assert(all.back()->block_id == 0);
|
|
||||||
|
|
||||||
printf("test_free_block_queue: OK\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@@ -1,133 +0,0 @@
|
|||||||
// Phase 2 (core numeric de-risk): attention over GATHERED paged KV must equal
|
|
||||||
// an independent host-computed reference.
|
|
||||||
//
|
|
||||||
// This answers the central risk in the design: feeding gather-to-scratch KV
|
|
||||||
// (a sequence whose blocks are non-contiguous in the shared pool) into ggml's
|
|
||||||
// standard attention ops (mul_mat -> soft_max_ext -> mul_mat) produces correct
|
|
||||||
// attention. If this holds, the paged read path is numerically sound; the
|
|
||||||
// remaining work is wiring it into llama-graph.cpp (Gate 0 in a real model).
|
|
||||||
|
|
||||||
#include "../paged_kv_manager.h"
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-cpu.h"
|
|
||||||
#include "ggml-alloc.h"
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cmath>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
using namespace paged;
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
const int d = 8; // head dim
|
|
||||||
const int n_kv = 48; // 3 blocks worth of KV tokens
|
|
||||||
const int n_q = 4; // query tokens
|
|
||||||
const int block_size = 16;
|
|
||||||
const int num_blocks = 8;
|
|
||||||
const int total_slots = block_size * num_blocks;
|
|
||||||
const float scale = 1.0f / std::sqrt((float) d);
|
|
||||||
|
|
||||||
// Non-contiguous physical layout for the KV sequence (blocks [2,1,5]).
|
|
||||||
PagedKVManager m(num_blocks, block_size, /*enable_caching=*/false);
|
|
||||||
assert(m.allocate(0, 2 * block_size));
|
|
||||||
assert(m.allocate(1, 2 * block_size));
|
|
||||||
m.free(0);
|
|
||||||
assert(m.allocate(2, n_kv));
|
|
||||||
std::vector<int> positions(n_kv);
|
|
||||||
for (int i = 0; i < n_kv; ++i) positions[i] = i;
|
|
||||||
auto slots64 = m.slot_mapping(2, positions);
|
|
||||||
std::vector<int32_t> slots32(slots64.begin(), slots64.end());
|
|
||||||
|
|
||||||
// Deterministic K, V, Q in logical [d, n] layout (column-major: col = token).
|
|
||||||
std::vector<float> K(d * n_kv), V(d * n_kv), Q(d * n_q);
|
|
||||||
for (int t = 0; t < n_kv; ++t)
|
|
||||||
for (int e = 0; e < d; ++e) {
|
|
||||||
K[t * d + e] = std::sin(0.1f * t + 0.3f * e);
|
|
||||||
V[t * d + e] = std::cos(0.2f * t - 0.1f * e);
|
|
||||||
}
|
|
||||||
for (int q = 0; q < n_q; ++q)
|
|
||||||
for (int e = 0; e < d; ++e) Q[q * d + e] = std::sin(0.05f * q + 0.7f * e);
|
|
||||||
|
|
||||||
// ---- Independent host reference attention -------------------------------
|
|
||||||
std::vector<float> ref(d * n_q, 0.0f);
|
|
||||||
for (int q = 0; q < n_q; ++q) {
|
|
||||||
std::vector<float> score(n_kv);
|
|
||||||
float mx = -1e30f;
|
|
||||||
for (int t = 0; t < n_kv; ++t) {
|
|
||||||
float dot = 0.0f;
|
|
||||||
for (int e = 0; e < d; ++e) dot += K[t * d + e] * Q[q * d + e];
|
|
||||||
score[t] = dot * scale;
|
|
||||||
mx = std::fmax(mx, score[t]);
|
|
||||||
}
|
|
||||||
float sum = 0.0f;
|
|
||||||
for (int t = 0; t < n_kv; ++t) { score[t] = std::exp(score[t] - mx); sum += score[t]; }
|
|
||||||
for (int t = 0; t < n_kv; ++t) {
|
|
||||||
float p = score[t] / sum;
|
|
||||||
for (int e = 0; e < d; ++e) ref[q * d + e] += p * V[t * d + e];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---- ggml paged path ----------------------------------------------------
|
|
||||||
ggml_backend_t backend = ggml_backend_cpu_init();
|
|
||||||
struct ggml_init_params dp = { ggml_tensor_overhead() * 16, NULL, true };
|
|
||||||
struct ggml_context * ctx_data = ggml_init(dp);
|
|
||||||
|
|
||||||
struct ggml_tensor * poolK = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, total_slots);
|
|
||||||
struct ggml_tensor * poolV = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, total_slots);
|
|
||||||
struct ggml_tensor * kSrc = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_kv);
|
|
||||||
struct ggml_tensor * vSrc = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_kv);
|
|
||||||
struct ggml_tensor * qT = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, d, n_q);
|
|
||||||
struct ggml_tensor * wIdx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I64, n_kv);
|
|
||||||
struct ggml_tensor * gIdx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I32, n_kv);
|
|
||||||
|
|
||||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_data, backend);
|
|
||||||
std::vector<float> zeros(d * total_slots, 0.0f);
|
|
||||||
ggml_backend_tensor_set(poolK, zeros.data(), 0, ggml_nbytes(poolK));
|
|
||||||
ggml_backend_tensor_set(poolV, zeros.data(), 0, ggml_nbytes(poolV));
|
|
||||||
ggml_backend_tensor_set(kSrc, K.data(), 0, ggml_nbytes(kSrc));
|
|
||||||
ggml_backend_tensor_set(vSrc, V.data(), 0, ggml_nbytes(vSrc));
|
|
||||||
ggml_backend_tensor_set(qT, Q.data(), 0, ggml_nbytes(qT));
|
|
||||||
ggml_backend_tensor_set(wIdx, slots64.data(), 0, ggml_nbytes(wIdx));
|
|
||||||
ggml_backend_tensor_set(gIdx, slots32.data(), 0, ggml_nbytes(gIdx));
|
|
||||||
|
|
||||||
struct ggml_init_params cp = { ggml_tensor_overhead() * 64 + ggml_graph_overhead(), NULL, true };
|
|
||||||
struct ggml_context * ctx = ggml_init(cp);
|
|
||||||
|
|
||||||
struct ggml_tensor * wroteK = ggml_set_rows(ctx, poolK, kSrc, wIdx);
|
|
||||||
struct ggml_tensor * wroteV = ggml_set_rows(ctx, poolV, vSrc, wIdx);
|
|
||||||
struct ggml_tensor * gK = ggml_get_rows(ctx, wroteK, gIdx); // [d, n_kv]
|
|
||||||
struct ggml_tensor * gV = ggml_get_rows(ctx, wroteV, gIdx); // [d, n_kv]
|
|
||||||
|
|
||||||
struct ggml_tensor * kq = ggml_mul_mat(ctx, gK, qT); // [n_kv, n_q]
|
|
||||||
struct ggml_tensor * probs = ggml_soft_max_ext(ctx, kq, NULL, scale, 0.0f);
|
|
||||||
struct ggml_tensor * vT = ggml_cont(ctx, ggml_transpose(ctx, gV)); // [n_kv, d]
|
|
||||||
struct ggml_tensor * out = ggml_mul_mat(ctx, vT, probs); // [d, n_q]
|
|
||||||
ggml_set_output(out);
|
|
||||||
|
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
|
||||||
ggml_build_forward_expand(gf, out);
|
|
||||||
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
|
|
||||||
assert(ggml_gallocr_alloc_graph(galloc, gf));
|
|
||||||
assert(ggml_backend_graph_compute(backend, gf) == GGML_STATUS_SUCCESS);
|
|
||||||
|
|
||||||
std::vector<float> got(d * n_q);
|
|
||||||
ggml_backend_tensor_get(out, got.data(), 0, ggml_nbytes(out));
|
|
||||||
|
|
||||||
// ---- compare ------------------------------------------------------------
|
|
||||||
double max_err = 0.0;
|
|
||||||
for (int i = 0; i < d * n_q; ++i) max_err = std::fmax(max_err, std::fabs(got[i] - ref[i]));
|
|
||||||
printf("paged attention max abs err vs host reference: %.3e\n", max_err);
|
|
||||||
assert(max_err < 1e-4 && "paged-gathered attention must match host reference");
|
|
||||||
|
|
||||||
ggml_gallocr_free(galloc);
|
|
||||||
ggml_free(ctx);
|
|
||||||
ggml_free(ctx_data);
|
|
||||||
ggml_backend_buffer_free(buf);
|
|
||||||
ggml_backend_free(backend);
|
|
||||||
|
|
||||||
printf("test_ggml_paged_attn: OK (attention over non-contiguous paged KV matches reference)\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@@ -1,142 +0,0 @@
|
|||||||
// Phase 1 integration test: prove the paged KV write+read MECHANISM at the
|
|
||||||
// ggml-op level, driven by PagedKVManager.
|
|
||||||
//
|
|
||||||
// write: ggml_set_rows(pool, k_src, slot_mapping) // scatter by slot
|
|
||||||
// read: ggml_get_rows(pool, gather_idx) // gather seq's slots
|
|
||||||
//
|
|
||||||
// The decisive property: a sequence's physical blocks are NON-CONTIGUOUS and
|
|
||||||
// OUT-OF-ORDER (forced via allocate/free/reallocate), yet gather(write(x)) == x,
|
|
||||||
// and a second sequence written into disjoint blocks does not contaminate it.
|
|
||||||
// This is exactly how a paged read path feeds contiguous scratch to attention.
|
|
||||||
|
|
||||||
#include "../paged_kv_manager.h"
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
#include "ggml-cpu.h"
|
|
||||||
#include "ggml-alloc.h"
|
|
||||||
#include "ggml-backend.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <cmath>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
using namespace paged;
|
|
||||||
|
|
||||||
int main() {
|
|
||||||
const int n_embd = 8;
|
|
||||||
const int block_size = 16;
|
|
||||||
const int num_blocks = 8; // block 0 reserved as null
|
|
||||||
const int total_slots = block_size * num_blocks; // 128
|
|
||||||
|
|
||||||
// --- Force a non-contiguous, out-of-order block layout for seqC ----------
|
|
||||||
PagedKVManager m(num_blocks, block_size, /*enable_caching=*/false);
|
|
||||||
assert(m.allocate(/*seqA=*/0, 2 * block_size)); // blocks {1,2}
|
|
||||||
assert(m.allocate(/*seqB=*/1, 2 * block_size)); // blocks {3,4}
|
|
||||||
m.free(0); // returns {1,2} to free list
|
|
||||||
assert(m.allocate(/*seqC=*/2, 3 * block_size)); // reuses freed blocks, reordered
|
|
||||||
|
|
||||||
auto btC = m.block_table(2);
|
|
||||||
auto btB = m.block_table(1);
|
|
||||||
printf("seqC block_table = [");
|
|
||||||
for (size_t i = 0; i < btC.size(); ++i) printf("%s%d", i ? "," : "", btC[i]);
|
|
||||||
printf("]\n");
|
|
||||||
assert(btC.size() == 3);
|
|
||||||
// sanity: seqC and seqB occupy disjoint physical blocks
|
|
||||||
for (int cb : btC) for (int bb : btB) assert(cb != bb);
|
|
||||||
|
|
||||||
const int n_tokens = 3 * block_size; // 48 tokens for seqC
|
|
||||||
|
|
||||||
// slot_mapping for seqC positions 0..n_tokens-1
|
|
||||||
std::vector<int> positions(n_tokens);
|
|
||||||
for (int i = 0; i < n_tokens; ++i) positions[i] = i;
|
|
||||||
std::vector<int64_t> slots64 = m.slot_mapping(2, positions); // I64 for set_rows
|
|
||||||
std::vector<int32_t> slots32(slots64.begin(), slots64.end()); // I32 for get_rows
|
|
||||||
|
|
||||||
// seqB occupies different blocks; write a sentinel there to prove isolation.
|
|
||||||
std::vector<int> posB(2 * block_size);
|
|
||||||
for (size_t i = 0; i < posB.size(); ++i) posB[i] = (int) i;
|
|
||||||
std::vector<int64_t> slotsB64 = m.slot_mapping(1, posB);
|
|
||||||
|
|
||||||
// --- ggml backend + persistent (statically allocated) tensors ------------
|
|
||||||
ggml_backend_t backend = ggml_backend_cpu_init();
|
|
||||||
assert(backend);
|
|
||||||
|
|
||||||
struct ggml_init_params dp = { /*mem_size=*/ ggml_tensor_overhead() * 16,
|
|
||||||
/*mem_buffer=*/ NULL, /*no_alloc=*/ true };
|
|
||||||
struct ggml_context * ctx_data = ggml_init(dp);
|
|
||||||
|
|
||||||
// The shared paged KV pool: one flat block pool, exactly like a paged layer.
|
|
||||||
struct ggml_tensor * pool = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, n_embd, total_slots);
|
|
||||||
struct ggml_tensor * k_src = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, n_embd, n_tokens);
|
|
||||||
struct ggml_tensor * w_idx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I64, n_tokens);
|
|
||||||
struct ggml_tensor * g_idx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I32, n_tokens);
|
|
||||||
struct ggml_tensor * kB_src = ggml_new_tensor_2d(ctx_data, GGML_TYPE_F32, n_embd, (int) posB.size());
|
|
||||||
struct ggml_tensor * wB_idx = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I64, (int) posB.size());
|
|
||||||
|
|
||||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx_data, backend);
|
|
||||||
assert(buf);
|
|
||||||
|
|
||||||
// pool starts zeroed
|
|
||||||
std::vector<float> zeros(n_embd * total_slots, 0.0f);
|
|
||||||
ggml_backend_tensor_set(pool, zeros.data(), 0, ggml_nbytes(pool));
|
|
||||||
|
|
||||||
// token t carries the value (float) t in every embedding lane -> easy to verify
|
|
||||||
std::vector<float> ksrc(n_embd * n_tokens);
|
|
||||||
for (int t = 0; t < n_tokens; ++t)
|
|
||||||
for (int e = 0; e < n_embd; ++e) ksrc[t * n_embd + e] = (float) t;
|
|
||||||
ggml_backend_tensor_set(k_src, ksrc.data(), 0, ggml_nbytes(k_src));
|
|
||||||
ggml_backend_tensor_set(w_idx, slots64.data(), 0, ggml_nbytes(w_idx));
|
|
||||||
ggml_backend_tensor_set(g_idx, slots32.data(), 0, ggml_nbytes(g_idx));
|
|
||||||
|
|
||||||
// seqB sentinel = 999 everywhere
|
|
||||||
std::vector<float> kBsrc(n_embd * posB.size(), 999.0f);
|
|
||||||
ggml_backend_tensor_set(kB_src, kBsrc.data(), 0, ggml_nbytes(kB_src));
|
|
||||||
ggml_backend_tensor_set(wB_idx, slotsB64.data(), 0, ggml_nbytes(wB_idx));
|
|
||||||
|
|
||||||
// --- compute graph: write seqB, write seqC, then gather seqC -------------
|
|
||||||
struct ggml_init_params cp = { /*mem_size=*/ ggml_tensor_overhead() * 32 + ggml_graph_overhead(),
|
|
||||||
/*mem_buffer=*/ NULL, /*no_alloc=*/ true };
|
|
||||||
struct ggml_context * ctx = ggml_init(cp);
|
|
||||||
|
|
||||||
struct ggml_tensor * wroteB = ggml_set_rows(ctx, pool, kB_src, wB_idx); // view(pool)
|
|
||||||
struct ggml_tensor * wroteC = ggml_set_rows(ctx, wroteB, k_src, w_idx); // chain so order is fixed
|
|
||||||
struct ggml_tensor * gathered = ggml_get_rows(ctx, wroteC, g_idx);
|
|
||||||
ggml_set_output(gathered);
|
|
||||||
|
|
||||||
struct ggml_cgraph * gf = ggml_new_graph(ctx);
|
|
||||||
ggml_build_forward_expand(gf, gathered);
|
|
||||||
|
|
||||||
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
|
|
||||||
assert(ggml_gallocr_alloc_graph(galloc, gf));
|
|
||||||
|
|
||||||
assert(ggml_backend_graph_compute(backend, gf) == GGML_STATUS_SUCCESS);
|
|
||||||
|
|
||||||
// --- verify gather(write(x)) == x for the non-contiguous sequence --------
|
|
||||||
std::vector<float> out(n_embd * n_tokens);
|
|
||||||
ggml_backend_tensor_get(gathered, out.data(), 0, ggml_nbytes(gathered));
|
|
||||||
|
|
||||||
int mism = 0;
|
|
||||||
for (int t = 0; t < n_tokens; ++t)
|
|
||||||
for (int e = 0; e < n_embd; ++e)
|
|
||||||
if (std::fabs(out[t * n_embd + e] - (float) t) > 1e-6f) mism++;
|
|
||||||
assert(mism == 0 && "gathered paged KV must equal source (round-trip)");
|
|
||||||
|
|
||||||
// --- verify isolation: read seqC slots directly from pool, unaffected by seqB
|
|
||||||
std::vector<float> pool_host(n_embd * total_slots);
|
|
||||||
ggml_backend_tensor_get(pool, pool_host.data(), 0, ggml_nbytes(pool));
|
|
||||||
for (int t = 0; t < n_tokens; ++t) {
|
|
||||||
int slot = (int) slots64[t];
|
|
||||||
for (int e = 0; e < n_embd; ++e)
|
|
||||||
assert(std::fabs(pool_host[slot * n_embd + e] - (float) t) < 1e-6f);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_gallocr_free(galloc);
|
|
||||||
ggml_free(ctx);
|
|
||||||
ggml_free(ctx_data);
|
|
||||||
ggml_backend_buffer_free(buf);
|
|
||||||
ggml_backend_free(backend);
|
|
||||||
|
|
||||||
printf("test_ggml_paged_rw: OK (non-contiguous paged write/gather round-trip)\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
#include "../paged_kv_manager.h"
|
|
||||||
#include <cassert>
|
|
||||||
#include <cstdio>
|
|
||||||
using namespace paged;
|
|
||||||
|
|
||||||
// Exercises PagedKVManager allocation: block count for a token count, slot
// arithmetic, growing an existing sequence, out-of-memory refusal, and reuse
// of blocks after free().
//
// allocate() mutates the manager, so it is always called OUTSIDE assert():
// assert() expands to nothing under -DNDEBUG, which would otherwise leave the
// later queries running against a manager that never allocated anything.
int main() {
    PagedKVManager m(/*num_blocks=*/8, /*block_size=*/16, /*enable_caching=*/false);

    // 20 tokens -> ceil(20/16)=2 blocks
    const bool ok_initial = m.allocate(/*seq=*/0, 20);
    assert(ok_initial);
    auto bt = m.block_table(0);
    assert(bt.size() == 2);

    // slot arithmetic: pos 0 -> block bt[0]*16 + 0 ; pos 17 -> bt[1]*16 + 1
    assert(m.slot(0, 0) == (int64_t)bt[0] * 16 + 0);
    assert(m.slot(0, 17) == (int64_t)bt[1] * 16 + 1);

    auto sm = m.slot_mapping(0, {0, 16, 17});
    assert(sm.size() == 3 && sm[1] == (int64_t)bt[1] * 16 + 0);

    // growing the same seq reuses existing blocks, adds only new ones
    const bool ok_grow = m.allocate(0, 40);    // ceil(40/16)=3 -> +1 block
    assert(ok_grow);
    assert(m.block_table(0).size() == 3);

    // OOM: blocks left = 8 - 1(null) - 3 = 4 blocks; ask for 5 blocks
    const bool ok_oversized = m.allocate(1, 5 * 16);
    assert(ok_oversized == false);

    // free returns blocks to the pool for reuse
    m.free(0);
    const bool ok_after_free = m.allocate(1, 5 * 16);    // now fits
    assert(ok_after_free);

    // silence unused-variable warnings when asserts are compiled out
    (void) ok_initial; (void) ok_grow; (void) ok_oversized; (void) ok_after_free;

    printf("test_paged_kv_manager: OK\n");
    return 0;
}
|
|
||||||
@@ -1,35 +0,0 @@
|
|||||||
#include "../paged_kv_manager.h"
|
|
||||||
#include <cassert>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <vector>
|
|
||||||
using namespace paged;
|
|
||||||
|
|
||||||
// Exercises the prefix-cache path of PagedKVManager: block hashes are
// deterministic and content-sensitive, a cold lookup misses, a lookup after
// cache_blocks() hits the full shared prefix, and a chain that diverges at the
// second block hits only the first.
//
// All manager calls are made OUTSIDE assert() and their results stored, so the
// sequence of calls is the same whether or not -DNDEBUG compiles the asserts
// out (allocate() in particular must run for cache_blocks() to have blocks).
int main() {
    PagedKVManager m(/*num_blocks=*/64, /*block_size=*/16, /*enable_caching=*/true);

    // shared prefix of 32 tokens (2 full blocks) + distinct suffix
    std::vector<int> shared(32);
    for (int i = 0; i < 32; ++i) shared[i] = 100 + i;

    // chained hashing is deterministic and prefix-sensitive
    auto h = m.compute_block_hashes(shared);
    assert(h.size() == 2);
    auto h2 = m.compute_block_hashes(shared);
    assert(h == h2);                            // deterministic
    std::vector<int> other = shared; other[0] = 999;
    auto h_other = m.compute_block_hashes(other);
    assert(h_other[0] != h[0]);                 // sensitive to content

    // seq 0: cold, no cache hit yet
    const auto cold_hit = m.get_computed_blocks(h);
    assert(cold_hit == 0);
    const bool ok_alloc = m.allocate(0, 32);
    assert(ok_alloc);
    m.cache_blocks(0, h, 32);

    // seq 1: warm — the 2 shared blocks are a cache hit (32 tokens)
    const auto warm_hit = m.get_computed_blocks(h);
    assert(warm_hit == 32);

    // first-miss stop: a chain that diverges after block 1 hits only 1 block
    auto hmix = h; hmix[1] = 0xDEADBEEF;
    const auto partial_hit = m.get_computed_blocks(hmix);
    assert(partial_hit == 16);

    // silence unused-variable warnings when asserts are compiled out
    (void) h2; (void) h_other; (void) cold_hit; (void) ok_alloc; (void) warm_hit; (void) partial_hit;

    printf("test_prefix_cache: OK\n");
    return 0;
}
|
|
||||||
@@ -1,106 +0,0 @@
|
|||||||
# Paged-attention / parity benchmarks (GB10 / DGX Spark)
|
|
||||||
|
|
||||||
Goal of the series: vLLM parity. This records the measured gap so the parity claim is data-backed, not asserted.
|
|
||||||
|
|
||||||
**Setup:** GB10 (sm_121, 119 GiB unified). Model Qwen3-Coder-30B-A3B. llama.cpp = pinned base + this series
|
|
||||||
(MXFP4_MOE, `-fa 1 -b 2048 -ub 2048`, `llama-batched-bench`, PP=512 TG=128). vLLM = 0.23.0 FP8 (recorded
|
|
||||||
prior run, same box/model). S_PP / S_TG are aggregate prefill / decode tok/s across B streams.
|
|
||||||
|
|
||||||
## Fresh llama.cpp (this series, MXFP4) vs vLLM (FP8)
|
|
||||||
|
|
||||||
| B | llama S_PP | vLLM S_PP | PP gap | llama S_TG | vLLM S_TG | TG gap |
|
|
||||||
|---|-----------|-----------|--------|-----------|-----------|--------|
|
|
||||||
| 1 | 1565 | 9644 | 6.2× | **83** | 48 | **llama wins** |
|
|
||||||
| 8 | 3648 | 33373 | 9.1× | 126 | 312 | 2.5× |
|
|
||||||
| 32 | 2074 | 99398 | 48× | 319 | 1171 | 3.7× |
|
|
||||||
| 64 | 3643 | 151990 | 42× | 771 | 2064 | 2.7× |
|
|
||||||
|
|
||||||
## Verdict — two distinct gaps, only one is the engine's
|
|
||||||
|
|
||||||
1. **Prefill (S_PP): 6–48× behind, and it does NOT scale with B** (plateaus ~3.6k). This is the **FP4 MoE
|
|
||||||
GEMM kernel** (`mul_mat_q<MXFP4>` ~22 TFLOP/s), confirmed earlier. **Paged attention cannot close this** —
|
|
||||||
it's per-token compute. Needs the tcgen05/CUTLASS grouped-GEMM (Lever 3, multi-week, no upstream base).
|
|
||||||
2. **Decode at concurrency (S_TG): 2.5–3.7× behind for B≥8** (we *win* at B=1). This gap IS partly the
|
|
||||||
engine's domain — vLLM's block-paged KV + continuous batching pack more concurrent decode work per step.
|
|
||||||
**This is what patches 0003–0006 target.** The win here is realistic; the prefill win is not (kernel).
|
|
||||||
|
|
||||||
## CORRECTION — decode-phase profile (B=64, decode-dominated nsys)
|
|
||||||
|
|
||||||
The "decode gap is engine-addressable" read above was **wrong**. Profiling a decode-dominated B=64 run:
|
|
||||||
|
|
||||||
| kernel | % GPU time |
|
|
||||||
|---|---|
|
|
||||||
| `mul_mat_q<MXFP4>` (MoE GEMM) | **54.6** |
|
|
||||||
| `flash_attn_ext` (attention) | 19.8 |
|
|
||||||
| `mul_mat_q<Q8>` (dense) | 10.9 |
|
|
||||||
| KV writes / quant / norms / rest | ~15 |
|
|
||||||
|
|
||||||
**Decode at concurrency is ALSO dominated by the FP4 MoE GEMM (54.6%)** — the same Lever-3 kernel as prefill.
|
|
||||||
Attention (the only thing paging optimizes) is ~20%, and the gather-read reclaims only the *masked-cell*
|
|
||||||
fraction of that. So **the paged series (0003–0006) cannot close the vLLM gap in either phase** — both are
|
|
||||||
MoE-kernel-bound. vLLM's concurrency advantage is its MoE/attention *kernels*, not (mainly) its KV management.
|
|
||||||
|
|
||||||
### What the paged series IS still good for (just not throughput parity)
|
|
||||||
|
|
||||||
- **Capacity**: block-granular + on-demand allocation → fit more/longer concurrent sequences in fixed VRAM.
|
|
||||||
- **Prefix sharing**: cross-request block dedup → lower TTFT + memory on shared system prompts / RAG.
|
|
||||||
|
|
||||||
These are real wins on *memory-pressured* and *shared-prefix* workloads — but they are not tok/s parity, and
|
|
||||||
batched-bench (fresh, non-fragmented, no shared prefix) won't show them.
|
|
||||||
|
|
||||||
## DENSE model parity (Qwen3-32B) — does the kernel gap exist for dense too? YES.
|
|
||||||
|
|
||||||
The MoE work above is about the grouped MoE GEMM. Dense models use a different (non-grouped) matmul path,
|
|
||||||
so we benchmarked a dense 32B head-to-head.
|
|
||||||
|
|
||||||
**Headline comparison — vLLM NVFP4 W4A16 vs llama.cpp Q4_K_M.** This is the *correct apples-to-apples on
|
|
||||||
DGX Spark*: both are **4-bit weights / 16-bit activations** (same quant class). vLLM = `Qwen3-32B-NVFP4A16`
|
|
||||||
(FlashInfer Marlin W4A16 kernel); llama.cpp = `Qwen3-32B-Q4_K_M` (int8-MMQ compute). The only difference is
|
|
||||||
the compute kernel — which is exactly what we're measuring. (Full **W4A4** NVFP4 does not run on GB10 today;
|
|
||||||
root cause below — and it would *not* be a fair comparison even if it did, since Q4_K_M is also weight-only-4-bit.)
|
|
||||||
|
|
||||||
| B | llama Q4_K_M PP | vLLM W4A16 PP | PP gap | llama decode | vLLM decode | TG gap |
|
|
||||||
|---|---|---|---|---|---|---|
|
|
||||||
| 1 | 708 | 5367 | 7.6× | 10.2 | 11.7 | ~parity |
|
|
||||||
| 8 | 761 | 14941 | 20× | 58 | 92 | 1.6× |
|
|
||||||
| 32 | 763 | 21952 | 29× | 205 | 330 | 1.6× |
|
|
||||||
| 64 | 765 | 24444 | 32× | 253 | 569 | 2.2× |
|
|
||||||
|
|
||||||
**Findings:**
|
|
||||||
1. **Dense prefill has the SAME (larger) kernel gap.** llama dense prefill plateaus at ~765 t/s regardless of
|
|
||||||
B; vLLM scales to 24.4k (32×). Both read 4-bit weights — the gap is the compute kernel: vLLM's FP4 Marlin
|
|
||||||
tensor-core GEMM vs llama's int8-MMQ. (Note: on consumer Blackwell, W4A16 Marlin is also reported *faster*
|
|
||||||
than the experimental W4A4 path, so W4A16 isn't a handicapped stand-in — it's the fast path.)
|
|
||||||
2. **Decode is ~parity at B=1** (10.2 vs 11.7 — both weight-bandwidth-bound reading 4-bit weights), and the
|
|
||||||
gap grows with batch (compute starts to matter → the kernel gap reappears: 2.2× at B=64).
|
|
||||||
3. **Scope decision (the reason for this benchmark): the Lever-3 kernel track must also deliver a NON-grouped
|
|
||||||
block-scaled FP4 GEMM for dense**, not only the MoE grouped GEMM. The dense GEMM is the simpler of the two
|
|
||||||
(a plain CUTLASS dense GEMM), so it's a good first kernel to land — and it benefits every dense model.
|
|
||||||
- **No cheap lever:** `GGML_CUDA_FORCE_CUBLAS` is a **no-op for dense too** (Q4_K pp512: 720.8 vs 721.8) —
|
|
||||||
dequant→cuBLAS-BF16 doesn't engage / isn't faster than int8-MMQ on GB10. With ubatch (saturates) and
|
|
||||||
nwarps (static_assert) already ruled out for MoE, **every config/flag lever is now exhausted** for both
|
|
||||||
model classes. Parity is strictly the FP4 tensor-core kernel.
|
|
||||||
4. **Why full W4A4 NVFP4 hangs on GB10 (root cause, researched).** This is a *known consumer-Blackwell
|
|
||||||
limitation, not a misconfiguration*. **FlashInfer ships no FP4 cubins for sm_120/sm_121** — its precompiled
|
|
||||||
kernels are all datacenter `Sm100a/Sm103a` (B200/B300). So on GB10 the dense `mm_fp4` W4A4 GEMM has no
|
|
||||||
working kernel: the optimized path is gated off for sm_121 (heuristic checks `minor==0`; 12.1 fails), the
|
|
||||||
CUTLASS dense FP4 fallback is documented to silently return **all-zeros**, and TRT-LLM errors at capability
|
|
||||||
120. Our exact symptom — loads weights, then stalls at the first profiling forward pass with
|
|
||||||
`enable_flashinfer_autotune=True` at 0–3% GPU — is the **FlashInfer FP4 autotuner/JIT spinning on an arch
|
|
||||||
with no FP4 cubins** (matches vllm #30163/#26381, flashinfer #2577/#3294). The "NVFP4 on DGX Spark" story
|
|
||||||
everyone cites is about *quantization + memory footprint + W4A16/MoE*, **not dense W4A4 inference**, which
|
|
||||||
isn't validated on sm_121 yet (where people patched it working, it was slower than W4A16 anyway).
|
|
||||||
**Therefore W4A16 vs Q4_K_M above is the right, reproducible apples-to-apples** for DGX Spark today.
|
|
||||||
Optional W4A4 retry (verify output isn't zeros first): `VLLM_SKIP_FLASHINFER_AUTOTUNE=1` +
|
|
||||||
`VLLM_NVFP4_GEMM_BACKEND=cutlass` + `--enforce-eager`, or NVIDIA's `vllm/vllm-openai:cu130-nightly` container.
|
|
||||||
|
|
||||||
## So, honestly, where parity stands
|
|
||||||
|
|
||||||
- **Decode single-stream: already at/above parity** (B=1: 83 vs 48).
|
|
||||||
- **Decode concurrency: a real, engine-addressable gap** the paged series can narrow (0004 on-demand pool +
|
|
||||||
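0005 continuous batching). Target: close the 2.5–3.7× at B≥8. *(Written before the decode profile: per the CORRECTION section above, decode at B=64 is dominated by the FP4 MoE GEMM, so 0004/0005 are not expected to close this gap on their own.)*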
|
|
||||||
- **Prefill: kernel-bound, not engine-bound.** No amount of paging reaches vLLM here; that's a separate track.
|
|
||||||
|
|
||||||
**Series status when measured:** 0001 (vendor) + 0002 (placement, token-identical) done; 0003 (gather-read)
|
|
||||||
turn-key-planned, not yet implemented. These numbers are the *baseline* the engine patches must improve on at
|
|
||||||
B≥8 decode — re-run this table after 0004/0005 to show the concurrency gap closing.
|
|
||||||
@@ -1,82 +0,0 @@
|
|||||||
# llama.cpp patch series — paged attention (vLLM-parity engine)
|
|
||||||
|
|
||||||
A **stacking** series: each patch is a small, self-contained, independently-buildable step toward an
|
|
||||||
in-model paged-attention engine. They apply in numeric order on top of the pinned `LLAMA_VERSION`
|
|
||||||
(`backend/cpp/llama-cpp/Makefile`). The build applies them automatically after checkout (see the
|
|
||||||
`llama.cpp:` target). Keeping the work as ordered patches — rather than one big diff — is what lets us
|
|
||||||
**rebase cleanly across llama.cpp bumps and avoid drift**: when a patch stops applying, only that small
|
|
||||||
patch needs fixing, and the failure points at exactly which step the upstream change touched.
|
|
||||||
|
|
||||||
## Base
|
|
||||||
|
|
||||||
- `LLAMA_VERSION` pin in `../Makefile`. **All patches are generated against that exact commit.** Bumping
|
|
||||||
the pin = re-run the regen workflow below and fix only the patches that no longer apply.
|
|
||||||
|
|
||||||
## The series (phases → patches)
|
|
||||||
|
|
||||||
| # | Patch | What | Verifies |
|
|
||||||
|---|-------|------|----------|
|
|
||||||
| 0001 | `0001-vendor-paged-kv-manager.patch` | Add `src/paged-kv-manager.{h,cpp}` (vLLM-parity block manager, CPU foundation) + CMake; no behavior change | builds; unit-tested separately under `../paged/` |
|
|
||||||
| 0002 | `0002-paged-kv-storage.patch` | Shared block-pool KV tensor + `set_rows`-by-slot writes, behind `LLAMA_KV_PAGED` | builds; write/gather round-trip |
|
|
||||||
| 0003 | `0003-paged-gather-read.patch` | `build_attn_paged` gather-read in `llama-graph.cpp` | **Gate 0**: token-identical greedy gen, single + multi-seq |
|
|
||||||
| 0004 | `0004-paged-ondemand-alloc.patch` | On-demand block allocation via PagedKVManager | max concurrent seqs before OOM |
|
|
||||||
| 0005 | `0005-paged-continuous-batching.patch` | Block-granular admit/evict in the server slot path | tok/s vs concurrency, mixed-length |
|
|
||||||
| 0006 | `0006-paged-prefix-caching.patch` | Block-hash cross-request prefix dedup | TTFT + memory on shared prefixes |
|
|
||||||
|
|
||||||
Each row is a separate `git commit` on the dev branch (below), exported 1:1 as a patch. Default off
|
|
||||||
(`LLAMA_KV_PAGED`) until Gate 0 (0003) is green, so a partially applied series never changes stock behavior.
|
|
||||||
|
|
||||||
## Regen workflow (the anti-drift recipe)
|
|
||||||
|
|
||||||
```sh
|
|
||||||
# 1. check out the exact pin into a dev tree
|
|
||||||
git -C /tmp clone https://github.com/ggml-org/llama.cpp llama-dev && cd /tmp/llama-dev
|
|
||||||
git checkout <LLAMA_VERSION from ../Makefile>
|
|
||||||
git checkout -b paged
|
|
||||||
|
|
||||||
# 2. apply the current series (each becomes a commit), or develop the next patch
|
|
||||||
git am /path/to/backend/cpp/llama-cpp/patches/00*.patch # or `git apply` + commit per patch
|
|
||||||
|
|
||||||
# 3. iterate a phase as ONE commit, then export the whole series 1:1
|
|
||||||
git format-patch <LLAMA_VERSION>..paged -o /path/to/backend/cpp/llama-cpp/patches/ --zero-commit -N
|
|
||||||
|
|
||||||
# 4. on a pin bump: rebase `paged` onto the new pin; only conflicting patches need edits; re-export.
|
|
||||||
```
|
|
||||||
|
|
||||||
## Build integration
|
|
||||||
|
|
||||||
`../Makefile`'s `llama.cpp:` target runs, after `git checkout -b build $(LLAMA_VERSION)`:
|
|
||||||
```
|
|
||||||
for p in $(CURRENT_MAKEFILE_DIR)/patches/0*.patch; do git apply --verbose "$p"; done
|
|
||||||
```
|
|
||||||
All variants (avx/avx2/avx512/cuda/…) copy the patched `llama.cpp/` tree, so the series ships everywhere.
|
|
||||||
|
|
||||||
## Status
|
|
||||||
|
|
||||||
- **0001 vendor manager — DONE.** Applies clean to the pin; builds into `libllama`.
|
|
||||||
- **0002 block placement — DONE + VERIFIED.** Built `llama-simple` at the pin; greedy generation is
|
|
||||||
**token-identical** stock vs `LLAMA_KV_PAGED=1` (Qwen3-0.6B), paged branch confirmed firing.
|
|
||||||
- **0003 gather-read — DONE + VERIFIED (Gate 0 green).** Implemented in the **additive** form
|
|
||||||
(`ADDITIVE_DESIGN.md`): all logic in new `src/paged-attn.{h,cpp}` (a `llm_graph_input_i` gather-index
|
|
||||||
subclass + the K/V/mask gather), hooked by **one** line in `build_attn` + **two** thin accessors on
|
|
||||||
`llama_kv_cache_context` + 1 CMake line (216 insertions; no edit to `llm_graph_input_attn_kv` or
|
|
||||||
`llama-graph.h`). Greedy generation is **token-identical** stock vs `LLAMA_KV_PAGED=1` (Qwen3-0.6B,
|
|
||||||
**9/9** across 3 prompts × {32,96,128} tokens), with `n_gather=71 < n_kv=256` confirming real
|
|
||||||
compaction. Patch: `0003-paged-gather-read-env-LLAMA_KV_PAGED.patch`.
|
|
||||||
- **Key correctness finding:** `get_gather_idxs` must emit cells **sorted by token position**. The CPU
|
|
||||||
flash-attn online softmax reduces cells in physical-array order and is FP-order-sensitive, so 0002's
|
|
||||||
scattered placement *alone* (full-window read, no gather) diverges from stock once a sequence crosses
|
|
||||||
the first 16-cell block. The position-sorted gather reproduces stock's exact reduction order ->
|
|
||||||
bit-identical, not merely mathematically equivalent. So 0002 is the placement substrate; **0003 is what
|
|
||||||
makes paged placement token-identical under flash-attn.**
|
|
||||||
- 0004–0006 follow.
|
|
||||||
|
|
||||||
### Honest parity note (important)
|
|
||||||
|
|
||||||
This series delivers the paged-attention **engine** (capacity + scheduling + prefix sharing). It does **not**
|
|
||||||
by itself reach vLLM throughput parity, because the measured prefill bottleneck is the **FP4 MoE GEMM kernel**
|
|
||||||
(Lever 3: `mul_mat_q<MXFP4>` ~22 TFLOP/s, ~27× behind vLLM) — a *per-token compute* gap that paging does not
|
|
||||||
touch. Paged attention closes the **concurrency/memory** gap (more sequences, prefix reuse); the prefill/throughput
|
|
||||||
gap additionally needs the tcgen05/CUTLASS grouped-GEMM (deferred, upstream-grade, no shortcut — see
|
|
||||||
`../paged/UPSTREAM_GGML_ISSUE.md` and `DGX_BLACKWELL_PLAN.md`). So full vLLM parity = this series **AND** the
|
|
||||||
kernel; neither alone suffices.
|
|
||||||
@@ -1,91 +0,0 @@
|
|||||||
diff --git a/ggml/src/ggml-cuda/fp4-grouped-moe.cu b/ggml/src/ggml-cuda/fp4-grouped-moe.cu
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..5f5a782
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/ggml/src/ggml-cuda/fp4-grouped-moe.cu
|
|
||||||
@@ -0,0 +1,46 @@
|
|
||||||
+#include "fp4-grouped-moe.cuh"
|
|
||||||
+
|
|
||||||
+#include <cstdlib>
|
|
||||||
+#include <cstdio>
|
|
||||||
+
|
|
||||||
+// SCAFFOLD for the FP4 grouped-GEMM MoE kernel (Lever 3).
|
|
||||||
+//
|
|
||||||
+// Why: on GB10 (sm_121) the MoE matmul runs mul_mat_q<MXFP4> - a warp-level mma.sync grouped MMQ -
|
|
||||||
+// at ~22 effective TFLOP/s, ~27x behind vLLM prefill, and it also dominates decode at concurrency
|
|
||||||
+// (54.6% of GPU time at B=64). It is the single bottleneck to vLLM parity in BOTH phases; paged
|
|
||||||
+// attention cannot touch it (proven by profiling). The fix is a CUTLASS-3.x collective-mainloop
|
|
||||||
+// grouped GEMM over all experts, block-scaled e2m1 operands via tcgen05 tensor-memory MMA.
|
|
||||||
+//
|
|
||||||
+// This file is the integration seam. It is currently a no-op that always falls back to MMQ, so the
|
|
||||||
+// default build is byte-identical. The kernel is filled in over the phases in the design doc.
|
|
||||||
+
|
|
||||||
+static bool fp4_grouped_enabled() {
|
|
||||||
+ static const bool en = (std::getenv("GGML_CUDA_FP4_GROUPED") != nullptr);
|
|
||||||
+ return en;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+bool ggml_cuda_fp4_grouped_moe(
|
|
||||||
+ ggml_backend_cuda_context & ctx,
|
|
||||||
+ const ggml_tensor * src0,
|
|
||||||
+ const ggml_tensor * src1,
|
|
||||||
+ const ggml_tensor * ids,
|
|
||||||
+ ggml_tensor * dst) {
|
|
||||||
+ GGML_UNUSED(ctx); GGML_UNUSED(src1); GGML_UNUSED(ids); GGML_UNUSED(dst);
|
|
||||||
+
|
|
||||||
+ if (!fp4_grouped_enabled()) {
|
|
||||||
+ return false; // default: existing MMQ path
|
|
||||||
+ }
|
|
||||||
+ if (src0->type != GGML_TYPE_MXFP4 && src0->type != GGML_TYPE_NVFP4) {
|
|
||||||
+ return false;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // TODO(kernel - see kernel design doc): CUTLASS 3.x GemmGrouped, sm_120a, block-scaled e2m1,
|
|
||||||
+ // tcgen05 MMA; per-expert problem offsets from `ids`; fused activation quant; numerical parity
|
|
||||||
+ // vs mul_mat_q<MXFP4> before enabling by default.
|
|
||||||
+ static bool warned = false;
|
|
||||||
+ if (!warned) {
|
|
||||||
+ warned = true;
|
|
||||||
+ fprintf(stderr, "[fp4-grouped] GGML_CUDA_FP4_GROUPED set, kernel not yet implemented - using MMQ\n");
|
|
||||||
+ }
|
|
||||||
+ return false; // scaffold: fall back until the kernel lands
|
|
||||||
+}
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fp4-grouped-moe.cuh b/ggml/src/ggml-cuda/fp4-grouped-moe.cuh
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..29e1b5a
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/ggml/src/ggml-cuda/fp4-grouped-moe.cuh
|
|
||||||
@@ -0,0 +1,13 @@
|
|
||||||
+#pragma once
|
|
||||||
+
|
|
||||||
+#include "common.cuh"
|
|
||||||
+
|
|
||||||
+// Entry point for the tcgen05/CUTLASS block-scaled FP4 (MXFP4/NVFP4) grouped-GEMM MoE kernel for
|
|
||||||
+// Blackwell consumer GPUs (sm_120/121). Returns true if it handled the op; false to fall back to
|
|
||||||
+// the existing warp-mma MMQ path. Gated behind GGML_CUDA_FP4_GROUPED until correct + faster.
|
|
||||||
+bool ggml_cuda_fp4_grouped_moe(
|
|
||||||
+ ggml_backend_cuda_context & ctx,
|
|
||||||
+ const ggml_tensor * src0, // expert weights, MXFP4/NVFP4 [n_embd, n_ff, n_expert]
|
|
||||||
+ const ggml_tensor * src1, // activations, F32 [n_embd, n_tokens, ...]
|
|
||||||
+ const ggml_tensor * ids, // expert routing, I32
|
|
||||||
+ ggml_tensor * dst); // F32 output
|
|
||||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
||||||
index 8ea462a..104d131 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
||||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
|
||||||
@@ -30,6 +30,7 @@
|
|
||||||
#include "ggml-cuda/im2col.cuh"
|
|
||||||
#include "ggml-cuda/mmf.cuh"
|
|
||||||
#include "ggml-cuda/mmq.cuh"
|
|
||||||
+#include "ggml-cuda/fp4-grouped-moe.cuh"
|
|
||||||
#include "ggml-cuda/mmvf.cuh"
|
|
||||||
#include "ggml-cuda/mmvq.cuh"
|
|
||||||
#include "ggml-cuda/norm.cuh"
|
|
||||||
@@ -2701,6 +2702,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ggml_cuda_should_use_mmq(src0->type, cc, ne12, /*n_experts=*/ne02)) {
|
|
||||||
+ if (ggml_cuda_fp4_grouped_moe(ctx, src0, src1, ids, dst)) { return; }
|
|
||||||
ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
@@ -1,447 +0,0 @@
|
|||||||
From bef64835d444a44ed8391bc395cdab38164229d5 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Fri, 19 Jun 2026 22:54:49 +0000
|
|
||||||
Subject: [PATCH] vendor paged kv manager
|
|
||||||
|
|
||||||
vLLM-parity host-side KV block manager (FreeBlockQueue, BlockPool,
|
|
||||||
PagedKVManager, chained-hash prefix cache). Pure C++17, no behavior change -
|
|
||||||
nothing uses it yet; wired in by later patches in the series.
|
|
||||||
---
|
|
||||||
src/CMakeLists.txt | 1 +
|
|
||||||
src/paged-kv-manager.cpp | 296 +++++++++++++++++++++++++++++++++++++++
|
|
||||||
src/paged-kv-manager.h | 108 ++++++++++++++
|
|
||||||
3 files changed, 405 insertions(+)
|
|
||||||
create mode 100644 src/paged-kv-manager.cpp
|
|
||||||
create mode 100644 src/paged-kv-manager.h
|
|
||||||
|
|
||||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
|
||||||
index d15ccfd99..a030940b8 100644
|
|
||||||
--- a/src/CMakeLists.txt
|
|
||||||
+++ b/src/CMakeLists.txt
|
|
||||||
@@ -24,6 +24,7 @@ add_library(llama
|
|
||||||
llama-io.cpp
|
|
||||||
llama-kv-cache.cpp
|
|
||||||
llama-kv-cache-iswa.cpp
|
|
||||||
+ paged-kv-manager.cpp
|
|
||||||
llama-kv-cache-dsa.cpp
|
|
||||||
llama-memory.cpp
|
|
||||||
llama-memory-hybrid.cpp
|
|
||||||
diff --git a/src/paged-kv-manager.cpp b/src/paged-kv-manager.cpp
|
|
||||||
new file mode 100644
|
|
||||||
index 000000000..ca0dcd83a
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/src/paged-kv-manager.cpp
|
|
||||||
@@ -0,0 +1,296 @@
|
|
||||||
+#include "paged-kv-manager.h"
|
|
||||||
+#include <cassert>
|
|
||||||
+#include <stdexcept>
|
|
||||||
+
|
|
||||||
+namespace paged {
|
|
||||||
+
|
|
||||||
+// ---------------------------------------------------------------------------
|
|
||||||
+// FreeBlockQueue (port of kv_cache_utils.py FreeKVCacheBlockQueue)
|
|
||||||
+// ---------------------------------------------------------------------------
|
|
||||||
+
|
|
||||||
+FreeBlockQueue::FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks) {
|
|
||||||
+ num_free_blocks = blocks.size();
|
|
||||||
+ for (size_t i = 0; i < blocks.size(); ++i) {
|
|
||||||
+ if (i > 0) blocks[i]->prev_free = blocks[i - 1];
|
|
||||||
+ if (i + 1 < blocks.size()) blocks[i]->next_free = blocks[i + 1];
|
|
||||||
+ }
|
|
||||||
+ if (!blocks.empty()) {
|
|
||||||
+ fake_head.next_free = blocks.front();
|
|
||||||
+ blocks.front()->prev_free = &fake_head;
|
|
||||||
+ fake_tail.prev_free = blocks.back();
|
|
||||||
+ blocks.back()->next_free = &fake_tail;
|
|
||||||
+ } else {
|
|
||||||
+ fake_head.next_free = &fake_tail;
|
|
||||||
+ fake_tail.prev_free = &fake_head;
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+KVCacheBlock* FreeBlockQueue::popleft() {
|
|
||||||
+ KVCacheBlock* first = fake_head.next_free;
|
|
||||||
+ if (first == &fake_tail || first == nullptr) {
|
|
||||||
+ assert(num_free_blocks == 0);
|
|
||||||
+ throw std::runtime_error("No free blocks available");
|
|
||||||
+ }
|
|
||||||
+ fake_head.next_free = first->next_free;
|
|
||||||
+ first->next_free->prev_free = &fake_head;
|
|
||||||
+ first->prev_free = first->next_free = nullptr;
|
|
||||||
+ num_free_blocks--;
|
|
||||||
+ return first;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+std::vector<KVCacheBlock*> FreeBlockQueue::popleft_n(size_t n) {
|
|
||||||
+ std::vector<KVCacheBlock*> ret;
|
|
||||||
+ if (n == 0) return ret;
|
|
||||||
+ assert(num_free_blocks >= n);
|
|
||||||
+ num_free_blocks -= n;
|
|
||||||
+ KVCacheBlock* curr = fake_head.next_free;
|
|
||||||
+ ret.reserve(n);
|
|
||||||
+ for (size_t i = 0; i < n; ++i) {
|
|
||||||
+ assert(curr != nullptr);
|
|
||||||
+ ret.push_back(curr);
|
|
||||||
+ KVCacheBlock* last = curr;
|
|
||||||
+ curr = curr->next_free;
|
|
||||||
+ last->prev_free = last->next_free = nullptr;
|
|
||||||
+ }
|
|
||||||
+ if (curr != nullptr) {
|
|
||||||
+ fake_head.next_free = curr;
|
|
||||||
+ curr->prev_free = &fake_head;
|
|
||||||
+ }
|
|
||||||
+ return ret;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void FreeBlockQueue::remove(KVCacheBlock* block) {
|
|
||||||
+ if (!block->prev_free || !block->next_free)
|
|
||||||
+ throw std::runtime_error("remove() called on an invalid block");
|
|
||||||
+ block->prev_free->next_free = block->next_free;
|
|
||||||
+ block->next_free->prev_free = block->prev_free;
|
|
||||||
+ block->prev_free = block->next_free = nullptr;
|
|
||||||
+ num_free_blocks--;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void FreeBlockQueue::append(KVCacheBlock* block) {
|
|
||||||
+ KVCacheBlock* last = fake_tail.prev_free;
|
|
||||||
+ last->next_free = block;
|
|
||||||
+ block->prev_free = last;
|
|
||||||
+ block->next_free = &fake_tail;
|
|
||||||
+ fake_tail.prev_free = block;
|
|
||||||
+ num_free_blocks++;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void FreeBlockQueue::append_n(const std::vector<KVCacheBlock*>& blocks) {
|
|
||||||
+ if (blocks.empty()) return;
|
|
||||||
+ KVCacheBlock* last = fake_tail.prev_free;
|
|
||||||
+ for (KVCacheBlock* b : blocks) {
|
|
||||||
+ b->prev_free = last;
|
|
||||||
+ last->next_free = b;
|
|
||||||
+ last = b;
|
|
||||||
+ }
|
|
||||||
+ last->next_free = &fake_tail;
|
|
||||||
+ fake_tail.prev_free = last;
|
|
||||||
+ num_free_blocks += blocks.size();
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void FreeBlockQueue::prepend_n(const std::vector<KVCacheBlock*>& blocks) {
|
|
||||||
+ if (blocks.empty()) return;
|
|
||||||
+ KVCacheBlock* first = fake_head.next_free;
|
|
||||||
+ KVCacheBlock* prev = &fake_head;
|
|
||||||
+ for (KVCacheBlock* b : blocks) {
|
|
||||||
+ b->prev_free = prev;
|
|
||||||
+ prev->next_free = b;
|
|
||||||
+ prev = b;
|
|
||||||
+ }
|
|
||||||
+ prev->next_free = first;
|
|
||||||
+ first->prev_free = prev;
|
|
||||||
+ num_free_blocks += blocks.size();
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+std::vector<KVCacheBlock*> FreeBlockQueue::get_all_free_blocks() const {
|
|
||||||
+ std::vector<KVCacheBlock*> ret;
|
|
||||||
+ const KVCacheBlock* curr = fake_head.next_free;
|
|
||||||
+ while (curr && curr->next_free != nullptr) {
|
|
||||||
+ ret.push_back(const_cast<KVCacheBlock*>(curr));
|
|
||||||
+ curr = curr->next_free;
|
|
||||||
+ }
|
|
||||||
+ return ret;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+// ---------------------------------------------------------------------------
|
|
||||||
+// BlockPool (port of block_pool.py)
|
|
||||||
+// ---------------------------------------------------------------------------
|
|
||||||
+
|
|
||||||
+static std::vector<KVCacheBlock*> make_ptrs(std::vector<KVCacheBlock>& v) {
|
|
||||||
+ std::vector<KVCacheBlock*> p;
|
|
||||||
+ p.reserve(v.size());
|
|
||||||
+ for (auto& b : v) p.push_back(&b);
|
|
||||||
+ return p;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static std::vector<KVCacheBlock> make_block_vec(int32_t num_blocks) {
|
|
||||||
+ std::vector<KVCacheBlock> v;
|
|
||||||
+ v.reserve(num_blocks);
|
|
||||||
+ for (int32_t i = 0; i < num_blocks; ++i) v.emplace_back(i);
|
|
||||||
+ return v;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+BlockPool::BlockPool(int32_t num_blocks, bool enable_caching)
|
|
||||||
+ : enable_caching_(enable_caching),
|
|
||||||
+ blocks_(make_block_vec(num_blocks)),
|
|
||||||
+ ptrs_(make_ptrs(blocks_)),
|
|
||||||
+ free_queue_(ptrs_) {
|
|
||||||
+ // vLLM reserves block_id 0 as the null block (never cached).
|
|
||||||
+ null_block = free_queue_.popleft();
|
|
||||||
+ null_block->is_null = true;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+bool BlockPool::maybe_evict_cached_block(KVCacheBlock* block) {
|
|
||||||
+ if (!block->has_hash) return false;
|
|
||||||
+ auto it = cached_block_hash_to_block_.find(block->block_hash);
|
|
||||||
+ if (it == cached_block_hash_to_block_.end() || it->second != block) return false;
|
|
||||||
+ cached_block_hash_to_block_.erase(it);
|
|
||||||
+ block->reset_hash();
|
|
||||||
+ return true;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+std::vector<KVCacheBlock*> BlockPool::get_new_blocks(size_t n) {
|
|
||||||
+ if (n > get_num_free_blocks())
|
|
||||||
+ throw std::runtime_error("Cannot get free blocks from pool");
|
|
||||||
+ auto ret = free_queue_.popleft_n(n);
|
|
||||||
+ for (KVCacheBlock* b : ret) {
|
|
||||||
+ if (enable_caching_) maybe_evict_cached_block(b);
|
|
||||||
+ assert(b->ref_cnt == 0);
|
|
||||||
+ b->ref_cnt += 1;
|
|
||||||
+ }
|
|
||||||
+ return ret;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+KVCacheBlock* BlockPool::get_cached_block(uint64_t block_hash) {
|
|
||||||
+ auto it = cached_block_hash_to_block_.find(block_hash);
|
|
||||||
+ return it == cached_block_hash_to_block_.end() ? nullptr : it->second;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void BlockPool::touch(const std::vector<KVCacheBlock*>& blocks) {
|
|
||||||
+ for (KVCacheBlock* b : blocks) {
|
|
||||||
+ // ref_cnt==0 means the block is a free-list eviction candidate; pull it out.
|
|
||||||
+ if (b->ref_cnt == 0 && !b->is_null) free_queue_.remove(b);
|
|
||||||
+ b->ref_cnt += 1;
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void BlockPool::free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks) {
|
|
||||||
+ std::vector<KVCacheBlock*> without_hash, with_hash;
|
|
||||||
+ for (KVCacheBlock* b : ordered_blocks) {
|
|
||||||
+ if (b->is_null) continue;
|
|
||||||
+ b->ref_cnt -= 1;
|
|
||||||
+ if (b->ref_cnt == 0) (b->has_hash ? with_hash : without_hash).push_back(b);
|
|
||||||
+ }
|
|
||||||
+ free_queue_.prepend_n(without_hash); // un-hashed: evicted first (front)
|
|
||||||
+ free_queue_.append_n(with_hash); // hashed: kept warm (tail)
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void BlockPool::cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
|
|
||||||
+ size_t num_cached_blocks, size_t num_full_blocks,
|
|
||||||
+ const std::vector<uint64_t>& block_hashes) {
|
|
||||||
+ for (size_t i = num_cached_blocks; i < num_full_blocks; ++i) {
|
|
||||||
+ KVCacheBlock* blk = req_blocks[i];
|
|
||||||
+ if (blk->has_hash) continue;
|
|
||||||
+ blk->has_hash = true;
|
|
||||||
+ blk->block_hash = block_hashes[i];
|
|
||||||
+ cached_block_hash_to_block_[blk->block_hash] = blk;
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+// ---------------------------------------------------------------------------
|
|
||||||
+// PagedKVManager (port of SingleTypeKVCacheManager / FullAttentionManager)
|
|
||||||
+// ---------------------------------------------------------------------------
|
|
||||||
+
|
|
||||||
+static inline size_t cdiv(size_t a, size_t b) { return (a + b - 1) / b; }
|
|
||||||
+
|
|
||||||
+PagedKVManager::PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching)
|
|
||||||
+ : block_size_(block_size), pool_(num_blocks, enable_caching) {}
|
|
||||||
+
|
|
||||||
+bool PagedKVManager::allocate(int seq_id, size_t total_tokens) {
|
|
||||||
+ auto& req = req_to_blocks_[seq_id];
|
|
||||||
+ size_t need = cdiv(total_tokens, block_size_);
|
|
||||||
+ if (need <= req.size()) return true;
|
|
||||||
+ size_t add = need - req.size();
|
|
||||||
+ if (add > pool_.get_num_free_blocks()) return false; // OOM
|
|
||||||
+ auto nb = pool_.get_new_blocks(add);
|
|
||||||
+ req.insert(req.end(), nb.begin(), nb.end());
|
|
||||||
+ return true;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+std::vector<int32_t> PagedKVManager::block_table(int seq_id) const {
|
|
||||||
+ std::vector<int32_t> bt;
|
|
||||||
+ auto it = req_to_blocks_.find(seq_id);
|
|
||||||
+ if (it == req_to_blocks_.end()) return bt;
|
|
||||||
+ bt.reserve(it->second.size());
|
|
||||||
+ for (KVCacheBlock* b : it->second) bt.push_back(b->block_id);
|
|
||||||
+ return bt;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+int64_t PagedKVManager::slot(int seq_id, int pos) const {
|
|
||||||
+ const auto& req = req_to_blocks_.at(seq_id);
|
|
||||||
+ int32_t phys = req[pos / block_size_]->block_id;
|
|
||||||
+ return (int64_t)phys * block_size_ + (pos % block_size_);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+std::vector<int64_t> PagedKVManager::slot_mapping(int seq_id, const std::vector<int>& positions) const {
|
|
||||||
+ std::vector<int64_t> sm;
|
|
||||||
+ sm.reserve(positions.size());
|
|
||||||
+ for (int p : positions) sm.push_back(slot(seq_id, p));
|
|
||||||
+ return sm;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void PagedKVManager::free(int seq_id) {
|
|
||||||
+ auto it = req_to_blocks_.find(seq_id);
|
|
||||||
+ if (it == req_to_blocks_.end()) return;
|
|
||||||
+ // Free in reverse so the tail of the block chain is evicted first (vLLM order).
|
|
||||||
+ std::vector<KVCacheBlock*> ordered(it->second.rbegin(), it->second.rend());
|
|
||||||
+ pool_.free_blocks(ordered);
|
|
||||||
+ req_to_blocks_.erase(it);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+// FNV-1a-style chained block hash (one XOR-multiply step per token id, not per byte). Deterministic and prefix-sensitive; folds the parent
|
|
||||||
+// hash into the seed so each block hash transitively encodes its whole prefix
|
|
||||||
+// (behavioral parity with vLLM hash_block_tokens chaining; vLLM uses sha256 bytes).
|
|
||||||
+uint64_t PagedKVManager::hash_block(uint64_t parent_hash, const std::vector<int>& token_ids) {
|
|
||||||
+ uint64_t h = 1469598103934665603ull ^ parent_hash;
|
|
||||||
+ for (int t : token_ids) {
|
|
||||||
+ h ^= (uint64_t)(uint32_t)t;
|
|
||||||
+ h *= 1099511628211ull;
|
|
||||||
+ }
|
|
||||||
+ if (h == 0) h = 0x9e3779b97f4a7c15ull; // never 0 (0 reads as "no hash")
|
|
||||||
+ return h;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+std::vector<uint64_t> PagedKVManager::compute_block_hashes(const std::vector<int>& token_ids) const {
|
|
||||||
+ std::vector<uint64_t> hashes;
|
|
||||||
+ uint64_t parent = 0; // NONE_HASH analogue
|
|
||||||
+ size_t n_full = token_ids.size() / block_size_;
|
|
||||||
+ for (size_t i = 0; i < n_full; ++i) {
|
|
||||||
+ std::vector<int> blk(token_ids.begin() + i * block_size_,
|
|
||||||
+ token_ids.begin() + (i + 1) * block_size_);
|
|
||||||
+ parent = hash_block(parent, blk);
|
|
||||||
+ hashes.push_back(parent);
|
|
||||||
+ }
|
|
||||||
+ return hashes;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+size_t PagedKVManager::get_computed_blocks(const std::vector<uint64_t>& block_hashes) {
|
|
||||||
+ std::vector<KVCacheBlock*> hits;
|
|
||||||
+ for (uint64_t bh : block_hashes) { // stop at first miss (prefix property)
|
|
||||||
+ KVCacheBlock* cb = pool_.get_cached_block(bh);
|
|
||||||
+ if (!cb) break;
|
|
||||||
+ hits.push_back(cb);
|
|
||||||
+ }
|
|
||||||
+ pool_.touch(hits); // ++ref_cnt, pull from free list
|
|
||||||
+ return hits.size() * (size_t)block_size_;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void PagedKVManager::cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens) {
|
|
||||||
+ auto& req = req_to_blocks_[seq_id];
|
|
||||||
+ size_t n_full = num_tokens / block_size_;
|
|
||||||
+ pool_.cache_full_blocks(req, /*num_cached=*/0, n_full, block_hashes);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+} // namespace paged
|
|
||||||
diff --git a/src/paged-kv-manager.h b/src/paged-kv-manager.h
|
|
||||||
new file mode 100644
|
|
||||||
index 000000000..740280a7f
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/src/paged-kv-manager.h
|
|
||||||
@@ -0,0 +1,108 @@
|
|
||||||
+#pragma once
|
|
||||||
+// Paged KV cache block manager for llama.cpp (CPU-first prototype).
|
|
||||||
+//
|
|
||||||
+// Host-side block management is a faithful port of vLLM V1:
|
|
||||||
+// vllm/v1/core/kv_cache_utils.py (KVCacheBlock, FreeKVCacheBlockQueue, hash_block_tokens)
|
|
||||||
+// vllm/v1/core/block_pool.py (BlockPool: get_new_blocks/touch/free/evict/cache_full_blocks)
|
|
||||||
+// vllm/v1/core/single_type_kv_cache_manager.py (allocate_new_blocks, find_longest_cache_hit)
|
|
||||||
+//
|
|
||||||
+// Parity is on behavior/algorithm (block chaining, first-miss stop, ref-counting,
|
|
||||||
+// LRU eviction order), not on exact hash bytes. This unit has zero ggml/llama.cpp
|
|
||||||
+// dependency so it can be unit-tested in isolation.
|
|
||||||
+
|
|
||||||
+#include <cstdint>
|
|
||||||
+#include <vector>
|
|
||||||
+#include <unordered_map>
|
|
||||||
+#include <map>
|
|
||||||
+
|
|
||||||
+namespace paged {
|
|
||||||
+
|
|
||||||
+// vLLM KVCacheBlock (kv_cache_utils.py).
|
|
||||||
+struct KVCacheBlock {
|
|
||||||
+ int32_t block_id = 0;
|
|
||||||
+ int ref_cnt = 0;
|
|
||||||
+ bool has_hash = false; // vLLM: _block_hash is set only when full+cached
|
|
||||||
+ uint64_t block_hash = 0;
|
|
||||||
+ bool is_null = false;
|
|
||||||
+ KVCacheBlock* prev_free = nullptr;
|
|
||||||
+ KVCacheBlock* next_free = nullptr;
|
|
||||||
+
|
|
||||||
+ explicit KVCacheBlock(int32_t id = 0) : block_id(id) {}
|
|
||||||
+ void reset_hash() { has_hash = false; block_hash = 0; }
|
|
||||||
+};
|
|
||||||
+
|
|
||||||
+// Intrusive doubly-linked free list with fake head/tail (vLLM FreeKVCacheBlockQueue).
|
|
||||||
+// O(1) middle removal is required so touch() can pull a warm cached block out of the
|
|
||||||
+// free list when a later request hits its prefix.
|
|
||||||
+class FreeBlockQueue {
|
|
||||||
+public:
|
|
||||||
+ size_t num_free_blocks = 0;
|
|
||||||
+
|
|
||||||
+ explicit FreeBlockQueue(const std::vector<KVCacheBlock*>& blocks);
|
|
||||||
+ KVCacheBlock* popleft();
|
|
||||||
+ std::vector<KVCacheBlock*> popleft_n(size_t n);
|
|
||||||
+ void remove(KVCacheBlock* block);
|
|
||||||
+ void append(KVCacheBlock* block);
|
|
||||||
+ void append_n(const std::vector<KVCacheBlock*>& blocks);
|
|
||||||
+ void prepend_n(const std::vector<KVCacheBlock*>& blocks);
|
|
||||||
+ std::vector<KVCacheBlock*> get_all_free_blocks() const;
|
|
||||||
+
|
|
||||||
+private:
|
|
||||||
+ KVCacheBlock fake_head{-1};
|
|
||||||
+ KVCacheBlock fake_tail{-1};
|
|
||||||
+};
|
|
||||||
+
|
|
||||||
+// vLLM BlockPool (block_pool.py).
|
|
||||||
+class BlockPool {
|
|
||||||
+public:
|
|
||||||
+ KVCacheBlock* null_block = nullptr;
|
|
||||||
+
|
|
||||||
+ BlockPool(int32_t num_blocks, bool enable_caching);
|
|
||||||
+ std::vector<KVCacheBlock*> get_new_blocks(size_t n);
|
|
||||||
+ KVCacheBlock* get_cached_block(uint64_t block_hash);
|
|
||||||
+ void touch(const std::vector<KVCacheBlock*>& blocks);
|
|
||||||
+ void free_blocks(const std::vector<KVCacheBlock*>& ordered_blocks);
|
|
||||||
+ void cache_full_blocks(const std::vector<KVCacheBlock*>& req_blocks,
|
|
||||||
+ size_t num_cached_blocks, size_t num_full_blocks,
|
|
||||||
+ const std::vector<uint64_t>& block_hashes);
|
|
||||||
+ size_t get_num_free_blocks() const { return free_queue_.num_free_blocks; }
|
|
||||||
+
|
|
||||||
+private:
|
|
||||||
+ bool maybe_evict_cached_block(KVCacheBlock* block);
|
|
||||||
+
|
|
||||||
+ bool enable_caching_;
|
|
||||||
+ std::vector<KVCacheBlock> blocks_; // owns all block descriptors
|
|
||||||
+ std::vector<KVCacheBlock*> ptrs_;
|
|
||||||
+ FreeBlockQueue free_queue_;
|
|
||||||
+ // vLLM stores hash -> {block_id: block} to allow duplicate-content blocks; the
|
|
||||||
+ // prototype keeps the last writer (single KV-cache group is sufficient for the wins).
|
|
||||||
+ std::unordered_map<uint64_t, KVCacheBlock*> cached_block_hash_to_block_;
|
|
||||||
+};
|
|
||||||
+
|
|
||||||
+// Allocation + prefix-caching surface, ported from SingleTypeKVCacheManager /
|
|
||||||
+// FullAttentionManager. Single KV-cache group; no extra_keys / eagle / spec-decode.
|
|
||||||
+class PagedKVManager {
|
|
||||||
+public:
|
|
||||||
+ PagedKVManager(int32_t num_blocks, int block_size, bool enable_caching);
|
|
||||||
+
|
|
||||||
+ // Grow seq_id to cover total_tokens slots. Returns false on OOM (fewer free blocks than needed).
|
|
||||||
+ bool allocate(int seq_id, size_t total_tokens);
|
|
||||||
+ std::vector<int32_t> block_table(int seq_id) const;
|
|
||||||
+ int64_t slot(int seq_id, int pos) const;
|
|
||||||
+ std::vector<int64_t> slot_mapping(int seq_id, const std::vector<int>& positions) const;
|
|
||||||
+ void free(int seq_id);
|
|
||||||
+ int block_size() const { return block_size_; }
|
|
||||||
+
|
|
||||||
+ // Prefix caching (win 3).
|
|
||||||
+ static uint64_t hash_block(uint64_t parent_hash, const std::vector<int>& token_ids);
|
|
||||||
+ std::vector<uint64_t> compute_block_hashes(const std::vector<int>& token_ids) const;
|
|
||||||
+ size_t get_computed_blocks(const std::vector<uint64_t>& block_hashes); // returns num cached tokens; also takes a ref (touch) on every hit block
|
|
||||||
+ void cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens);
|
|
||||||
+
|
|
||||||
+protected:
|
|
||||||
+ int block_size_;
|
|
||||||
+ BlockPool pool_;
|
|
||||||
+ std::map<int, std::vector<KVCacheBlock*>> req_to_blocks_;
|
|
||||||
+};
|
|
||||||
+
|
|
||||||
+} // namespace paged
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,75 +0,0 @@
|
|||||||
From 5c9c709e6c6b07e0399b75fd4e46e752d418a9a8 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Fri, 19 Jun 2026 23:04:17 +0000
|
|
||||||
Subject: [PATCH] paged kv block placement (env LLAMA_KV_PAGED)
|
|
||||||
|
|
||||||
Place each sequence's tokens at permuted, non-contiguous fixed-size block
|
|
||||||
positions in find_slot, proving attention is invariant to physical KV placement
|
|
||||||
(token-identical greedy generation). Default off; single-sequence scope; falls
|
|
||||||
back to the normal allocator. The paged-placement substrate for the gather-read.
|
|
||||||
---
|
|
||||||
src/llama-kv-cache.cpp | 41 +++++++++++++++++++++++++++++++++++++++++
|
|
||||||
1 file changed, 41 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
|
||||||
index 2802103bd..999e2ae61 100644
|
|
||||||
--- a/src/llama-kv-cache.cpp
|
|
||||||
+++ b/src/llama-kv-cache.cpp
|
|
||||||
@@ -11,6 +11,8 @@
|
|
||||||
#include <cstring>
|
|
||||||
#include <limits>
|
|
||||||
#include <map>
|
|
||||||
+#include <numeric>
|
|
||||||
+#include <cstdlib>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
static bool ggml_is_power_of_2(int n) {
|
|
||||||
@@ -1020,6 +1022,45 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
|
||||||
return { };
|
|
||||||
}
|
|
||||||
|
|
||||||
+ // [paged, experimental] Place this sequence's tokens at permuted,
|
|
||||||
+ // non-contiguous fixed-size BLOCK positions instead of a contiguous run.
|
|
||||||
+ // This validates that attention is invariant to physical KV placement -
|
|
||||||
+ // the correctness premise of paged attention. Enabled via LLAMA_KV_PAGED.
|
|
||||||
+ // Single-sequence scope (uses get_used() as the logical base); falls back
|
|
||||||
+ // to the normal allocator if the permuted cells aren't available.
|
|
||||||
+ static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
|
||||||
+ if (paged_mode) {
|
|
||||||
+ const uint32_t bs = 16; // block size (tokens/block)
|
|
||||||
+ const uint32_t nblk = cells.size() / bs; // blocks in this stream's pool
|
|
||||||
+ if (nblk >= 2) {
|
|
||||||
+ // stride coprime to nblk => block-index permutation is a bijection
|
|
||||||
+ uint32_t k = 1;
|
|
||||||
+ for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) {
|
|
||||||
+ if (std::gcd(cand, nblk) == 1u) { k = cand; break; }
|
|
||||||
+ }
|
|
||||||
+ const uint32_t base = cells.get_used();
|
|
||||||
+ bool ok = true;
|
|
||||||
+ for (uint32_t i = 0; i < n_tokens; ++i) {
|
|
||||||
+ const uint32_t L = base + i;
|
|
||||||
+ const uint32_t b = L / bs;
|
|
||||||
+ const uint32_t off = L % bs;
|
|
||||||
+ if (b >= nblk) { ok = false; break; }
|
|
||||||
+ const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block
|
|
||||||
+ if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; }
|
|
||||||
+ res.idxs[s].push_back(phys);
|
|
||||||
+ }
|
|
||||||
+ if (ok && res.idxs[s].size() == n_tokens) {
|
|
||||||
+ if (std::getenv("LLAMA_KV_PAGED_DEBUG")) {
|
|
||||||
+ fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens);
|
|
||||||
+ for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]);
|
|
||||||
+ fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base);
|
|
||||||
+ }
|
|
||||||
+ continue; // paged placement succeeded for this sequence
|
|
||||||
+ }
|
|
||||||
+ res.idxs[s].clear(); // fall back to the normal allocator
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
uint32_t n_tested = 0;
|
|
||||||
|
|
||||||
// for continuous slots, we test that all tokens in the ubatch fit, starting from the current head
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,102 +0,0 @@
|
|||||||
# Patch 0003 — paged gather-read: exact implementation plan
|
|
||||||
|
|
||||||
**Goal:** a sequence attends only its own (compacted) cells via `ggml_get_rows`, instead of the scattered
|
|
||||||
`[0,n_kv)` window. Token-identical (attention is permutation-invariant over the KV set). **Gated**: stock
|
|
||||||
path stays byte-identical (no new ops unless `LLAMA_KV_PAGED`).
|
|
||||||
|
|
||||||
**Base:** applies on top of 0001+0002 at the pin. Dev tree: `backend/cpp/llama-cpp-paged-dev` (branch `paged`).
|
|
||||||
|
|
||||||
## Design
|
|
||||||
|
|
||||||
The gather is keyed off one runtime index list (the sequence's used cells, in a fixed order), exposed as a
|
|
||||||
graph input (mirroring `k_idxs`). In `build_attn`, gather K, V **and the kq_mask** by that same index, so all
|
|
||||||
three stay aligned. `n_gathered` replaces `n_kv` for the attention. Only active when the cache is in paged
|
|
||||||
mode (a new `is_paged()` flag set when `LLAMA_KV_PAGED`/find_slot used permuted placement).
|
|
||||||
|
|
||||||
ggml note: `ggml_get_rows(a,b)` gathers `a`'s **ne1** by `b` (I32). Raw K is `[n_embd_k_gqa, kv_size, n_stream]`
|
|
||||||
→ ne1 = cells → direct. The mask is `[n_kv, n_tokens, 1, n_stream]` → n_kv is **ne0**, so gather as
|
|
||||||
`transpose → get_rows → transpose`.
|
|
||||||
|
|
||||||
### KEY CORRECTIONS (found while implementing — these change the edits)
|
|
||||||
|
|
||||||
1. **Gather index = ALL used (non-empty) cells in `[0,n_kv)`, NOT `sinfo.idxs`.** `sinfo.idxs` is only the
|
|
||||||
*current ubatch's write slots*; attention reads the *full history*. The query set per token is masked by
|
|
||||||
`kq_mask`, so gathering the union of all used cells + gathering the mask the same way is token-identical
|
|
||||||
and drops exactly the empty (already-masked) cells. So: `gather = { i in [0,n_kv) : !cells.is_empty(i) }`.
|
|
||||||
|
|
||||||
2. **Static-graph size is fine because llama.cpp rebuilds the graph every ubatch.** `n_gather` (used-cell
|
|
||||||
count) is therefore a build-time constant for that ubatch — `build_input_gather_idxs` sizes the I32
|
|
||||||
tensor to `get_n_gather()` computed at build, `set_input_gather_idxs` fills the identical cell list. They
|
|
||||||
MUST use the same loop (`for i in [0,n_kv): if !is_empty(i) push i`) so build-order == fill-order.
|
|
||||||
|
|
||||||
3. **K/V gather can live entirely in `build_attn`, no cache get_k change.** The `get_k` 4d view is contiguous
|
|
||||||
in `[ne0,ne1,ne2]` from cell 0 (nb2 == n_embd_head*n_head_kv*elemsz), so for **single stream (ns==1)**:
|
|
||||||
`reshape_3d(k, n_embd_head*n_head_kv, n_kv, 1) → get_rows(., gi) → reshape_4d(., n_embd_head, n_head_kv, n_gather, 1)`.
|
|
||||||
Multi-stream (ns>1) breaks contiguity (nb3 uses kv_size) → gate to ns==1 first, multi-stream follow-up.
|
|
||||||
|
|
||||||
4. So the ONLY cache additions are `is_paged()`, `get_n_gather(n_kv)`, `build/set_input_gather_idxs(n_kv)`;
|
|
||||||
everything else (K/V/mask gather) is in `build_attn`. `set_input_kq_mask` is **unchanged** (built over
|
|
||||||
n_kv, then gathered). Smaller than the 7-edit estimate above.
|
|
||||||
|
|
||||||
## Edits
|
|
||||||
|
|
||||||
### 1. `src/llama-kv-cache.h` — declare gather infra (in `llama_kv_cache`)
|
|
||||||
```cpp
|
|
||||||
bool is_paged() const { return paged_active; } // near get_size()
|
|
||||||
ggml_tensor * build_input_gather_idxs(ggml_context * ctx, const slot_info & sinfo) const;
|
|
||||||
void set_input_gather_idxs (ggml_tensor * dst, const slot_info & sinfo) const;
|
|
||||||
uint32_t get_n_gather(const slot_info & sinfo) const; // == sum of used cells gathered
|
|
||||||
```
|
|
||||||
Add member `mutable bool paged_active = false;` and in `llama_kv_cache_context` forward the three (like
|
|
||||||
`build_input_k_idxs`/`get_n_kv`).
|
|
||||||
|
|
||||||
### 2. `src/llama-kv-cache.cpp`
|
|
||||||
- In `find_slot`, in the paged branch (0002), set `paged_active = true;` on success.
|
|
||||||
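- `get_n_gather(sinfo)` = `sinfo.idxs[0].size()` summed over streams (the count actually placed). **Superseded by KEY CORRECTION 1:** count every non-empty cell in `[0,n_kv)`, not just `sinfo.idxs`.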
|
|
||||||
- `build_input_gather_idxs`: `ggml_new_tensor_1d(ctx, GGML_TYPE_I32, get_n_gather(sinfo)); ggml_set_input(...)`.
|
|
||||||
- `set_input_gather_idxs`: fill `data[k++] = strm_off + sinfo.idxs[s][i]` for every placed cell (same order
|
|
||||||
the mask/k/v will see). This is the canonical gather order.
|
|
||||||
|
|
||||||
### 3. `src/llama-graph.h` — `llm_graph_input_attn_kv`
|
|
||||||
Add `ggml_tensor * gather_idxs = nullptr;` + `ggml_tensor * get_gather_idxs() const { return gather_idxs; }`.
|
|
||||||
|
|
||||||
### 4. `src/llama-graph.cpp`
|
|
||||||
- `llm_graph_input_attn_kv::set_input`: if `mctx->is_paged()` → `mctx->set_input_gather_idxs(gather_idxs, ...)`.
|
|
||||||
- `build_attn_inp_kv` (creates the input): if `mctx_cur->is_paged()` → `inp->gather_idxs =
|
|
||||||
mctx_cur->build_input_gather_idxs(ctx0, ...)`.
|
|
||||||
- `build_attn` (the kv overload, ~2356): after `k`,`v`,`kq_mask`:
|
|
||||||
```cpp
|
|
||||||
if (ggml_tensor * gi = inp->get_gather_idxs()) {
|
|
||||||
k = ggml_get_rows(ctx0, k, gi); // [d, n_gather, ...] (reshape view ok)
|
|
||||||
v = v_trans ? /* gather columns */ : ggml_get_rows(ctx0, v, gi);
|
|
||||||
ggml_tensor * m = ggml_cont(ctx0, ggml_transpose(ctx0, kq_mask)); // [n_tokens, n_kv]
|
|
||||||
m = ggml_get_rows(ctx0, m, gi); // [n_tokens, n_gather]
|
|
||||||
kq_mask = ggml_cont(ctx0, ggml_transpose(ctx0, m)); // [n_gather, n_tokens]
|
|
||||||
}
|
|
||||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
|
||||||
```
|
|
||||||
Note: `get_k` returns the reshaped 4d view; gather must run on a cell-major shape. Simplest: add a paged
|
|
||||||
variant `get_k(ctx,il)` that returns `ggml_get_rows` of the **raw** `layers[ikv].k` then reshapes to
|
|
||||||
`[n_embd_head, n_head_kv, n_gather, ns]`. Do the gather in the cache, not the graph, for K/V; keep only the
|
|
||||||
mask gather in the graph. (Cleaner — revisit during impl.)
|
|
||||||
|
|
||||||
### 5. V-transposed path
|
|
||||||
When `!flash_attn`, V is stored transposed `[kv_size, n_embd_v_gqa]`; gathering its **rows** (ne1 = n_embd) won't
|
|
||||||
work — gather columns via the same idx on the non-transposed store, OR force `is_paged()` to require
|
|
||||||
flash-attn for the first cut (`GGML_ASSERT`) and handle v_trans in a follow-up.
|
|
||||||
|
|
||||||
## Verification (the gate)
|
|
||||||
```sh
|
|
||||||
cmake --build build-cpu --target llama-simple -j
|
|
||||||
M=Qwen3-0.6B.Q4_K_M.gguf ; P="<the 0002 prompt>"
|
|
||||||
build-cpu/bin/llama-simple -m $M -n 64 "$P" > a.txt # stock
|
|
||||||
LLAMA_KV_PAGED=1 build-cpu/bin/llama-simple -m $M -n 64 "$P" > b.txt # paged gather-read
|
|
||||||
diff a.txt b.txt # MUST be identical
|
|
||||||
```
|
|
||||||
Also assert (debug) that `n_gather < n_kv` on a multi-chunk sequence (proves compaction, not identity).
|
|
||||||
Export only when identical: `git format-patch HEAD~1 -o patches/ --start-number 3 -N`.
|
|
||||||
|
|
||||||
## Risks
|
|
||||||
- Mask transpose/layout: if `b.txt` diverges, dump the gathered mask vs expected for token 0; off-by-order
|
|
||||||
means the `set_input_gather_idxs` order ≠ the get_k gather order — they MUST use the identical loop.
|
|
||||||
- flash-attn vs not: do flash-attn first (simpler mask), then v_trans.
|
|
||||||
@@ -1,369 +0,0 @@
|
|||||||
From c1de00f4cc1eb0dd25993880bb4c8562be1937d4 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Mon, 22 Jun 2026 10:24:22 +0200
|
|
||||||
Subject: [PATCH] paged gather-read (env LLAMA_KV_PAGED) - patch 0003
|
|
||||||
|
|
||||||
Gather K, V and the kq_mask down to each sequence stream's non-empty cells
|
|
||||||
before build_attn_mha. Position-sorted per stream so the flash-attn online
|
|
||||||
softmax reduction order matches stock byte-for-byte. Multi-stream: one index
|
|
||||||
column per stream over k->ne[3], padded to the max non-empty count with a
|
|
||||||
masked (empty) cell. Gated behind LLAMA_KV_PAGED; no-op when unset.
|
|
||||||
---
|
|
||||||
src/CMakeLists.txt | 1 +
|
|
||||||
src/llama-graph.cpp | 9 ++-
|
|
||||||
src/llama-kv-cache.cpp | 74 ++++++++++++++++++++++++
|
|
||||||
src/llama-kv-cache.h | 11 ++++
|
|
||||||
src/paged-attn.cpp | 128 +++++++++++++++++++++++++++++++++++++++++
|
|
||||||
src/paged-attn.h | 40 +++++++++++++
|
|
||||||
6 files changed, 262 insertions(+), 1 deletion(-)
|
|
||||||
create mode 100644 src/paged-attn.cpp
|
|
||||||
create mode 100644 src/paged-attn.h
|
|
||||||
|
|
||||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
|
||||||
index a030940..58083b3 100644
|
|
||||||
--- a/src/CMakeLists.txt
|
|
||||||
+++ b/src/CMakeLists.txt
|
|
||||||
@@ -25,6 +25,7 @@ add_library(llama
|
|
||||||
llama-kv-cache.cpp
|
|
||||||
llama-kv-cache-iswa.cpp
|
|
||||||
paged-kv-manager.cpp
|
|
||||||
+ paged-attn.cpp
|
|
||||||
llama-kv-cache-dsa.cpp
|
|
||||||
llama-memory.cpp
|
|
||||||
llama-memory-hybrid.cpp
|
|
||||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
|
||||||
index 68c9e60..b59d2a5 100644
|
|
||||||
--- a/src/llama-graph.cpp
|
|
||||||
+++ b/src/llama-graph.cpp
|
|
||||||
@@ -6,6 +6,8 @@
|
|
||||||
#include "llama-cparams.h"
|
|
||||||
|
|
||||||
#include "llama-kv-cache.h"
|
|
||||||
+
|
|
||||||
+#include "paged-attn.h"
|
|
||||||
#include "llama-kv-cache-iswa.h"
|
|
||||||
#include "llama-kv-cache-dsa.h"
|
|
||||||
#include "llama-memory-hybrid.h"
|
|
||||||
@@ -2356,7 +2358,12 @@ ggml_tensor * llm_graph_context::build_attn(
|
|
||||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
|
||||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
|
||||||
|
|
||||||
- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, sinks, v_mla, kq_scale, il);
|
|
||||||
+ // [paged 0003] gather K, V and the mask to the sequence's used cells only
|
|
||||||
+ // (no-op unless env LLAMA_KV_PAGED is set).
|
|
||||||
+ ggml_tensor * kq_mask_g = kq_mask;
|
|
||||||
+ paged_attn::gather(ctx0, res, mctx_cur, &k, &v, &kq_mask_g);
|
|
||||||
+
|
|
||||||
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_g, sinks, v_mla, kq_scale, il);
|
|
||||||
cb(cur, "kqv_out", il);
|
|
||||||
|
|
||||||
if (inp->self_v_rot) {
|
|
||||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
|
||||||
index 999e2ae..30d02d7 100644
|
|
||||||
--- a/src/llama-kv-cache.cpp
|
|
||||||
+++ b/src/llama-kv-cache.cpp
|
|
||||||
@@ -1,4 +1,6 @@
|
|
||||||
#include "llama-kv-cache.h"
|
|
||||||
+#include <vector>
|
|
||||||
+#include <utility>
|
|
||||||
|
|
||||||
#include "llama-impl.h"
|
|
||||||
#include "llama-io.h"
|
|
||||||
@@ -1329,6 +1331,70 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
|
|
||||||
ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
|
|
||||||
}
|
|
||||||
|
|
||||||
+// [paged 0003] gather-read: enumerate the non-empty cells in [0, n_kv) for the
|
|
||||||
+// streams [s0, s1] addressed by sinfo. With paged placement (patch 0002) these are
|
|
||||||
+// the sequence's scattered block cells; gathering K/V/mask by this index list
|
|
||||||
+// compacts the attention read while preserving every unmasked (token,cell) pair.
|
|
||||||
+uint32_t llama_kv_cache::get_n_gather(uint32_t n_kv, const slot_info & sinfo) const {
|
|
||||||
+ // Multi-stream: the gathered K/V/mask tensors are rectangular [.., n_gather,
|
|
||||||
+ // n_stream], so n_gather is the MAX non-empty count across the batch streams.
|
|
||||||
+ // Streams with fewer cells are padded (see get_gather_idxs) with a masked
|
|
||||||
+ // (empty) cell index, which contributes exp(-inf)=0 and is thus a no-op.
|
|
||||||
+ // K is laid out over physical streams [s0, s1]; index v_cells the same way.
|
|
||||||
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
|
|
||||||
+ uint32_t mx = 0;
|
|
||||||
+ for (uint32_t j = 0; j < ns; ++j) {
|
|
||||||
+ const auto & cells = v_cells[sinfo.s0 + j];
|
|
||||||
+ const uint32_t n = std::min<uint32_t>(n_kv, cells.size());
|
|
||||||
+ uint32_t cnt = 0;
|
|
||||||
+ for (uint32_t i = 0; i < n; ++i) {
|
|
||||||
+ if (!cells.is_empty(i)) {
|
|
||||||
+ ++cnt;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ mx = std::max(mx, cnt);
|
|
||||||
+ }
|
|
||||||
+ return mx;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void llama_kv_cache::get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const {
|
|
||||||
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
|
|
||||||
+ const uint32_t n_gather = get_n_gather(n_kv, sinfo);
|
|
||||||
+ // dst is [n_gather, n_stream] (ne0 = n_gather): column s at dst[s*n_gather..].
|
|
||||||
+ for (uint32_t j = 0; j < ns; ++j) {
|
|
||||||
+ const auto & cells = v_cells[sinfo.s0 + j];
|
|
||||||
+ const uint32_t n = std::min<uint32_t>(n_kv, cells.size());
|
|
||||||
+ // Collect the non-empty cells, then order them by token POSITION (not by
|
|
||||||
+ // physical cell index). The attention reduction (flash-attn online
|
|
||||||
+ // softmax, and the non-flash soft_max) runs over cells in array order and
|
|
||||||
+ // is order-sensitive in floating point. Stock (contiguous) placement
|
|
||||||
+ // happens to store cells in position order, so emitting the gathered
|
|
||||||
+ // indices in position order reproduces stock's exact reduction order -
|
|
||||||
+ // making the paged read bit-identical, not merely math-equivalent.
|
|
||||||
+ std::vector<std::pair<llama_pos, int32_t>> pc;
|
|
||||||
+ pc.reserve(n);
|
|
||||||
+ int32_t pad = -1;
|
|
||||||
+ for (uint32_t i = 0; i < n; ++i) {
|
|
||||||
+ if (!cells.is_empty(i)) {
|
|
||||||
+ pc.emplace_back(cells.pos_get(i), (int32_t) i);
|
|
||||||
+ } else if (pad < 0) {
|
|
||||||
+ pad = (int32_t) i; // first empty cell: its mask is -inf -> safe pad
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ std::sort(pc.begin(), pc.end());
|
|
||||||
+ int32_t * col = dst + (size_t) j * n_gather;
|
|
||||||
+ for (size_t k = 0; k < pc.size(); ++k) {
|
|
||||||
+ col[k] = pc[k].second;
|
|
||||||
+ }
|
|
||||||
+ // Pad the tail to n_gather with a masked (empty) cell so the rectangular
|
|
||||||
+ // gather drops to zero contribution for streams shorter than the max.
|
|
||||||
+ const int32_t padv = (pad >= 0) ? pad : (pc.empty() ? 0 : pc.back().second);
|
|
||||||
+ for (uint32_t k = (uint32_t) pc.size(); k < n_gather; ++k) {
|
|
||||||
+ col[k] = padv;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
|
|
||||||
GGML_UNUSED(sinfo);
|
|
||||||
|
|
||||||
@@ -2620,6 +2686,14 @@ ggml_tensor * llama_kv_cache_context::get_v(ggml_context * ctx, int32_t il) cons
|
|
||||||
return kv->get_v(ctx, il, n_kv, sinfos[i_cur]);
|
|
||||||
}
|
|
||||||
|
|
||||||
+uint32_t llama_kv_cache_context::get_n_gather() const {
|
|
||||||
+ return kv->get_n_gather(n_kv, sinfos[i_cur]);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void llama_kv_cache_context::get_gather_idxs(int32_t * dst) const {
|
|
||||||
+ kv->get_gather_idxs(dst, n_kv, sinfos[i_cur]);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
|
|
||||||
return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
|
|
||||||
}
|
|
||||||
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
|
|
||||||
index 3d68f98..494c0fb 100644
|
|
||||||
--- a/src/llama-kv-cache.h
|
|
||||||
+++ b/src/llama-kv-cache.h
|
|
||||||
@@ -171,6 +171,12 @@ public:
|
|
||||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
|
|
||||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
|
|
||||||
|
|
||||||
+ // [paged 0003] count / list the non-empty cells in [0, n_kv) per stream of
|
|
||||||
+ // sinfo (position-sorted, padded across streams). Used by paged-attn
|
|
||||||
+ // gather-read. get_n_gather returns the max count across streams.
|
|
||||||
+ uint32_t get_n_gather(uint32_t n_kv, const slot_info & sinfo) const;
|
|
||||||
+ void get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const;
|
|
||||||
+
|
|
||||||
// store k_cur and v_cur in the cache based on the provided head location
|
|
||||||
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
|
|
||||||
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const;
|
|
||||||
@@ -368,6 +374,11 @@ public:
|
|
||||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
|
|
||||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
|
||||||
|
|
||||||
+ // [paged 0003] gather-read helpers (delegate to the kv cache for the
|
|
||||||
+ // current ubatch's stream).
|
|
||||||
+ uint32_t get_n_gather() const;
|
|
||||||
+ void get_gather_idxs(int32_t * dst) const;
|
|
||||||
+
|
|
||||||
// store k_cur and v_cur in the cache based on the provided head location
|
|
||||||
// note: the heads in k_cur and v_cur should be laid out contiguously in memory
|
|
||||||
// - k_cur [n_embd_head_k, n_head_k, n_tokens]
|
|
||||||
diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..ade75e8
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/src/paged-attn.cpp
|
|
||||||
@@ -0,0 +1,128 @@
|
|
||||||
+#include "paged-attn.h"
|
|
||||||
+
|
|
||||||
+#include "llama-graph.h"
|
|
||||||
+#include "llama-kv-cache.h"
|
|
||||||
+
|
|
||||||
+#include "ggml.h"
|
|
||||||
+#include "ggml-backend.h"
|
|
||||||
+
|
|
||||||
+#include <cstdlib>
|
|
||||||
+#include <cstdio>
|
|
||||||
+
|
|
||||||
+namespace paged_attn {
|
|
||||||
+
|
|
||||||
+bool active() {
|
|
||||||
+ static const bool a = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
|
||||||
+ return a;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static bool debug() {
|
|
||||||
+ static const bool d = (std::getenv("LLAMA_KV_PAGED_DEBUG") != nullptr);
|
|
||||||
+ return d;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+namespace {
|
|
||||||
+
|
|
||||||
+// Graph input that, at set_input time, fills an I32 [n_gather, n_stream] tensor
|
|
||||||
+// with each stream's non-empty cell indices (position-sorted, padded with a
|
|
||||||
+// masked/empty cell) by delegating to the kv-cache context. Private to this
|
|
||||||
+// unit; default can_reuse()==false keeps the graph from being reused across
|
|
||||||
+// decodes (n_gather grows every step).
|
|
||||||
+class input_gather_idxs : public llm_graph_input_i {
|
|
||||||
+public:
|
|
||||||
+ input_gather_idxs(const llama_kv_cache_context * mctx, ggml_tensor * idxs)
|
|
||||||
+ : mctx(mctx), idxs(idxs) {}
|
|
||||||
+
|
|
||||||
+ void set_input(const llama_ubatch * ubatch) override {
|
|
||||||
+ GGML_UNUSED(ubatch);
|
|
||||||
+ GGML_ASSERT(idxs && ggml_backend_buffer_is_host(idxs->buffer));
|
|
||||||
+ mctx->get_gather_idxs((int32_t *) idxs->data);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ const llama_kv_cache_context * mctx;
|
|
||||||
+ ggml_tensor * idxs;
|
|
||||||
+};
|
|
||||||
+
|
|
||||||
+} // namespace
|
|
||||||
+
|
|
||||||
+void gather(ggml_context * ctx0,
|
|
||||||
+ llm_graph_result * res,
|
|
||||||
+ const llama_kv_cache_context * mctx,
|
|
||||||
+ ggml_tensor ** k,
|
|
||||||
+ ggml_tensor ** v,
|
|
||||||
+ ggml_tensor ** kq_mask) {
|
|
||||||
+ if (!active()) {
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ ggml_tensor * K = *k;
|
|
||||||
+ ggml_tensor * V = *v;
|
|
||||||
+ ggml_tensor * M = *kq_mask;
|
|
||||||
+
|
|
||||||
+ // Number of streams (sequences) in the unified batch. K is laid out
|
|
||||||
+ // [d, h, n_kv, n_stream] and the mask is [n_kv, n_tps, 1, n_stream]; the
|
|
||||||
+ // gather is per-stream (one index column per stream), so a single
|
|
||||||
+ // ggml_get_rows over the stream axis handles 1..N streams uniformly.
|
|
||||||
+ const int64_t n_stream = K->ne[3];
|
|
||||||
+ GGML_ASSERT(M->ne[3] == n_stream);
|
|
||||||
+
|
|
||||||
+ const int64_t n_gather = (int64_t) mctx->get_n_gather();
|
|
||||||
+ if (n_gather <= 0) {
|
|
||||||
+ // Worst-case graph reserve (empty cache) or nothing placed yet: leave
|
|
||||||
+ // the full [0, n_kv) read untouched so buffer sizing stays worst-case.
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (debug()) {
|
|
||||||
+ static int64_t once = 0;
|
|
||||||
+ if (once++ < 2) {
|
|
||||||
+ fprintf(stderr, "[paged-attn] gather n_stream=%lld n_kv=%lld n_gather=%lld\n",
|
|
||||||
+ (long long) n_stream, (long long) K->ne[2], (long long) n_gather);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // Per-stream index tensor [n_gather, n_stream], filled at set_input from
|
|
||||||
+ // each stream's non-empty cells. ggml_get_rows broadcasts along ne[1]==
|
|
||||||
+ // n_stream, so column s gathers from stream s of the source.
|
|
||||||
+ ggml_tensor * idx = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_gather, n_stream);
|
|
||||||
+ ggml_set_input(idx);
|
|
||||||
+ res->add_input(llm_graph_input_ptr(new input_gather_idxs(mctx, idx)));
|
|
||||||
+
|
|
||||||
+ // --- gather K: collapse (head_dim, n_head) so cells become the row axis ---
|
|
||||||
+ {
|
|
||||||
+ ggml_tensor * t = ggml_cont(ctx0, K); // [d, h, n_kv, ns]
|
|
||||||
+ t = ggml_reshape_3d(ctx0, t, K->ne[0]*K->ne[1], K->ne[2], n_stream); // [d*h, n_kv, ns]
|
|
||||||
+ t = ggml_get_rows(ctx0, t, idx); // [d*h, n_gather, ns]
|
|
||||||
+ *k = ggml_reshape_4d(ctx0, t, K->ne[0], K->ne[1], n_gather, n_stream); // [d, h, n_gather, ns]
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // --- gather V ---
|
|
||||||
+ // Normalize to a non-transposed [d, h, n_kv, ns] view first, so the gathered
|
|
||||||
+ // result is contiguous and build_attn_mha sees a consistent v_trans==false.
|
|
||||||
+ {
|
|
||||||
+ const bool v_trans = V->nb[1] > V->nb[2];
|
|
||||||
+ ggml_tensor * vsrc = v_trans
|
|
||||||
+ ? ggml_permute(ctx0, V, 2, 1, 0, 3) // [n_kv, h, d, ns] -> [d, h, n_kv, ns]
|
|
||||||
+ : V; // already [d, h, n_kv, ns]
|
|
||||||
+ ggml_tensor * t = ggml_cont(ctx0, vsrc); // [d, h, n_kv, ns]
|
|
||||||
+ t = ggml_reshape_3d(ctx0, t, vsrc->ne[0]*vsrc->ne[1], vsrc->ne[2], n_stream); // [d*h, n_kv, ns]
|
|
||||||
+ t = ggml_get_rows(ctx0, t, idx); // [d*h, n_gather, ns]
|
|
||||||
+ *v = ggml_reshape_4d(ctx0, t, vsrc->ne[0], vsrc->ne[1], n_gather, n_stream); // [d, h, n_gather, ns]
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // --- gather mask (cells are ne0): transpose so cells become the row axis,
|
|
||||||
+ // gather per stream, transpose back ---
|
|
||||||
+ {
|
|
||||||
+ ggml_tensor * m = ggml_reshape_3d(ctx0, M, M->ne[0], M->ne[1], n_stream); // [n_kv, n_tps, ns]
|
|
||||||
+ m = ggml_cont(ctx0, ggml_transpose(ctx0, m)); // [n_tps, n_kv, ns]
|
|
||||||
+ m = ggml_get_rows(ctx0, m, idx); // [n_tps, n_gather, ns] (F32)
|
|
||||||
+ m = ggml_cont(ctx0, ggml_transpose(ctx0, m)); // [n_gather, n_tps, ns]
|
|
||||||
+ m = ggml_reshape_4d(ctx0, m, n_gather, M->ne[1], 1, n_stream);
|
|
||||||
+ if (M->type != m->type) {
|
|
||||||
+ m = ggml_cast(ctx0, m, M->type); // flash-attn requires an F16 mask
|
|
||||||
+ }
|
|
||||||
+ *kq_mask = m;
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+} // namespace paged_attn
|
|
||||||
diff --git a/src/paged-attn.h b/src/paged-attn.h
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..c5b7bd7
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/src/paged-attn.h
|
|
||||||
@@ -0,0 +1,40 @@
|
|
||||||
+#pragma once
|
|
||||||
+// Paged attention gather-read (patch 0003, experimental).
|
|
||||||
+//
|
|
||||||
+// Companion to the paged block placement in llama_kv_cache::find_slot (patch
|
|
||||||
+// 0002). Patch 0002 places a sequence's tokens at permuted, non-contiguous
|
|
||||||
+// fixed-size block cells, but attention still reads the whole [0, n_kv) window
|
|
||||||
+// (empty cells masked to -inf). This unit compacts that read: it gathers K, V
|
|
||||||
+// and the kq_mask down to ONLY the sequence's used (non-empty) cells before
|
|
||||||
+// build_attn_mha.
|
|
||||||
+//
|
|
||||||
+// Correctness: attention is permutation-invariant over the KV set, and dropping
|
|
||||||
+// already-masked empty cells removes only exp(-inf)=0 terms - so greedy output
|
|
||||||
+// is identical to stock. Gated behind env LLAMA_KV_PAGED; a no-op when unset.
|
|
||||||
+//
|
|
||||||
+// All logic lives here to keep the core files additive: build_attn gets one
|
|
||||||
+// call, llama_kv_cache_context gets two thin accessors, CMake gets one line.
|
|
||||||
+
|
|
||||||
+#include <cstdint>
|
|
||||||
+
|
|
||||||
+struct ggml_context;
|
|
||||||
+struct ggml_tensor;
|
|
||||||
+class llm_graph_result;
|
|
||||||
+class llama_kv_cache_context;
|
|
||||||
+
|
|
||||||
+namespace paged_attn {
|
|
||||||
+
|
|
||||||
+// true iff env LLAMA_KV_PAGED is set (evaluated once).
|
|
||||||
+bool active();
|
|
||||||
+
|
|
||||||
+// Gather K, V and the kq_mask down to the current sequence's non-empty cells.
|
|
||||||
+// No-op (returns immediately) unless active(). On return *k, *v and *kq_mask
|
|
||||||
+// point at the compacted tensors; pass them straight to build_attn_mha.
|
|
||||||
+void gather(ggml_context * ctx0,
|
|
||||||
+ llm_graph_result * res,
|
|
||||||
+ const llama_kv_cache_context * mctx,
|
|
||||||
+ ggml_tensor ** k,
|
|
||||||
+ ggml_tensor ** v,
|
|
||||||
+ ggml_tensor ** kq_mask);
|
|
||||||
+
|
|
||||||
+} // namespace paged_attn
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,298 +0,0 @@
|
|||||||
From 7c294973de28d1ac991505638d726acfb371d541 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Mon, 22 Jun 2026 10:50:35 +0200
|
|
||||||
Subject: [PATCH] paged on-demand block allocation (env LLAMA_KV_PAGED) - patch
|
|
||||||
0004
|
|
||||||
|
|
||||||
Drive the paged placement in find_slot through the vendored PagedKVManager
|
|
||||||
(patch 0001) instead of a fixed full-pool permutation. Blocks are popped from a
|
|
||||||
free pool on demand as the sequence crosses block boundaries (peak << full
|
|
||||||
reservation) and returned on sequence end (seq_rm full removal / clear). One
|
|
||||||
manager per (kv-cache, stream); all state lives in the new src/paged-alloc unit,
|
|
||||||
so the core kv-cache struct is untouched - find_slot/clear/seq_rm gain only a
|
|
||||||
gated call. Default off; stock path byte-identical.
|
|
||||||
---
|
|
||||||
src/CMakeLists.txt | 1 +
|
|
||||||
src/llama-kv-cache.cpp | 69 +++++++++++++++++----------
|
|
||||||
src/paged-alloc.cpp | 106 +++++++++++++++++++++++++++++++++++++++++
|
|
||||||
src/paged-alloc.h | 39 +++++++++++++++
|
|
||||||
4 files changed, 190 insertions(+), 25 deletions(-)
|
|
||||||
create mode 100644 src/paged-alloc.cpp
|
|
||||||
create mode 100644 src/paged-alloc.h
|
|
||||||
|
|
||||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
|
||||||
index 58083b3..4d9d7d1 100644
|
|
||||||
--- a/src/CMakeLists.txt
|
|
||||||
+++ b/src/CMakeLists.txt
|
|
||||||
@@ -26,6 +26,7 @@ add_library(llama
|
|
||||||
llama-kv-cache-iswa.cpp
|
|
||||||
paged-kv-manager.cpp
|
|
||||||
paged-attn.cpp
|
|
||||||
+ paged-alloc.cpp
|
|
||||||
llama-kv-cache-dsa.cpp
|
|
||||||
llama-memory.cpp
|
|
||||||
llama-memory-hybrid.cpp
|
|
||||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
|
||||||
index 30d02d7..1125d9a 100644
|
|
||||||
--- a/src/llama-kv-cache.cpp
|
|
||||||
+++ b/src/llama-kv-cache.cpp
|
|
||||||
@@ -1,4 +1,5 @@
|
|
||||||
#include "llama-kv-cache.h"
|
|
||||||
+#include "paged-alloc.h"
|
|
||||||
#include <vector>
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
@@ -381,6 +382,11 @@ llama_kv_cache::llama_kv_cache(
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_kv_cache::clear(bool data) {
|
|
||||||
+ // [paged 0004] return all on-demand blocks to the pool on cache clear.
|
|
||||||
+ if (paged_alloc::active()) {
|
|
||||||
+ paged_alloc::release_all(this);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
for (uint32_t s = 0; s < n_stream; ++s) {
|
|
||||||
v_cells[s].reset();
|
|
||||||
v_heads[s] = 0;
|
|
||||||
@@ -409,6 +415,16 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
|
||||||
p1 = std::numeric_limits<llama_pos>::max();
|
|
||||||
}
|
|
||||||
|
|
||||||
+ // [paged 0004] free a stream's on-demand blocks when its whole sequence is
|
|
||||||
+ // removed (sequence end), so they return to the pool for reuse.
|
|
||||||
+ if (paged_alloc::active() && p0 == 0 && p1 == std::numeric_limits<llama_pos>::max()) {
|
|
||||||
+ if (seq_id >= 0) {
|
|
||||||
+ paged_alloc::release(this, (int) seq_to_stream[seq_id]);
|
|
||||||
+ } else {
|
|
||||||
+ paged_alloc::release_all(this);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
if (seq_id >= 0) {
|
|
||||||
auto & cells = v_cells[seq_to_stream[seq_id]];
|
|
||||||
auto & head = v_heads[seq_to_stream[seq_id]];
|
|
||||||
@@ -1030,36 +1046,39 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
|
||||||
// the correctness premise of paged attention. Enabled via LLAMA_KV_PAGED.
|
|
||||||
// Single-sequence scope (uses get_used() as the logical base); falls back
|
|
||||||
// to the normal allocator if the permuted cells aren't available.
|
|
||||||
- static const bool paged_mode = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
|
||||||
- if (paged_mode) {
|
|
||||||
+ // [paged 0004] On-demand block allocation. Patch 0002 proved attention is
|
|
||||||
+ // invariant to physical KV placement; here that placement is driven by
|
|
||||||
+ // the vendored PagedKVManager (patch 0001): blocks are popped from a free
|
|
||||||
+ // pool only as the sequence crosses block boundaries (peak << full
|
|
||||||
+ // reservation) and returned on sequence end. Enabled via LLAMA_KV_PAGED;
|
|
||||||
+ // falls back to the normal allocator on pool exhaustion or any conflict.
|
|
||||||
+ if (paged_alloc::active()) {
|
|
||||||
const uint32_t bs = 16; // block size (tokens/block)
|
|
||||||
- const uint32_t nblk = cells.size() / bs; // blocks in this stream's pool
|
|
||||||
+ const uint32_t nblk = cells.size() / bs; // this stream's block budget
|
|
||||||
if (nblk >= 2) {
|
|
||||||
- // stride coprime to nblk => block-index permutation is a bijection
|
|
||||||
- uint32_t k = 1;
|
|
||||||
- for (uint32_t cand = (nblk / 2) | 1u; cand < nblk; cand += 2) {
|
|
||||||
- if (std::gcd(cand, nblk) == 1u) { k = cand; break; }
|
|
||||||
- }
|
|
||||||
const uint32_t base = cells.get_used();
|
|
||||||
- bool ok = true;
|
|
||||||
- for (uint32_t i = 0; i < n_tokens; ++i) {
|
|
||||||
- const uint32_t L = base + i;
|
|
||||||
- const uint32_t b = L / bs;
|
|
||||||
- const uint32_t off = L % bs;
|
|
||||||
- if (b >= nblk) { ok = false; break; }
|
|
||||||
- const uint32_t phys = ((b * k) % nblk) * bs + off; // permuted block
|
|
||||||
- if (phys >= cells.size() || !cells.is_empty(phys)) { ok = false; break; }
|
|
||||||
- res.idxs[s].push_back(phys);
|
|
||||||
- }
|
|
||||||
- if (ok && res.idxs[s].size() == n_tokens) {
|
|
||||||
- if (std::getenv("LLAMA_KV_PAGED_DEBUG")) {
|
|
||||||
- fprintf(stderr, "[paged] seq placed %u tok at cells:", n_tokens);
|
|
||||||
- for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]);
|
|
||||||
- fprintf(stderr, " (k=%u nblk=%u base=%u)\n", k, nblk, base);
|
|
||||||
+ const int strm = (int) seq_to_stream[seq_id];
|
|
||||||
+ std::vector<uint32_t> placed;
|
|
||||||
+ if (paged_alloc::place(this, strm, base, n_tokens, bs, nblk, placed)) {
|
|
||||||
+ bool ok = (placed.size() == n_tokens);
|
|
||||||
+ for (uint32_t i = 0; ok && i < n_tokens; ++i) {
|
|
||||||
+ if (placed[i] >= cells.size() || !cells.is_empty(placed[i])) {
|
|
||||||
+ ok = false;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ if (ok) {
|
|
||||||
+ for (uint32_t phys : placed) {
|
|
||||||
+ res.idxs[s].push_back(phys);
|
|
||||||
+ }
|
|
||||||
+ if (std::getenv("LLAMA_KV_PAGED_DEBUG")) {
|
|
||||||
+ fprintf(stderr, "[paged] stream %d placed %u tok at cells:", strm, n_tokens);
|
|
||||||
+ for (uint32_t z = 0; z < res.idxs[s].size() && z < 24; ++z) fprintf(stderr, " %u", res.idxs[s][z]);
|
|
||||||
+ fprintf(stderr, " (nblk=%u base=%u)\n", nblk, base);
|
|
||||||
+ }
|
|
||||||
+ continue; // on-demand paged placement succeeded
|
|
||||||
}
|
|
||||||
- continue; // paged placement succeeded for this sequence
|
|
||||||
+ res.idxs[s].clear(); // fall back to the normal allocator
|
|
||||||
}
|
|
||||||
- res.idxs[s].clear(); // fall back to the normal allocator
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
diff --git a/src/paged-alloc.cpp b/src/paged-alloc.cpp
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..1d13f9c
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/src/paged-alloc.cpp
|
|
||||||
@@ -0,0 +1,106 @@
|
|
||||||
+#include "paged-alloc.h"
|
|
||||||
+#include "paged-kv-manager.h"
|
|
||||||
+
|
|
||||||
+#include <cstdlib>
|
|
||||||
+#include <cstdio>
|
|
||||||
+#include <map>
|
|
||||||
+#include <memory>
|
|
||||||
+#include <utility>
|
|
||||||
+
|
|
||||||
+namespace paged_alloc {
|
|
||||||
+
|
|
||||||
+bool active() {
|
|
||||||
+ static const bool a = (std::getenv("LLAMA_KV_PAGED") != nullptr);
|
|
||||||
+ return a;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+static bool debug() {
|
|
||||||
+ static const bool d = (std::getenv("LLAMA_KV_PAGED_DEBUG") != nullptr);
|
|
||||||
+ return d;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+namespace {
|
|
||||||
+
|
|
||||||
+using key_t = std::pair<const void *, int>;
|
|
||||||
+
|
|
||||||
+// One PagedKVManager per (kv-cache, stream): each stream owns a separate
|
|
||||||
+// physical pool of cells.size() cells, so a manager's block ids map directly to
|
|
||||||
+// cell ranges within that stream's pool. The internal request id is always 0.
|
|
||||||
+std::map<key_t, std::unique_ptr<paged::PagedKVManager>> g_managers;
|
|
||||||
+
|
|
||||||
+paged::PagedKVManager * get_mgr(const void * cache, int stream,
|
|
||||||
+ uint32_t pool_blocks, uint32_t block_size) {
|
|
||||||
+ const key_t k{cache, stream};
|
|
||||||
+ auto it = g_managers.find(k);
|
|
||||||
+ if (it == g_managers.end()) {
|
|
||||||
+ // enable_caching=false: prefix caching is a later patch; 0004 exercises
|
|
||||||
+ // only on-demand allocate / free.
|
|
||||||
+ auto mgr = std::make_unique<paged::PagedKVManager>(
|
|
||||||
+ (int32_t) pool_blocks, (int) block_size, /*enable_caching=*/false);
|
|
||||||
+ it = g_managers.emplace(k, std::move(mgr)).first;
|
|
||||||
+ }
|
|
||||||
+ return it->second.get();
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+} // namespace
|
|
||||||
+
|
|
||||||
+bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
|
||||||
+ uint32_t block_size, uint32_t pool_blocks,
|
|
||||||
+ std::vector<uint32_t> & out) {
|
|
||||||
+ if (n_tokens == 0) {
|
|
||||||
+ return true;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ paged::PagedKVManager * mgr = get_mgr(cache, stream, pool_blocks, block_size);
|
|
||||||
+
|
|
||||||
+ const size_t before = mgr->block_table(0).size();
|
|
||||||
+
|
|
||||||
+ // Grow the request to cover the highest logical position. The manager pops
|
|
||||||
+ // free blocks only for the boundaries actually crossed - that is the on-
|
|
||||||
+ // demand behavior; an already-covered range adds nothing.
|
|
||||||
+ if (!mgr->allocate(0, (size_t) base + n_tokens)) {
|
|
||||||
+ return false; // pool exhausted -> caller falls back to the stock path
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ out.reserve(out.size() + n_tokens);
|
|
||||||
+ for (uint32_t i = 0; i < n_tokens; ++i) {
|
|
||||||
+ const int64_t s = mgr->slot(0, (int) (base + i));
|
|
||||||
+ out.push_back((uint32_t) s);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (debug()) {
|
|
||||||
+ const size_t after = mgr->block_table(0).size();
|
|
||||||
+ if (after != before) {
|
|
||||||
+ fprintf(stderr,
|
|
||||||
+ "[paged-alloc] cache=%p stream=%d grew %zu->%zu blocks "
|
|
||||||
+ "(budget=%u; base=%u +%u tok)\n",
|
|
||||||
+ cache, stream, before, after, pool_blocks, base, n_tokens);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return true;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void release(const void * cache, int stream) {
|
|
||||||
+ auto it = g_managers.find({cache, stream});
|
|
||||||
+ if (it == g_managers.end()) {
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+ it->second->free(0);
|
|
||||||
+ g_managers.erase(it);
|
|
||||||
+ if (debug()) {
|
|
||||||
+ fprintf(stderr, "[paged-alloc] released cache=%p stream=%d\n", cache, stream);
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void release_all(const void * cache) {
|
|
||||||
+ for (auto it = g_managers.begin(); it != g_managers.end(); ) {
|
|
||||||
+ if (it->first.first == cache) {
|
|
||||||
+ it = g_managers.erase(it);
|
|
||||||
+ } else {
|
|
||||||
+ ++it;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+} // namespace paged_alloc
|
|
||||||
diff --git a/src/paged-alloc.h b/src/paged-alloc.h
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..bf66665
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/src/paged-alloc.h
|
|
||||||
@@ -0,0 +1,39 @@
|
|
||||||
+#pragma once
|
|
||||||
+// On-demand paged KV block allocation (patch 0004, experimental).
|
|
||||||
+//
|
|
||||||
+// Backs the paged placement in llama_kv_cache::find_slot (patch 0002) with the
|
|
||||||
+// vendored host-side PagedKVManager (patch 0001). Instead of mapping a
|
|
||||||
+// sequence's logical positions onto a fixed full-pool permutation, blocks are
|
|
||||||
+// popped from a free pool ON DEMAND as the sequence crosses block boundaries,
|
|
||||||
+// and returned to the pool on sequence end. This is where the paged memory-
|
|
||||||
+// capacity benefit begins: a short sequence holds only a few blocks, not the
|
|
||||||
+// whole reserved window.
|
|
||||||
+//
|
|
||||||
+// Gated behind env LLAMA_KV_PAGED; a no-op when unset. All state lives in this
|
|
||||||
+// unit (a static registry keyed by kv-cache + stream), so the core kv-cache
|
|
||||||
+// struct stays untouched - find_slot only gains a gated call.
|
|
||||||
+
|
|
||||||
+#include <cstdint>
|
|
||||||
+#include <vector>
|
|
||||||
+
|
|
||||||
+namespace paged_alloc {
|
|
||||||
+
|
|
||||||
+// true iff env LLAMA_KV_PAGED is set (evaluated once).
|
|
||||||
+bool active();
|
|
||||||
+
|
|
||||||
+// Place n_tokens logical positions [base, base+n_tokens) of one stream on
|
|
||||||
+// demand, appending their physical cell indices to `out`. pool_blocks =
|
|
||||||
+// cells.size()/block_size is this stream's block budget. Returns false (leaving
|
|
||||||
+// `out` unchanged) on pool exhaustion, so the caller falls back to the stock
|
|
||||||
+// allocator. The caller still validates each returned cell is empty.
|
|
||||||
+bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
|
||||||
+ uint32_t block_size, uint32_t pool_blocks,
|
|
||||||
+ std::vector<uint32_t> & out);
|
|
||||||
+
|
|
||||||
+// Return a stream's blocks to the pool (sequence end).
|
|
||||||
+void release(const void * cache, int stream);
|
|
||||||
+
|
|
||||||
+// Return every stream's blocks for a kv-cache (clear() / teardown).
|
|
||||||
+void release_all(const void * cache);
|
|
||||||
+
|
|
||||||
+} // namespace paged_alloc
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,143 +0,0 @@
|
|||||||
From 141029beec609e87f24f6f6bba3ec842d7037862 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Mon, 22 Jun 2026 12:13:44 +0200
|
|
||||||
Subject: [PATCH] paged cross-request prefix caching (env LLAMA_KV_PAGED) -
|
|
||||||
patch 0006
|
|
||||||
|
|
||||||
Add host-side cross-request prefix sharing to the vendored PagedKVManager
|
|
||||||
(patches 0001-0004): on placement, hash a new sequence's prefix blocks, reuse the
|
|
||||||
matching cached physical blocks (ref_cnt++) for the shared prefix and allocate
|
|
||||||
fresh blocks only for the divergent suffix. A shared block is freed only at
|
|
||||||
ref 0; copy-on-write privatises a still-shared (ref>1) block before a divergent
|
|
||||||
write so co-owners stay byte-correct. All logic lives in the vendored
|
|
||||||
src/paged-kv-manager unit (place_with_prefix / cow_block / ref-counting); the
|
|
||||||
core kv-cache files are untouched. Default off; gated behind LLAMA_KV_PAGED.
|
|
||||||
|
|
||||||
Wiring the physical-cell reuse into find_slot so the engine itself skips
|
|
||||||
recompute needs core seq-membership changes and is left to a later patch.
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
---
|
|
||||||
src/paged-kv-manager.cpp | 65 ++++++++++++++++++++++++++++++++++++++++
|
|
||||||
src/paged-kv-manager.h | 23 ++++++++++++++
|
|
||||||
2 files changed, 88 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/src/paged-kv-manager.cpp b/src/paged-kv-manager.cpp
|
|
||||||
index ca0dcd8..4c6ee4c 100644
|
|
||||||
--- a/src/paged-kv-manager.cpp
|
|
||||||
+++ b/src/paged-kv-manager.cpp
|
|
||||||
@@ -293,4 +293,69 @@ void PagedKVManager::cache_blocks(int seq_id, const std::vector<uint64_t>& block
|
|
||||||
pool_.cache_full_blocks(req, /*num_cached=*/0, n_full, block_hashes);
|
|
||||||
}
|
|
||||||
|
|
||||||
+// ---------------------------------------------------------------------------
|
|
||||||
+// Cross-request prefix caching + copy-on-write (patch 0006)
|
|
||||||
+// ---------------------------------------------------------------------------
|
|
||||||
+
|
|
||||||
+size_t PagedKVManager::place_with_prefix(int seq_id, const std::vector<int>& token_ids) {
|
|
||||||
+ auto& req = req_to_blocks_[seq_id];
|
|
||||||
+
|
|
||||||
+ // Longest cached prefix: hash the full blocks and stop at the first miss.
|
|
||||||
+ // A block hash transitively encodes its whole prefix (FNV chaining), so the
|
|
||||||
+ // first miss bounds the reusable prefix (vLLM find_longest_cache_hit).
|
|
||||||
+ const std::vector<uint64_t> hashes = compute_block_hashes(token_ids);
|
|
||||||
+ std::vector<KVCacheBlock*> hits;
|
|
||||||
+ for (uint64_t bh : hashes) {
|
|
||||||
+ KVCacheBlock* cb = pool_.get_cached_block(bh);
|
|
||||||
+ if (!cb) break;
|
|
||||||
+ hits.push_back(cb);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // Reuse: ++ref_cnt (pulling warm blocks back out of the free list) then
|
|
||||||
+ // splice the shared physical blocks into this sequence's block table.
|
|
||||||
+ pool_.touch(hits);
|
|
||||||
+ req.insert(req.end(), hits.begin(), hits.end());
|
|
||||||
+
|
|
||||||
+ // Allocate fresh blocks only for the divergent suffix.
|
|
||||||
+ const size_t need = cdiv(token_ids.size(), block_size_);
|
|
||||||
+ if (need > req.size()) {
|
|
||||||
+ const size_t add = need - req.size();
|
|
||||||
+ if (add > pool_.get_num_free_blocks()) {
|
|
||||||
+ // OOM: roll the sequence back (un-touch the shared prefix so no ref
|
|
||||||
+ // leaks) and report no placement; the caller falls back to stock.
|
|
||||||
+ std::vector<KVCacheBlock*> ordered(req.rbegin(), req.rend());
|
|
||||||
+ pool_.free_blocks(ordered);
|
|
||||||
+ req.clear();
|
|
||||||
+ return 0;
|
|
||||||
+ }
|
|
||||||
+ auto nb = pool_.get_new_blocks(add);
|
|
||||||
+ req.insert(req.end(), nb.begin(), nb.end());
|
|
||||||
+ }
|
|
||||||
+ return hits.size();
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+std::pair<int32_t, int32_t> PagedKVManager::cow_block(int seq_id, size_t bi) {
|
|
||||||
+ auto& req = req_to_blocks_.at(seq_id);
|
|
||||||
+ KVCacheBlock* old = req.at(bi);
|
|
||||||
+ if (old->ref_cnt <= 1) {
|
|
||||||
+ return { old->block_id, old->block_id }; // already private - no copy
|
|
||||||
+ }
|
|
||||||
+ // Private copy for this sequence. get_new_blocks sets the fresh block's
|
|
||||||
+ // ref_cnt to 1; free_blocks decrements the shared block, which stays >0 so
|
|
||||||
+ // it is NOT returned to the pool and the other owners are left untouched.
|
|
||||||
+ KVCacheBlock* fresh = pool_.get_new_blocks(1).front();
|
|
||||||
+ pool_.free_blocks({ old });
|
|
||||||
+ req[bi] = fresh;
|
|
||||||
+ return { old->block_id, fresh->block_id };
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+int PagedKVManager::block_ref_cnt_at(int seq_id, size_t bi) const {
|
|
||||||
+ return req_to_blocks_.at(seq_id).at(bi)->ref_cnt;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+size_t PagedKVManager::num_blocks(int seq_id) const {
|
|
||||||
+ auto it = req_to_blocks_.find(seq_id);
|
|
||||||
+ return it == req_to_blocks_.end() ? 0 : it->second.size();
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
} // namespace paged
|
|
||||||
diff --git a/src/paged-kv-manager.h b/src/paged-kv-manager.h
|
|
||||||
index 740280a..34decbc 100644
|
|
||||||
--- a/src/paged-kv-manager.h
|
|
||||||
+++ b/src/paged-kv-manager.h
|
|
||||||
@@ -14,6 +14,7 @@
|
|
||||||
#include <vector>
|
|
||||||
#include <unordered_map>
|
|
||||||
#include <map>
|
|
||||||
+#include <utility>
|
|
||||||
|
|
||||||
namespace paged {
|
|
||||||
|
|
||||||
@@ -99,6 +100,28 @@ public:
|
|
||||||
size_t get_computed_blocks(const std::vector<uint64_t>& block_hashes); // returns num cached tokens
|
|
||||||
void cache_blocks(int seq_id, const std::vector<uint64_t>& block_hashes, size_t num_tokens);
|
|
||||||
|
|
||||||
+ // Cross-request prefix caching + copy-on-write (patch 0006).
|
|
||||||
+ //
|
|
||||||
+ // Splice the longest cached prefix of token_ids into seq_id (reuse the
|
|
||||||
+ // shared physical blocks, ref_cnt++ so a block frees only at ref 0) and
|
|
||||||
+ // allocate fresh blocks only for the divergent suffix. Returns the number of
|
|
||||||
+ // shared (reused) blocks; the caller skips recomputing those tokens. On pool
|
|
||||||
+ // exhaustion the sequence is rolled back (no ref leak) and 0 is returned.
|
|
||||||
+ size_t place_with_prefix(int seq_id, const std::vector<int>& token_ids);
|
|
||||||
+
|
|
||||||
+ // Copy-on-write the block at logical index bi of seq_id. If that block is
|
|
||||||
+ // shared (ref_cnt>1), allocate a fresh private block, drop this seq's ref on
|
|
||||||
+ // the shared one (other owners keep it, content untouched) and install the
|
|
||||||
+ // fresh block at bi. Returns {old_block_id, new_block_id}; new==old when the
|
|
||||||
+ // block was already private (ref_cnt<=1) and no copy is needed. The caller
|
|
||||||
+ // copies the physical cell contents old_block_id -> new_block_id.
|
|
||||||
+ std::pair<int32_t, int32_t> cow_block(int seq_id, size_t bi);
|
|
||||||
+
|
|
||||||
+ // Introspection for the prefix-share gate (debug/tests).
|
|
||||||
+ int block_ref_cnt_at(int seq_id, size_t bi) const;
|
|
||||||
+ size_t num_blocks(int seq_id) const;
|
|
||||||
+ size_t num_free_blocks() const { return pool_.get_num_free_blocks(); }
|
|
||||||
+
|
|
||||||
protected:
|
|
||||||
int block_size_;
|
|
||||||
BlockPool pool_;
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,531 +0,0 @@
|
|||||||
From da20c1c0571e84bc76202d915d4bb82892a3392b Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Mon, 22 Jun 2026 12:46:28 +0200
|
|
||||||
Subject: [PATCH] paged engine prefix recompute-skip (env LLAMA_KV_PAGED) -
|
|
||||||
patch 0007
|
|
||||||
|
|
||||||
Wire the host-side cross-request prefix cache (patch 0006) into the engine so a
|
|
||||||
new sequence physically SHARES the cached prefix blocks and skips recomputing the
|
|
||||||
shared prefix - the actual compute win that 0006 (which only proved the host-side
|
|
||||||
machinery + realised reuse via the stock seq_cp) did not yet deliver from the
|
|
||||||
paged path itself.
|
|
||||||
|
|
||||||
Mechanism (all gated behind LLAMA_KV_PAGED; default off, stock byte-identical):
|
|
||||||
|
|
||||||
* paged-alloc reworked from a per-stream, request-0, destroyed-on-free manager
|
|
||||||
into ONE persistent caching PagedKVManager per (kv-cache, stream) whose
|
|
||||||
requests are keyed by the real llama_seq_id. free(seq) now releases exactly
|
|
||||||
one sequence, so ref-counted shared blocks survive while another sharer holds
|
|
||||||
them. New seams: share_prefix (place_with_prefix -> shared prefix tokens),
|
|
||||||
slot, commit (publish a sequence into the content cache), ref-counted release,
|
|
||||||
plus ref/num-free introspection.
|
|
||||||
|
|
||||||
* Two gated llama_kv_cache methods (the core seq-membership handling 0007 needs):
|
|
||||||
paged_prefix_share() reuses the longest cached content prefix for a sequence
|
|
||||||
and marks the shared physical cells as belonging to it (cells.seq_add) so the
|
|
||||||
engine's attention mask includes the already-computed prefix KV; the caller
|
|
||||||
then decodes ONLY the divergent suffix. paged_prefix_commit() publishes a
|
|
||||||
sequence's full blocks for later reuse.
|
|
||||||
|
|
||||||
* find_slot's paged branch anchors placement on each sequence's own logical base
|
|
||||||
(ubatch.pos) and keys the manager request by seq_id, so an independently-freed
|
|
||||||
sequence and a shared prefix coexist in one unified pool. seq_rm/clear free
|
|
||||||
per-sequence (ref-counted) instead of nuking the whole stream.
|
|
||||||
|
|
||||||
* paged-prefix-api: a thin gated shim so a caller holding only the public
|
|
||||||
llama.h can reach the seam and the introspection without the internal headers.
|
|
||||||
|
|
||||||
Core existing-file touch: src/llama-kv-cache.{cpp,h}, +71 -3. Everything else is
|
|
||||||
additive vendored units. Verified on Qwen3-0.6B-Q8_0 (CPU, unified cache): a
|
|
||||||
sequence B sharing A's prefix decodes greedy tokens byte-identical to B from
|
|
||||||
scratch with the prefill computing ONLY the suffix (32 prefix tokens skipped) at
|
|
||||||
a block boundary AND mid-block; the shared block carries ref_cnt 2 while both
|
|
||||||
hold it, drops to 1 when one sharer is removed (survivor intact, re-shareable, no
|
|
||||||
use-after-free) and returns to the pool only when all sharers are freed. The
|
|
||||||
0004 serving gate (unified and non-unified) stays byte-identical stock vs paged.
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
---
|
|
||||||
src/CMakeLists.txt | 1 +
|
|
||||||
src/llama-kv-cache.cpp | 66 +++++++++++++++++++++++--
|
|
||||||
src/llama-kv-cache.h | 8 +++
|
|
||||||
src/paged-alloc.cpp | 104 ++++++++++++++++++++++++++++++---------
|
|
||||||
src/paged-alloc.h | 69 +++++++++++++++++++-------
|
|
||||||
src/paged-prefix-api.cpp | 48 ++++++++++++++++++
|
|
||||||
src/paged-prefix-api.h | 27 ++++++++++
|
|
||||||
7 files changed, 280 insertions(+), 43 deletions(-)
|
|
||||||
create mode 100644 src/paged-prefix-api.cpp
|
|
||||||
create mode 100644 src/paged-prefix-api.h
|
|
||||||
|
|
||||||
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
|
|
||||||
index 4d9d7d1..432f42d 100644
|
|
||||||
--- a/src/CMakeLists.txt
|
|
||||||
+++ b/src/CMakeLists.txt
|
|
||||||
@@ -27,6 +27,7 @@ add_library(llama
|
|
||||||
paged-kv-manager.cpp
|
|
||||||
paged-attn.cpp
|
|
||||||
paged-alloc.cpp
|
|
||||||
+ paged-prefix-api.cpp
|
|
||||||
llama-kv-cache-dsa.cpp
|
|
||||||
llama-memory.cpp
|
|
||||||
llama-memory-hybrid.cpp
|
|
||||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
|
||||||
index 1125d9a..7510ff9 100644
|
|
||||||
--- a/src/llama-kv-cache.cpp
|
|
||||||
+++ b/src/llama-kv-cache.cpp
|
|
||||||
@@ -419,7 +419,7 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
|
|
||||||
// removed (sequence end), so they return to the pool for reuse.
|
|
||||||
if (paged_alloc::active() && p0 == 0 && p1 == std::numeric_limits<llama_pos>::max()) {
|
|
||||||
if (seq_id >= 0) {
|
|
||||||
- paged_alloc::release(this, (int) seq_to_stream[seq_id]);
|
|
||||||
+ paged_alloc::release(this, (int) seq_to_stream[seq_id], (int) seq_id);
|
|
||||||
} else {
|
|
||||||
paged_alloc::release_all(this);
|
|
||||||
}
|
|
||||||
@@ -1056,10 +1056,15 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
|
||||||
const uint32_t bs = 16; // block size (tokens/block)
|
|
||||||
const uint32_t nblk = cells.size() / bs; // this stream's block budget
|
|
||||||
if (nblk >= 2) {
|
|
||||||
- const uint32_t base = cells.get_used();
|
|
||||||
+ // [paged 0007] Anchor placement on this sequence's own logical
|
|
||||||
+ // base position (ubatch.pos), not the shared used-count, and key
|
|
||||||
+ // the manager request by the real seq_id. slot(seq,pos) is then
|
|
||||||
+ // stable per sequence, so an independently-freed (ref-counted)
|
|
||||||
+ // sequence and a shared prefix can coexist in one unified pool.
|
|
||||||
+ const uint32_t base = (uint32_t) ubatch.pos[s*n_tokens];
|
|
||||||
const int strm = (int) seq_to_stream[seq_id];
|
|
||||||
std::vector<uint32_t> placed;
|
|
||||||
- if (paged_alloc::place(this, strm, base, n_tokens, bs, nblk, placed)) {
|
|
||||||
+ if (paged_alloc::place(this, strm, (int) seq_id, base, n_tokens, bs, nblk, placed)) {
|
|
||||||
bool ok = (placed.size() == n_tokens);
|
|
||||||
for (uint32_t i = 0; ok && i < n_tokens; ++i) {
|
|
||||||
if (placed[i] >= cells.size() || !cells.is_empty(placed[i])) {
|
|
||||||
@@ -1165,6 +1170,61 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch,
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
+// [paged 0007] Cross-request prefix recompute-skip.
|
|
||||||
+//
|
|
||||||
+// Reuse a cached content prefix for seq_id: share_prefix() splices the longest
|
|
||||||
+// matching cached physical blocks into seq_id (ref_cnt++) and reserves fresh
|
|
||||||
+// blocks for the divergent suffix. We then mark the shared physical cells as
|
|
||||||
+// belonging to seq_id - those cells already hold the owner's computed KV at the
|
|
||||||
+// matching logical positions, so the caller decodes ONLY the suffix and the
|
|
||||||
+// prefix is never recomputed. Returns the number of shared prefix tokens.
|
|
||||||
+// Gated behind LLAMA_KV_PAGED; a no-op (returns 0) otherwise.
|
|
||||||
+int32_t llama_kv_cache::paged_prefix_share(llama_seq_id seq_id, const std::vector<llama_token> & tokens) {
|
|
||||||
+ if (!paged_alloc::active() || tokens.empty()) {
|
|
||||||
+ return 0;
|
|
||||||
+ }
|
|
||||||
+ const uint32_t bs = 16;
|
|
||||||
+ const uint32_t strm = (uint32_t) seq_to_stream[seq_id];
|
|
||||||
+ auto & cells = v_cells[strm];
|
|
||||||
+ const uint32_t nblk = cells.size() / bs;
|
|
||||||
+ if (nblk < 2) {
|
|
||||||
+ return 0;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ std::vector<int> toks(tokens.begin(), tokens.end());
|
|
||||||
+ const size_t kshare = paged_alloc::share_prefix(this, (int) strm, (int) seq_id, toks, bs, nblk);
|
|
||||||
+
|
|
||||||
+ for (size_t p = 0; p < kshare; ++p) {
|
|
||||||
+ const int64_t cell = paged_alloc::slot(this, (int) strm, (int) seq_id, (int) p);
|
|
||||||
+ if (cell < 0 || (uint32_t) cell >= cells.size() ||
|
|
||||||
+ cells.is_empty((uint32_t) cell) ||
|
|
||||||
+ cells.pos_get((uint32_t) cell) != (llama_pos) p) {
|
|
||||||
+ // Owner cell missing / repurposed: cannot safely share. Roll the
|
|
||||||
+ // sequence back so the caller recomputes the whole prompt.
|
|
||||||
+ paged_alloc::release(this, (int) strm, (int) seq_id);
|
|
||||||
+ return 0;
|
|
||||||
+ }
|
|
||||||
+ if (!cells.seq_has((uint32_t) cell, seq_id)) {
|
|
||||||
+ cells.seq_add((uint32_t) cell, seq_id);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ return (int32_t) kshare;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+// [paged 0007] Publish a sequence's full blocks into the content cache so a
|
|
||||||
+// later paged_prefix_share() can reuse them. Call after the sequence KV is
|
|
||||||
+// computed (its prefill decode has run).
|
|
||||||
+void llama_kv_cache::paged_prefix_commit(llama_seq_id seq_id, const std::vector<llama_token> & tokens) {
|
|
||||||
+ if (!paged_alloc::active() || tokens.empty()) {
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+ const uint32_t bs = 16;
|
|
||||||
+ const uint32_t strm = (uint32_t) seq_to_stream[seq_id];
|
|
||||||
+ const uint32_t nblk = v_cells[strm].size() / bs;
|
|
||||||
+ std::vector<int> toks(tokens.begin(), tokens.end());
|
|
||||||
+ paged_alloc::commit(this, (int) strm, (int) seq_id, toks, bs, nblk);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) {
|
|
||||||
// TODO: refactor [TAG_KV_CACHE_SHARE_CELLS]
|
|
||||||
if (other) {
|
|
||||||
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
|
|
||||||
index 494c0fb..f374ac6 100644
|
|
||||||
--- a/src/llama-kv-cache.h
|
|
||||||
+++ b/src/llama-kv-cache.h
|
|
||||||
@@ -199,6 +199,14 @@ public:
|
|
||||||
// emplace the ubatch context into slot: [sinfo.idxs[0...ubatch.n_tokens - 1]]
|
|
||||||
void apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch);
|
|
||||||
|
|
||||||
+ // [paged 0007] Cross-request prefix recompute-skip (experimental, gated by
|
|
||||||
+ // env LLAMA_KV_PAGED). paged_prefix_share() reuses a cached content prefix
|
|
||||||
+ // for seq_id and returns the number of shared prefix tokens (the caller
|
|
||||||
+ // decodes only the suffix); paged_prefix_commit() publishes a sequence into
|
|
||||||
+ // the content cache for later reuse. No-ops when LLAMA_KV_PAGED is unset.
|
|
||||||
+ int32_t paged_prefix_share (llama_seq_id seq_id, const std::vector<llama_token> & tokens);
|
|
||||||
+ void paged_prefix_commit(llama_seq_id seq_id, const std::vector<llama_token> & tokens);
|
|
||||||
+
|
|
||||||
//
|
|
||||||
// input API
|
|
||||||
//
|
|
||||||
diff --git a/src/paged-alloc.cpp b/src/paged-alloc.cpp
|
|
||||||
index 1d13f9c..c1027fb 100644
|
|
||||||
--- a/src/paged-alloc.cpp
|
|
||||||
+++ b/src/paged-alloc.cpp
|
|
||||||
@@ -23,9 +23,13 @@ namespace {
|
|
||||||
|
|
||||||
using key_t = std::pair<const void *, int>;
|
|
||||||
|
|
||||||
-// One PagedKVManager per (kv-cache, stream): each stream owns a separate
|
|
||||||
-// physical pool of cells.size() cells, so a manager's block ids map directly to
|
|
||||||
-// cell ranges within that stream's pool. The internal request id is always 0.
|
|
||||||
+// One persistent PagedKVManager per (kv-cache, stream): each stream owns a
|
|
||||||
+// separate physical pool of cells.size() cells, so a manager's block ids map
|
|
||||||
+// directly to cell ranges within that stream's pool. Requests inside a manager
|
|
||||||
+// are keyed by the real llama_seq_id (NOT a fixed 0), so free(seq) releases one
|
|
||||||
+// sequence and shared blocks survive at ref>0 - this is what makes ref-counted
|
|
||||||
+// cross-request prefix sharing (0007) possible. Caching is enabled so commit()
|
|
||||||
+// can publish blocks and share_prefix() can hit them.
|
|
||||||
std::map<key_t, std::unique_ptr<paged::PagedKVManager>> g_managers;
|
|
||||||
|
|
||||||
paged::PagedKVManager * get_mgr(const void * cache, int stream,
|
|
||||||
@@ -33,18 +37,21 @@ paged::PagedKVManager * get_mgr(const void * cache, int stream,
|
|
||||||
const key_t k{cache, stream};
|
|
||||||
auto it = g_managers.find(k);
|
|
||||||
if (it == g_managers.end()) {
|
|
||||||
- // enable_caching=false: prefix caching is a later patch; 0004 exercises
|
|
||||||
- // only on-demand allocate / free.
|
|
||||||
auto mgr = std::make_unique<paged::PagedKVManager>(
|
|
||||||
- (int32_t) pool_blocks, (int) block_size, /*enable_caching=*/false);
|
|
||||||
+ (int32_t) pool_blocks, (int) block_size, /*enable_caching=*/true);
|
|
||||||
it = g_managers.emplace(k, std::move(mgr)).first;
|
|
||||||
}
|
|
||||||
return it->second.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
+paged::PagedKVManager * find_mgr(const void * cache, int stream) {
|
|
||||||
+ auto it = g_managers.find({cache, stream});
|
|
||||||
+ return it == g_managers.end() ? nullptr : it->second.get();
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
-bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
|
||||||
+bool place(const void * cache, int stream, int seq, uint32_t base, uint32_t n_tokens,
|
|
||||||
uint32_t block_size, uint32_t pool_blocks,
|
|
||||||
std::vector<uint32_t> & out) {
|
|
||||||
if (n_tokens == 0) {
|
|
||||||
@@ -53,43 +60,79 @@ bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
|
||||||
|
|
||||||
paged::PagedKVManager * mgr = get_mgr(cache, stream, pool_blocks, block_size);
|
|
||||||
|
|
||||||
- const size_t before = mgr->block_table(0).size();
|
|
||||||
+ const size_t before = mgr->block_table(seq).size();
|
|
||||||
|
|
||||||
- // Grow the request to cover the highest logical position. The manager pops
|
|
||||||
- // free blocks only for the boundaries actually crossed - that is the on-
|
|
||||||
- // demand behavior; an already-covered range adds nothing.
|
|
||||||
- if (!mgr->allocate(0, (size_t) base + n_tokens)) {
|
|
||||||
+ // Grow this sequence's request to cover its highest logical position. The
|
|
||||||
+ // manager pops free blocks only for boundaries actually crossed; if
|
|
||||||
+ // share_prefix() already reserved these blocks, this is a no-op.
|
|
||||||
+ if (!mgr->allocate(seq, (size_t) base + n_tokens)) {
|
|
||||||
return false; // pool exhausted -> caller falls back to the stock path
|
|
||||||
}
|
|
||||||
|
|
||||||
out.reserve(out.size() + n_tokens);
|
|
||||||
for (uint32_t i = 0; i < n_tokens; ++i) {
|
|
||||||
- const int64_t s = mgr->slot(0, (int) (base + i));
|
|
||||||
+ const int64_t s = mgr->slot(seq, (int) (base + i));
|
|
||||||
out.push_back((uint32_t) s);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (debug()) {
|
|
||||||
- const size_t after = mgr->block_table(0).size();
|
|
||||||
+ const size_t after = mgr->block_table(seq).size();
|
|
||||||
if (after != before) {
|
|
||||||
fprintf(stderr,
|
|
||||||
- "[paged-alloc] cache=%p stream=%d grew %zu->%zu blocks "
|
|
||||||
+ "[paged-alloc] cache=%p stream=%d seq=%d grew %zu->%zu blocks "
|
|
||||||
"(budget=%u; base=%u +%u tok)\n",
|
|
||||||
- cache, stream, before, after, pool_blocks, base, n_tokens);
|
|
||||||
+ cache, stream, seq, before, after, pool_blocks, base, n_tokens);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
-void release(const void * cache, int stream) {
|
|
||||||
- auto it = g_managers.find({cache, stream});
|
|
||||||
- if (it == g_managers.end()) {
|
|
||||||
+size_t share_prefix(const void * cache, int stream, int seq,
|
|
||||||
+ const std::vector<int> & tokens,
|
|
||||||
+ uint32_t block_size, uint32_t pool_blocks) {
|
|
||||||
+ paged::PagedKVManager * mgr = get_mgr(cache, stream, pool_blocks, block_size);
|
|
||||||
+ const size_t shared_blocks = mgr->place_with_prefix(seq, tokens);
|
|
||||||
+ const size_t shared_tokens = shared_blocks * (size_t) block_size;
|
|
||||||
+ if (debug() && shared_blocks > 0) {
|
|
||||||
+ fprintf(stderr,
|
|
||||||
+ "[paged-alloc] cache=%p stream=%d seq=%d shares %zu prefix blocks "
|
|
||||||
+ "(%zu tokens) - prefix NOT recomputed\n",
|
|
||||||
+ cache, stream, seq, shared_blocks, shared_tokens);
|
|
||||||
+ }
|
|
||||||
+ return shared_tokens;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+int64_t slot(const void * cache, int stream, int seq, int pos) {
|
|
||||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
|
||||||
+ if (!mgr) {
|
|
||||||
+ return -1;
|
|
||||||
+ }
|
|
||||||
+ if ((size_t) (pos / mgr->block_size()) >= mgr->num_blocks(seq)) {
|
|
||||||
+ return -1;
|
|
||||||
+ }
|
|
||||||
+ return mgr->slot(seq, pos);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void commit(const void * cache, int stream, int seq,
|
|
||||||
+ const std::vector<int> & tokens, uint32_t block_size, uint32_t pool_blocks) {
|
|
||||||
+ paged::PagedKVManager * mgr = get_mgr(cache, stream, pool_blocks, block_size);
|
|
||||||
+ mgr->cache_blocks(seq, mgr->compute_block_hashes(tokens), tokens.size());
|
|
||||||
+ if (debug()) {
|
|
||||||
+ fprintf(stderr, "[paged-alloc] cache=%p stream=%d seq=%d committed %zu tokens\n",
|
|
||||||
+ cache, stream, seq, tokens.size());
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void release(const void * cache, int stream, int seq) {
|
|
||||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
|
||||||
+ if (!mgr) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
- it->second->free(0);
|
|
||||||
- g_managers.erase(it);
|
|
||||||
+ mgr->free(seq); // ref-counted: shared blocks survive while another seq holds them
|
|
||||||
if (debug()) {
|
|
||||||
- fprintf(stderr, "[paged-alloc] released cache=%p stream=%d\n", cache, stream);
|
|
||||||
+ fprintf(stderr, "[paged-alloc] released cache=%p stream=%d seq=%d (free=%zu)\n",
|
|
||||||
+ cache, stream, seq, mgr->num_free_blocks());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -103,4 +146,21 @@ void release_all(const void * cache) {
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
+int ref_cnt_at(const void * cache, int stream, int seq, int pos, uint32_t block_size) {
|
|
||||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
|
||||||
+ if (!mgr) {
|
|
||||||
+ return -1;
|
|
||||||
+ }
|
|
||||||
+ const size_t bi = (size_t) pos / block_size;
|
|
||||||
+ if (bi >= mgr->num_blocks(seq)) {
|
|
||||||
+ return -1;
|
|
||||||
+ }
|
|
||||||
+ return mgr->block_ref_cnt_at(seq, bi);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+size_t num_free(const void * cache, int stream) {
|
|
||||||
+ paged::PagedKVManager * mgr = find_mgr(cache, stream);
|
|
||||||
+ return mgr ? mgr->num_free_blocks() : 0;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
} // namespace paged_alloc
|
|
||||||
diff --git a/src/paged-alloc.h b/src/paged-alloc.h
|
|
||||||
index bf66665..88dedef 100644
|
|
||||||
--- a/src/paged-alloc.h
|
|
||||||
+++ b/src/paged-alloc.h
|
|
||||||
@@ -1,17 +1,27 @@
|
|
||||||
#pragma once
|
|
||||||
-// On-demand paged KV block allocation (patch 0004, experimental).
|
|
||||||
+// On-demand paged KV block allocation + cross-request prefix reuse
|
|
||||||
+// (patches 0004 + 0007, experimental).
|
|
||||||
//
|
|
||||||
-// Backs the paged placement in llama_kv_cache::find_slot (patch 0002) with the
|
|
||||||
-// vendored host-side PagedKVManager (patch 0001). Instead of mapping a
|
|
||||||
-// sequence's logical positions onto a fixed full-pool permutation, blocks are
|
|
||||||
-// popped from a free pool ON DEMAND as the sequence crosses block boundaries,
|
|
||||||
-// and returned to the pool on sequence end. This is where the paged memory-
|
|
||||||
-// capacity benefit begins: a short sequence holds only a few blocks, not the
|
|
||||||
-// whole reserved window.
|
|
||||||
+// Backs the paged placement in llama_kv_cache::find_slot with the vendored
|
|
||||||
+// host-side PagedKVManager (patch 0001). Two responsibilities:
|
|
||||||
//
|
|
||||||
-// Gated behind env LLAMA_KV_PAGED; a no-op when unset. All state lives in this
|
|
||||||
-// unit (a static registry keyed by kv-cache + stream), so the core kv-cache
|
|
||||||
-// struct stays untouched - find_slot only gains a gated call.
|
|
||||||
+// * On-demand allocation (0004): a sequence's logical positions are mapped to
|
|
||||||
+// physical cells block-by-block, popped from a free pool only as the
|
|
||||||
+// sequence grows and returned on sequence end.
|
|
||||||
+//
|
|
||||||
+// * Cross-request prefix reuse (0007): before a new sequence's suffix is
|
|
||||||
+// decoded, share_prefix() reuses the cached physical blocks of a matching
|
|
||||||
+// content prefix (ref_cnt++), so the engine shares the already-computed KV
|
|
||||||
+// cells and the caller decodes ONLY the divergent suffix - the prefix is not
|
|
||||||
+// recomputed. commit() publishes a sequence's full blocks into the content
|
|
||||||
+// cache so later sequences can hit them. Freeing is ref-counted: a shared
|
|
||||||
+// block returns to the pool only when every sharer has been released.
|
|
||||||
+//
|
|
||||||
+// One persistent PagedKVManager per (kv-cache, stream); requests inside it are
|
|
||||||
+// keyed by the real llama_seq_id, so free(seq) releases exactly one sequence and
|
|
||||||
+// shared blocks survive at ref>0. All state lives in this unit (a static
|
|
||||||
+// registry), so the core kv-cache struct stays untouched - find_slot gains only
|
|
||||||
+// gated calls. Gated behind env LLAMA_KV_PAGED; a no-op when unset.
|
|
||||||
|
|
||||||
#include <cstdint>
|
|
||||||
#include <vector>
|
|
||||||
@@ -21,19 +31,42 @@ namespace paged_alloc {
|
|
||||||
// true iff env LLAMA_KV_PAGED is set (evaluated once).
|
|
||||||
bool active();
|
|
||||||
|
|
||||||
-// Place n_tokens logical positions [base, base+n_tokens) of one stream on
|
|
||||||
-// demand, appending their physical cell indices to `out`. pool_blocks =
|
|
||||||
-// cells.size()/block_size is this stream's block budget. Returns false (leaving
|
|
||||||
+// Place n_tokens logical positions [base, base+n_tokens) of (cache,stream,seq)
|
|
||||||
+// on demand, appending their physical cell indices to `out`. pool_blocks =
|
|
||||||
+// cells.size()/block_size is the stream's block budget. Returns false (leaving
|
|
||||||
// `out` unchanged) on pool exhaustion, so the caller falls back to the stock
|
|
||||||
// allocator. The caller still validates each returned cell is empty.
|
|
||||||
-bool place(const void * cache, int stream, uint32_t base, uint32_t n_tokens,
|
|
||||||
+bool place(const void * cache, int stream, int seq, uint32_t base, uint32_t n_tokens,
|
|
||||||
uint32_t block_size, uint32_t pool_blocks,
|
|
||||||
std::vector<uint32_t> & out);
|
|
||||||
|
|
||||||
-// Return a stream's blocks to the pool (sequence end).
|
|
||||||
-void release(const void * cache, int stream);
|
|
||||||
+// [0007] Reuse the longest cached content prefix of `tokens` for (cache,stream,
|
|
||||||
+// seq): splice the shared physical blocks into seq (ref_cnt++) and reserve fresh
|
|
||||||
+// blocks for the divergent suffix. Returns the number of shared PREFIX TOKENS
|
|
||||||
+// (block-aligned); the caller marks those cells for seq and decodes only the
|
|
||||||
+// suffix. 0 if nothing matched or on pool exhaustion (sequence rolled back).
|
|
||||||
+size_t share_prefix(const void * cache, int stream, int seq,
|
|
||||||
+ const std::vector<int> & tokens,
|
|
||||||
+ uint32_t block_size, uint32_t pool_blocks);
|
|
||||||
+
|
|
||||||
+// [0007] Physical cell backing logical position `pos` of (cache,stream,seq), or
|
|
||||||
+// -1 if there is no manager, seq is unknown, or `pos` lies past seq's allocated blocks. Used to map a shared prefix position to its cell.
|
|
||||||
+int64_t slot(const void * cache, int stream, int seq, int pos);
|
|
||||||
|
|
||||||
-// Return every stream's blocks for a kv-cache (clear() / teardown).
|
|
||||||
+// [0007] Publish seq's full (block-aligned) blocks into the content cache so a
|
|
||||||
+// later share_prefix() can reuse them. Call after the sequence's KV is computed.
|
|
||||||
+void commit(const void * cache, int stream, int seq,
|
|
||||||
+ const std::vector<int> & tokens, uint32_t block_size, uint32_t pool_blocks);
|
|
||||||
+
|
|
||||||
+// Return one sequence's blocks to the pool (ref-counted; sequence end).
|
|
||||||
+void release(const void * cache, int stream, int seq);
|
|
||||||
+
|
|
||||||
+// Drop every manager for a kv-cache (clear() / teardown).
|
|
||||||
void release_all(const void * cache);
|
|
||||||
|
|
||||||
+// Introspection for the prefix-share gate (debug/tests). ref_cnt_at returns the
|
|
||||||
+// ref count of the block backing logical position `pos`, or -1 if unknown.
|
|
||||||
+int ref_cnt_at(const void * cache, int stream, int seq, int pos, uint32_t block_size);
|
|
||||||
+size_t num_free(const void * cache, int stream);
|
|
||||||
+
|
|
||||||
} // namespace paged_alloc
|
|
||||||
diff --git a/src/paged-prefix-api.cpp b/src/paged-prefix-api.cpp
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..8573cd2
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/src/paged-prefix-api.cpp
|
|
||||||
@@ -0,0 +1,48 @@
|
|
||||||
+#include "paged-prefix-api.h"
|
|
||||||
+#include "paged-alloc.h"
|
|
||||||
+#include "llama-kv-cache.h"
|
|
||||||
+
|
|
||||||
+#include <vector>
|
|
||||||
+
|
|
||||||
+namespace paged_prefix_api {
|
|
||||||
+
|
|
||||||
+static llama_kv_cache * kv_of(llama_context * ctx) {
|
|
||||||
+ // The driver targets a plain unified KV-cache model; dynamic_cast yields null
|
|
||||||
+ // for wrapped caches (iSWA / hybrid), where cross-request cell sharing does
|
|
||||||
+ // not apply, so the shim degrades to a safe no-op.
|
|
||||||
+ return dynamic_cast<llama_kv_cache *>(llama_get_memory(ctx));
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+int32_t share(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n) {
|
|
||||||
+ llama_kv_cache * kv = kv_of(ctx);
|
|
||||||
+ if (!kv || n <= 0) {
|
|
||||||
+ return 0;
|
|
||||||
+ }
|
|
||||||
+ return kv->paged_prefix_share(seq, std::vector<llama_token>(tokens, tokens + n));
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+void commit(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n) {
|
|
||||||
+ llama_kv_cache * kv = kv_of(ctx);
|
|
||||||
+ if (!kv || n <= 0) {
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+ kv->paged_prefix_commit(seq, std::vector<llama_token>(tokens, tokens + n));
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+int ref_at(llama_context * ctx, llama_seq_id seq, int pos) {
|
|
||||||
+ llama_kv_cache * kv = kv_of(ctx);
|
|
||||||
+ if (!kv) {
|
|
||||||
+ return -1;
|
|
||||||
+ }
|
|
||||||
+ return paged_alloc::ref_cnt_at((const void *) kv, /*stream=*/0, (int) seq, pos, /*block_size=*/16);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+long num_free(llama_context * ctx) {
|
|
||||||
+ llama_kv_cache * kv = kv_of(ctx);
|
|
||||||
+ if (!kv) {
|
|
||||||
+ return 0;
|
|
||||||
+ }
|
|
||||||
+ return (long) paged_alloc::num_free((const void *) kv, /*stream=*/0);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+} // namespace paged_prefix_api
|
|
||||||
diff --git a/src/paged-prefix-api.h b/src/paged-prefix-api.h
|
|
||||||
new file mode 100644
|
|
||||||
index 0000000..78a3864
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/src/paged-prefix-api.h
|
|
||||||
@@ -0,0 +1,27 @@
|
|
||||||
+#pragma once
|
|
||||||
+// Thin test/diagnostic shim over the paged cross-request prefix engine seam
|
|
||||||
+// (patch 0007). Lets a driver that only includes the public llama.h reach the
|
|
||||||
+// gated llama_kv_cache::paged_prefix_* methods and the paged-alloc introspection
|
|
||||||
+// without pulling in the internal kv-cache headers. All entry points are no-ops
|
|
||||||
+// (share/num_free return 0, ref_at returns -1) unless env LLAMA_KV_PAGED is set. Experimental; not a stable API.
|
|
||||||
+
|
|
||||||
+#include "llama.h"
|
|
||||||
+
|
|
||||||
+namespace paged_prefix_api {
|
|
||||||
+
|
|
||||||
+// Reuse the longest cached content prefix of [tokens, tokens+n) for `seq` and
|
|
||||||
+// return the number of shared prefix tokens (the caller decodes only the
|
|
||||||
+// suffix). 0 if nothing was shared.
|
|
||||||
+int32_t share(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n);
|
|
||||||
+
|
|
||||||
+// Publish `seq`'s full blocks into the content cache (call after its KV is computed).
|
|
||||||
+void commit(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n);
|
|
||||||
+
|
|
||||||
+// Ref count of the paged block backing logical position `pos` of `seq` (unified
|
|
||||||
+// stream 0), or -1 if unknown.
|
|
||||||
+int ref_at(llama_context * ctx, llama_seq_id seq, int pos);
|
|
||||||
+
|
|
||||||
+// Number of free blocks in the unified stream-0 pool, or 0 if no manager.
|
|
||||||
+long num_free(llama_context * ctx);
|
|
||||||
+
|
|
||||||
+} // namespace paged_prefix_api
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,130 +0,0 @@
|
|||||||
From 088d58f3a0160cbc706226ac2e77ecfeae4c164a Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Mon, 22 Jun 2026 17:02:22 +0200
|
|
||||||
Subject: [PATCH] paged server cross-request prefix share (env LLAMA_KV_PAGED)
|
|
||||||
- patch 0008
|
|
||||||
|
|
||||||
Wire the paged cross-request prefix recompute-skip (patch 0007's engine seam,
|
|
||||||
paged_prefix_api::share/commit) into the llama-server continuous-batching loop
|
|
||||||
(update_slots) so CONCURRENT requests that share a long prefix physically reuse
|
|
||||||
one committed copy of the prefix blocks and prefill only their divergent suffix.
|
|
||||||
Patch 0007 proved the engine seam correct via a standalone driver, but the server
|
|
||||||
never called it: two concurrent shared-prefix requests each recomputed the full
|
|
||||||
prefix. The server's native prompt cache only reuses a slot's OWN prior prompt
|
|
||||||
(longest-common-prefix vs slot.prompt.tokens) - it does not share across distinct
|
|
||||||
concurrent slots. 0008 adds that cross-slot share.
|
|
||||||
|
|
||||||
Mechanism (all gated behind LLAMA_KV_PAGED; default off, stock byte-identical):
|
|
||||||
|
|
||||||
* In update_slots prompt-processing, after the native n_past is computed and
|
|
||||||
only for a FRESH slot (n_past < one block, i.e. the native cache did not
|
|
||||||
already cover the prefix), call paged_prefix_api::share() to splice the
|
|
||||||
longest committed cross-request prefix into this sequence (ref_cnt++ on the
|
|
||||||
shared physical blocks) and advance n_past past it, so the batch fill computes
|
|
||||||
ONLY the suffix. The slot's own divergent tail cells are removed first so the
|
|
||||||
shared cells own [n_past, kshare) without colliding (the native path removes
|
|
||||||
these later anyway). The n_past < block gate guarantees any block-aligned
|
|
||||||
share the engine returns is strictly larger than n_past and therefore always
|
|
||||||
adopted, so the engine's reservation always matches the suffix-only batch and
|
|
||||||
never leaves stale blocks (which otherwise fragment the paged pool).
|
|
||||||
|
|
||||||
* When a slot finishes prefill (SLOT_STATE_DONE_PROMPT -> GENERATING, the prefix
|
|
||||||
KV just computed), call paged_prefix_api::commit() to publish its prefix so
|
|
||||||
concurrent/later sharers can reuse it.
|
|
||||||
|
|
||||||
The share() / commit() entry points are forward-declared (defined in libllama,
|
|
||||||
src/paged-prefix-api.cpp) to avoid pulling internal kv-cache headers into the
|
|
||||||
server translation unit.
|
|
||||||
|
|
||||||
Verified in the server (32B NVFP4, CUDA, --kv-unified): with a live sequence
|
|
||||||
holding the prefix, K=16/32 concurrent shared-prefix requests prefill only their
|
|
||||||
~27-token suffix instead of the ~1003-token prefix (36x fewer prefill tokens;
|
|
||||||
K=16 23.9s -> 1.5s, K=32 57.9s -> 2.3s), the engine logs "shares ... prefix
|
|
||||||
blocks - NOT recomputed" with ref_cnt>1, and greedy output stays within the
|
|
||||||
documented CUDA batch-shape non-determinism band (stock native prompt-caching
|
|
||||||
shows the same magnitude). Cross-request sharing requires the unified KV cache.
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
---
|
|
||||||
tools/server/server-context.cpp | 50 +++++++++++++++++++++++++++++++++
|
|
||||||
1 file changed, 50 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
|
||||||
index da6a475..04c6361 100644
|
|
||||||
--- a/tools/server/server-context.cpp
|
|
||||||
+++ b/tools/server/server-context.cpp
|
|
||||||
@@ -15,6 +15,16 @@
|
|
||||||
#include "mtmd.h"
|
|
||||||
#include "mtmd-helper.h"
|
|
||||||
|
|
||||||
+// [paged 0008] Cross-request prefix recompute-skip shim. share()/commit() are
|
|
||||||
+// defined in libllama (src/paged-prefix-api.cpp, patch 0007) and are no-ops
|
|
||||||
+// unless env LLAMA_KV_PAGED is set. Declared here so the paged cross-slot prefix
|
|
||||||
+// cache wires into update_slots() without pulling in internal kv-cache headers.
|
|
||||||
+// Fully gated; stock (paged off) is byte-identical.
|
|
||||||
+namespace paged_prefix_api {
|
|
||||||
+ int32_t share (llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n);
|
|
||||||
+ void commit(llama_context * ctx, llama_seq_id seq, const llama_token * tokens, int n);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
#include <algorithm>
|
|
||||||
#include <cstddef>
|
|
||||||
#include <cinttypes>
|
|
||||||
@@ -3007,6 +3017,37 @@ private:
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
+ // [paged 0008] Cross-request prefix recompute-skip. The native prompt cache
|
|
||||||
+ // above only reuses THIS slot's own prior prompt; when the paged KV
|
|
||||||
+ // engine is active, also reuse a committed CROSS-slot prefix so
|
|
||||||
+ // concurrent requests sharing a long prefix skip recompute. Gated on
|
|
||||||
+ // LLAMA_KV_PAGED (paged_kv_share static); stock stays byte-identical.
|
|
||||||
+ static const bool paged_kv_share = getenv("LLAMA_KV_PAGED") != nullptr;
|
|
||||||
+ // Only attempt the cross-request share on a FRESH slot (the native
|
|
||||||
+ // cache above did not already cover the prefix). With n_past < a
|
|
||||||
+ // block, any block-aligned share the engine returns is strictly
|
|
||||||
+ // larger than n_past and is therefore always adopted below - so the
|
|
||||||
+ // engine's full-prompt reservation always matches the suffix-only
|
|
||||||
+ // submission and never leaves stale blocks (which fragmented the
|
|
||||||
+ // paged pool and crashed the server under high fan-out otherwise).
|
|
||||||
+ if (paged_kv_share && n_past < 16 && slot.task->params.cache_prompt && !input_tokens.has_mtmd) {
|
|
||||||
+ const llama_tokens ptoks = input_tokens.get_text_tokens();
|
|
||||||
+ // Drop this slot's own cells beyond the natively-cached prefix before
|
|
||||||
+ // splicing the shared physical prefix in, so the shared cells can own
|
|
||||||
+ // [n_past, kshare) without colliding (the native path removes exactly
|
|
||||||
+ // these later; a no-op for a fresh slot).
|
|
||||||
+ common_context_seq_rm(ctx_tgt, slot.id, n_past, -1);
|
|
||||||
+ const int32_t kshare = paged_prefix_api::share(ctx_tgt, slot.id, ptoks.data(), (int) ptoks.size());
|
|
||||||
+ if (kshare > n_past) {
|
|
||||||
+ slot.prompt.tokens.keep_first(n_past);
|
|
||||||
+ for (int i = n_past; i < kshare; ++i) {
|
|
||||||
+ slot.prompt.tokens.push_back(ptoks[i]);
|
|
||||||
+ }
|
|
||||||
+ n_past = kshare;
|
|
||||||
+ SLT_INF(slot, "paged: reusing %d cross-request shared prefix tokens - not recomputed\n", n_past);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
// [TAG_PROMPT_LOGITS]
|
|
||||||
if (n_past == slot.task->n_tokens() && n_past > 0) {
|
|
||||||
SLT_WRN(slot, "need to evaluate at least 1 token for each active slot (n_past = %d, task.n_tokens() = %d)\n", n_past, slot.task->n_tokens());
|
|
||||||
@@ -3427,6 +3468,15 @@ private:
|
|
||||||
// prompt evaluated for next-token prediction
|
|
||||||
slot.state = SLOT_STATE_GENERATING;
|
|
||||||
|
|
||||||
+ // [paged 0008] Publish this slot's computed prefix so concurrent/later
|
|
||||||
+ // slots can share it (no-op unless LLAMA_KV_PAGED). The prefill decode
|
|
||||||
+ // for [0, n_tokens) has just run, so the prefix KV is computed.
|
|
||||||
+ static const bool paged_kv_commit = getenv("LLAMA_KV_PAGED") != nullptr;
|
|
||||||
+ if (paged_kv_commit && slot.task->params.cache_prompt && !slot.prompt.tokens.has_mtmd) {
|
|
||||||
+ const llama_tokens ctoks = slot.prompt.tokens.get_text_tokens();
|
|
||||||
+ paged_prefix_api::commit(ctx_tgt, slot.id, ctoks.data(), (int) ctoks.size());
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
if (slot.can_speculate()) {
|
|
||||||
common_speculative_begin(spec.get(), slot.id, slot.prompt.tokens.get_text_tokens());
|
|
||||||
}
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,609 +0,0 @@
|
|||||||
From 59490d82e4d0d4ad05ffb5ca3cccc668f4a75281 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Mon, 22 Jun 2026 20:03:17 +0200
|
|
||||||
Subject: [PATCH] paged in-kernel decode read (env LLAMA_KV_PAGED) - patch 0009
|
|
||||||
|
|
||||||
Replace the per-layer per-step gather (patch 0003: ggml_get_rows of K/V into a
|
|
||||||
contiguous buffer) with an in-kernel paged read on the decode step. build_attn
|
|
||||||
passes the UNMODIFIED physical K/V views plus a block table (src[5] of
|
|
||||||
ggml_flash_attn_ext: an I32 [n_view, n_stream] position-ordered physical-cell
|
|
||||||
index, padded to FATTN_KQ_STRIDE). The CUDA fattn vec kernel and the CPU
|
|
||||||
reference map logical KV index j -> physical cell block_table[seq*ne11+j] and
|
|
||||||
read K_base+cell*nb11 / V_base+cell*nb21 in place, so the get_rows of K and V
|
|
||||||
(the bulk of the gather) is gone. The mask stays a small compacted [n_view]
|
|
||||||
causal mask in the same position order; KV_max / parallel_blocks / stream_k
|
|
||||||
split-K are unchanged. The decode shape is forced onto the vec kernel (the only
|
|
||||||
one wired for the block table); a nullptr block table => the stock contiguous
|
|
||||||
read, byte-identical.
|
|
||||||
|
|
||||||
Token-POSITION ordering keeps the flash-attn reduction order identical to stock,
|
|
||||||
so CPU-paged logits == CPU-stock bit-for-bit (verified: 4-stream FA greedy, 64
|
|
||||||
tokens). On GPU paged(vec) == stock(vec) at batch 1; at batch>1 it stays within
|
|
||||||
the documented vec-vs-mma non-determinism band. Decode step at batch 32 / 1024
|
|
||||||
ctx on GB10 (Qwen3-32B NVFP4): paged-gather 1279 ms -> in-kernel 696 ms (-46%),
|
|
||||||
recovering the gather regression to stock parity (647 ms). Gated behind
|
|
||||||
LLAMA_KV_PAGED; no-op (stock byte-identical) when unset.
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
---
|
|
||||||
ggml/include/ggml.h | 6 ++
|
|
||||||
ggml/src/ggml-cpu/ops.cpp | 10 ++-
|
|
||||||
ggml/src/ggml-cuda/fattn-common.cuh | 8 +-
|
|
||||||
ggml/src/ggml-cuda/fattn-mma-f16.cuh | 4 +-
|
|
||||||
ggml/src/ggml-cuda/fattn-tile.cuh | 4 +-
|
|
||||||
ggml/src/ggml-cuda/fattn-vec.cuh | 25 +++++--
|
|
||||||
ggml/src/ggml-cuda/fattn-wmma-f16.cu | 4 +-
|
|
||||||
ggml/src/ggml-cuda/fattn.cu | 9 +++
|
|
||||||
ggml/src/ggml.c | 14 ++++
|
|
||||||
src/llama-graph.cpp | 23 ++++--
|
|
||||||
src/llama-graph.h | 3 +-
|
|
||||||
src/llama-kv-cache.cpp | 31 ++++++++
|
|
||||||
src/llama-kv-cache.h | 4 +
|
|
||||||
src/paged-attn.cpp | 107 +++++++++++++++++++++++++++
|
|
||||||
src/paged-attn.h | 18 +++++
|
|
||||||
15 files changed, 248 insertions(+), 22 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
|
|
||||||
index d6807b6..823f5a9 100644
|
|
||||||
--- a/ggml/include/ggml.h
|
|
||||||
+++ b/ggml/include/ggml.h
|
|
||||||
@@ -2427,6 +2427,12 @@ extern "C" {
|
|
||||||
struct ggml_tensor * a,
|
|
||||||
struct ggml_tensor * sinks);
|
|
||||||
|
|
||||||
+ // [paged] optional block table in src[5]: I32 [n_kv_logical, n_stream]; maps each
|
|
||||||
+ // logical KV index to the physical cell within K/V. nullptr => stock contiguous read.
|
|
||||||
+ GGML_API void ggml_flash_attn_ext_set_block_table(
|
|
||||||
+ struct ggml_tensor * a,
|
|
||||||
+ struct ggml_tensor * block_table);
|
|
||||||
+
|
|
||||||
// TODO: needs to be adapted to ggml_flash_attn_ext
|
|
||||||
GGML_API struct ggml_tensor * ggml_flash_attn_back(
|
|
||||||
struct ggml_context * ctx,
|
|
||||||
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
|
|
||||||
index 74611dc..63c07a2 100644
|
|
||||||
--- a/ggml/src/ggml-cpu/ops.cpp
|
|
||||||
+++ b/ggml/src/ggml-cpu/ops.cpp
|
|
||||||
@@ -8330,6 +8330,8 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|
||||||
const ggml_tensor * v = dst->src[2];
|
|
||||||
const ggml_tensor * mask = dst->src[3];
|
|
||||||
const ggml_tensor * sinks = dst->src[4];
|
|
||||||
+ const ggml_tensor * block_table = dst->src[5]; // [paged] logical->physical cell map (src[5])
|
|
||||||
+ const int32_t * bt = block_table ? (const int32_t *) block_table->data : nullptr;
|
|
||||||
|
|
||||||
GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
|
|
||||||
GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
|
|
||||||
@@ -8449,7 +8451,9 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|
||||||
|
|
||||||
float s; // KQ value
|
|
||||||
|
|
||||||
- const char * k_data = (const char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3);
|
|
||||||
+ // [paged] map the logical KV index ic to its physical cell via the block table.
|
|
||||||
+ const int64_t ic_phys = bt ? (int64_t) bt[ik3*nek1 + ic] : ic;
|
|
||||||
+ const char * k_data = (const char *) k->data + ( ic_phys*nbk1 + ik2*nbk2 + ik3*nbk3);
|
|
||||||
kq_vec_dot(DK, &s, 0, k_data, 0, Q_q, 0, 1);
|
|
||||||
|
|
||||||
s = s*scale; // scale KQ value
|
|
||||||
@@ -8465,7 +8469,7 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
|
|
||||||
float ms = 1.0f; // upon new higher max val, scale VKQ and KQ sum with this value
|
|
||||||
float vs = 1.0f; // post-softmax KQ value, expf(s - M)
|
|
||||||
|
|
||||||
- const char * v_data = ((const char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
|
|
||||||
+ const char * v_data = ((const char *) v->data + (ic_phys*nbv1 + iv2*nbv2 + iv3*nbv3));
|
|
||||||
|
|
||||||
if (v->type == GGML_TYPE_F16) {
|
|
||||||
if (s > M) {
|
|
||||||
@@ -9021,7 +9025,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
|
||||||
const int64_t dr = (nr + nchunk - 1) / nchunk;
|
|
||||||
|
|
||||||
static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q;
|
|
||||||
- bool use_tiled = !use_ref &&
|
|
||||||
+ bool use_tiled = !use_ref && dst->src[5] == nullptr && // [paged] one_chunk honors the block table
|
|
||||||
(q->type == GGML_TYPE_F32 &&
|
|
||||||
kv_is_f32_or_f16 &&
|
|
||||||
k->type == v->type &&
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
|
|
||||||
index 8dfa51a..3c6ddd5 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/fattn-common.cuh
|
|
||||||
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
|
|
||||||
@@ -39,7 +39,8 @@ typedef void (* fattn_kernel_t)(
|
|
||||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
|
||||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
|
||||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
|
||||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33);
|
|
||||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
|
||||||
+ const int * __restrict__ block_table);
|
|
||||||
|
|
||||||
typedef float (*vec_dot_KQ_t)(
|
|
||||||
const char * __restrict__ K_c, const void * __restrict__ Q_v, const int * __restrict__ Q_q8 , const void * __restrict__ Q_ds);
|
|
||||||
@@ -981,6 +982,8 @@ void launch_fattn(
|
|
||||||
|
|
||||||
const ggml_tensor * mask = dst->src[3];
|
|
||||||
const ggml_tensor * sinks = dst->src[4];
|
|
||||||
+ const ggml_tensor * block_table = dst->src[5]; // [paged] optional logical->physical map
|
|
||||||
+ const int * bt_ptr = block_table ? (const int *) block_table->data : nullptr;
|
|
||||||
|
|
||||||
ggml_tensor * KQV = dst;
|
|
||||||
|
|
||||||
@@ -1217,7 +1220,8 @@ void launch_fattn(
|
|
||||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3], nb11, nb12, nb13,
|
|
||||||
nb21, nb22, nb23,
|
|
||||||
mask ? mask->ne[1] : 0, mask ? mask->ne[2] : 0, mask ? mask->ne[3] : 0,
|
|
||||||
- mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0
|
|
||||||
+ mask ? mask->nb[1] : 0, mask ? mask->nb[2] : 0, mask ? mask->nb[3] : 0,
|
|
||||||
+ bt_ptr
|
|
||||||
);
|
|
||||||
CUDA_CHECK(cudaGetLastError());
|
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
|
|
||||||
index 83478a0..0a92cd6 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
|
|
||||||
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
|
|
||||||
@@ -1723,7 +1723,9 @@ static __global__ void flash_attn_ext_f16(
|
|
||||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
|
||||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
|
||||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
|
||||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
|
||||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
|
||||||
+ const int * __restrict__ block_table) {
|
|
||||||
+ GGML_UNUSED(block_table); // [paged] block table is honored only by the vec kernel
|
|
||||||
ggml_cuda_pdl_sync(); // TODO optimize placement
|
|
||||||
#if defined(FLASH_ATTN_AVAILABLE) && (defined(VOLTA_MMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) || defined(AMD_MFMA_AVAILABLE))
|
|
||||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
|
|
||||||
index 0a09981..0ff14e6 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
|
|
||||||
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
|
|
||||||
@@ -808,7 +808,9 @@ static __global__ void flash_attn_tile(
|
|
||||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
|
||||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
|
||||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
|
||||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
|
||||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
|
||||||
+ const int * __restrict__ block_table) {
|
|
||||||
+ GGML_UNUSED(block_table); // [paged] block table is honored only by the vec kernel
|
|
||||||
#ifdef FLASH_ATTN_AVAILABLE
|
|
||||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
|
||||||
const char * GGML_CUDA_RESTRICT K = K_ptr;
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh
|
|
||||||
index 69dd936..a09e2fb 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
|
|
||||||
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
|
|
||||||
@@ -39,7 +39,8 @@ static __global__ void flash_attn_ext_vec(
|
|
||||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
|
||||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
|
||||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
|
||||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
|
||||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
|
||||||
+ const int * __restrict__ block_table) {
|
|
||||||
ggml_cuda_pdl_lc();
|
|
||||||
#ifdef FLASH_ATTN_AVAILABLE
|
|
||||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
|
||||||
@@ -61,7 +62,7 @@ static __global__ void flash_attn_ext_vec(
|
|
||||||
nb11, nb12, nb13,
|
|
||||||
nb21, nb22, nb23,
|
|
||||||
ne31, ne32, ne33,
|
|
||||||
- nb31, nb32, nb33);
|
|
||||||
+ nb31, nb32, nb33, block_table);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
@@ -110,6 +111,14 @@ static __global__ void flash_attn_ext_vec(
|
|
||||||
K += nb13*sequence + nb12*(head / gqa_ratio);
|
|
||||||
V += nb23*sequence + nb22*(head / gqa_ratio);
|
|
||||||
|
|
||||||
+ // [paged] in-kernel block-table read: logical KV index j -> physical cell
|
|
||||||
+ // block_table[sequence*ne11 + j]; read K0 + cell*nb11 / V0 + cell*nb21. The
|
|
||||||
+ // mask/KV_max stay logical (the table is in token-position order). nullptr =>
|
|
||||||
+ // the stock contiguous read below.
|
|
||||||
+ const char * GGML_CUDA_RESTRICT K0 = K;
|
|
||||||
+ const char * GGML_CUDA_RESTRICT V0 = V;
|
|
||||||
+ const int * GGML_CUDA_RESTRICT bt = block_table ? block_table + (size_t) sequence*ne11 : nullptr;
|
|
||||||
+
|
|
||||||
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
|
|
||||||
|
|
||||||
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
|
|
||||||
@@ -267,10 +276,11 @@ static __global__ void flash_attn_ext_vec(
|
|
||||||
#pragma unroll
|
|
||||||
for (int i_KQ_0 = 0; i_KQ_0 < nthreads_KQ; ++i_KQ_0) {
|
|
||||||
const int i_KQ = threadIdx.y*WARP_SIZE + (nthreads_KQ == WARP_SIZE ? 0 : (threadIdx.x & ~(nthreads_KQ-1))) + i_KQ_0;
|
|
||||||
+ const char * GGML_CUDA_RESTRICT K_blk = bt ? (K0 + (int64_t) bt[k_VKQ_0 + i_KQ]*nb11) : (K + i_KQ*nb11);
|
|
||||||
|
|
||||||
#pragma unroll
|
|
||||||
for (int j = 0; j < ncols; ++j) {
|
|
||||||
- float sum = vec_dot_KQ(K + i_KQ*nb11, Q_reg[j], Q_i32[j], Q_ds[j]);
|
|
||||||
+ float sum = vec_dot_KQ(K_blk, Q_reg[j], Q_i32[j], Q_ds[j]);
|
|
||||||
sum = warp_reduce_sum<nthreads_KQ>(sum);
|
|
||||||
|
|
||||||
if (use_logit_softcap) {
|
|
||||||
@@ -324,6 +334,7 @@ static __global__ void flash_attn_ext_vec(
|
|
||||||
#pragma unroll
|
|
||||||
for (int k0 = 0; k0 < WARP_SIZE; k0 += V_cols_per_iter) {
|
|
||||||
const int k = threadIdx.y*WARP_SIZE + k0 + (nthreads_V == WARP_SIZE ? 0 : threadIdx.x / nthreads_V);
|
|
||||||
+ const char * GGML_CUDA_RESTRICT V_blk = bt ? (V0 + (int64_t) bt[k_VKQ_0 + k]*nb21) : (V + k*nb21);
|
|
||||||
|
|
||||||
#ifdef V_DOT2_F32_F16_AVAILABLE
|
|
||||||
half2 KQ_k[ncols];
|
|
||||||
@@ -336,14 +347,14 @@ static __global__ void flash_attn_ext_vec(
|
|
||||||
half2 tmp[V_rows_per_thread/2];
|
|
||||||
if constexpr (type_V == GGML_TYPE_BF16) {
|
|
||||||
float2 tmp_f[V_rows_per_thread/2];
|
|
||||||
- dequantize_V(V + k*nb21, tmp_f,
|
|
||||||
+ dequantize_V(V_blk, tmp_f,
|
|
||||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
|
||||||
tmp[i_VKQ_1] = __float22half2_rn(tmp_f[i_VKQ_1]);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
- dequantize_V(V + k*nb21, tmp,
|
|
||||||
+ dequantize_V(V_blk, tmp,
|
|
||||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
|
||||||
}
|
|
||||||
#pragma unroll
|
|
||||||
@@ -363,7 +374,7 @@ static __global__ void flash_attn_ext_vec(
|
|
||||||
#pragma unroll
|
|
||||||
for (int i_VKQ_0 = 0; i_VKQ_0 < D/2; i_VKQ_0 += nthreads_V*V_rows_per_thread/2) {
|
|
||||||
float2 tmp[V_rows_per_thread/2];
|
|
||||||
- dequantize_V(V + k*nb21, tmp,
|
|
||||||
+ dequantize_V(V_blk, tmp,
|
|
||||||
2*i_VKQ_0 + (nthreads_V == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_V)*V_rows_per_thread);
|
|
||||||
#pragma unroll
|
|
||||||
for (int i_VKQ_1 = 0; i_VKQ_1 < V_rows_per_thread/2; ++i_VKQ_1) {
|
|
||||||
@@ -522,7 +533,7 @@ static __global__ void flash_attn_ext_vec(
|
|
||||||
nb11, nb12, nb13,
|
|
||||||
nb21, nb22, nb23,
|
|
||||||
ne31, ne32, ne33,
|
|
||||||
- nb31, nb32, nb33);
|
|
||||||
+ nb31, nb32, nb33, block_table);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // FLASH_ATTN_AVAILABLE
|
|
||||||
}
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
|
|
||||||
index 6850716..5357849 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
|
|
||||||
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
|
|
||||||
@@ -44,7 +44,9 @@ static __global__ void flash_attn_ext_f16(
|
|
||||||
const int32_t nb11, const int32_t nb12, const int64_t nb13,
|
|
||||||
const int32_t nb21, const int32_t nb22, const int64_t nb23,
|
|
||||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
|
||||||
- const int32_t nb31, const int32_t nb32, const int64_t nb33) {
|
|
||||||
+ const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
|
||||||
+ const int * __restrict__ block_table) {
|
|
||||||
+ GGML_UNUSED(block_table); // [paged] block table is honored only by the vec kernel
|
|
||||||
#if defined(FLASH_ATTN_AVAILABLE) && (defined(GGML_HIP_ROCWMMA_FATTN) && defined(GGML_USE_WMMA_FATTN))
|
|
||||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
|
||||||
const char * GGML_CUDA_RESTRICT K = K_ptr;
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
|
|
||||||
index d6c501b..e3771ee 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/fattn.cu
|
|
||||||
+++ b/ggml/src/ggml-cuda/fattn.cu
|
|
||||||
@@ -574,6 +574,15 @@ size_t ggml_cuda_flash_attn_ext_get_alloc_size(int device, const ggml_tensor * d
|
|
||||||
|
|
||||||
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
||||||
ggml_cuda_set_device(ctx.device);
|
|
||||||
+
|
|
||||||
+ // [paged] the block table (src[5]) is only honored by the vec kernel's
|
|
||||||
+ // in-kernel read; force it. build_attn only sets it for a vec-supported
|
|
||||||
+ // 1-token-per-stream decode shape.
|
|
||||||
+ if (dst->src[5] != nullptr) {
|
|
||||||
+ ggml_cuda_flash_attn_ext_vec(ctx, dst);
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
switch (ggml_cuda_get_best_fattn_kernel(ggml_cuda_get_device(), dst)) {
|
|
||||||
case BEST_FATTN_KERNEL_NONE:
|
|
||||||
GGML_ABORT("fatal error");
|
|
||||||
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
|
|
||||||
index b43016c..adbe52b 100644
|
|
||||||
--- a/ggml/src/ggml.c
|
|
||||||
+++ b/ggml/src/ggml.c
|
|
||||||
@@ -5442,6 +5442,20 @@ void ggml_flash_attn_ext_add_sinks(
|
|
||||||
a->src[4] = sinks;
|
|
||||||
}
|
|
||||||
|
|
||||||
+void ggml_flash_attn_ext_set_block_table(
|
|
||||||
+ struct ggml_tensor * a,
|
|
||||||
+ struct ggml_tensor * block_table) {
|
|
||||||
+ if (!block_table) {
|
|
||||||
+ a->src[5] = NULL;
|
|
||||||
+ return;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT);
|
|
||||||
+ GGML_ASSERT(block_table->type == GGML_TYPE_I32);
|
|
||||||
+
|
|
||||||
+ a->src[5] = block_table;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
// ggml_flash_attn_back
|
|
||||||
|
|
||||||
struct ggml_tensor * ggml_flash_attn_back(
|
|
||||||
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
|
|
||||||
index b59d2a5..abdb48d 100644
|
|
||||||
--- a/src/llama-graph.cpp
|
|
||||||
+++ b/src/llama-graph.cpp
|
|
||||||
@@ -2074,7 +2074,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
|
||||||
ggml_tensor * sinks,
|
|
||||||
ggml_tensor * v_mla,
|
|
||||||
float kq_scale,
|
|
||||||
- int il) const {
|
|
||||||
+ int il,
|
|
||||||
+ ggml_tensor * block_table) const {
|
|
||||||
const bool v_trans = v->nb[1] > v->nb[2];
|
|
||||||
|
|
||||||
// split the batch into streams if needed
|
|
||||||
@@ -2109,6 +2110,9 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
|
||||||
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
|
|
||||||
cb(cur, LLAMA_TENSOR_NAME_FATTN, il);
|
|
||||||
|
|
||||||
+ if (block_table) {
|
|
||||||
+ ggml_flash_attn_ext_set_block_table(cur, block_table);
|
|
||||||
+ }
|
|
||||||
ggml_flash_attn_ext_add_sinks(cur, sinks);
|
|
||||||
ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32);
|
|
||||||
|
|
||||||
@@ -2358,12 +2362,19 @@ ggml_tensor * llm_graph_context::build_attn(
|
|
||||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
|
||||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
|
||||||
|
|
||||||
- // [paged 0003] gather K, V and the mask to the sequence's used cells only
|
|
||||||
- // (no-op unless env LLAMA_KV_PAGED is set).
|
|
||||||
- ggml_tensor * kq_mask_g = kq_mask;
|
|
||||||
- paged_attn::gather(ctx0, res, mctx_cur, &k, &v, &kq_mask_g);
|
|
||||||
+ // [paged] decode read: when paging is active and this is a 1-token-per-stream
|
|
||||||
+ // decode step, present K/V as n_gather views + a block table so the fattn
|
|
||||||
+ // kernel reads the sequence's cells in-kernel (no get_rows of K/V). Else
|
|
||||||
+ // fall back to the gather-read (prefill, transposed V, or env off). All a
|
|
||||||
+ // no-op unless env LLAMA_KV_PAGED is set => stock byte-identical.
|
|
||||||
+ ggml_tensor * kq_mask_g = kq_mask;
|
|
||||||
+ ggml_tensor * block_table = nullptr;
|
|
||||||
+ const bool is_decode = (q_cur->ne[2] == k->ne[3]); // 1 query token per stream
|
|
||||||
+ if (!(is_decode && paged_attn::in_kernel_decode(ctx0, res, mctx_cur, &k, &v, &kq_mask_g, &block_table))) {
|
|
||||||
+ paged_attn::gather(ctx0, res, mctx_cur, &k, &v, &kq_mask_g);
|
|
||||||
+ }
|
|
||||||
|
|
||||||
- ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_g, sinks, v_mla, kq_scale, il);
|
|
||||||
+ ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask_g, sinks, v_mla, kq_scale, il, block_table);
|
|
||||||
cb(cur, "kqv_out", il);
|
|
||||||
|
|
||||||
if (inp->self_v_rot) {
|
|
||||||
diff --git a/src/llama-graph.h b/src/llama-graph.h
|
|
||||||
index 5e8a658..c95ae49 100644
|
|
||||||
--- a/src/llama-graph.h
|
|
||||||
+++ b/src/llama-graph.h
|
|
||||||
@@ -969,7 +969,8 @@ struct llm_graph_context {
|
|
||||||
ggml_tensor * sinks, // [n_head_q]
|
|
||||||
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
|
||||||
float kq_scale,
|
|
||||||
- int il) const;
|
|
||||||
+ int il,
|
|
||||||
+ ggml_tensor * block_table = nullptr) const; // [paged] optional src[5] block table
|
|
||||||
|
|
||||||
llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
|
|
||||||
|
|
||||||
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
|
|
||||||
index 7510ff9..0351f86 100644
|
|
||||||
--- a/src/llama-kv-cache.cpp
|
|
||||||
+++ b/src/llama-kv-cache.cpp
|
|
||||||
@@ -1474,6 +1474,33 @@ void llama_kv_cache::get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_in
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
+void llama_kv_cache::get_block_table(int32_t * dst, uint32_t n_blk, uint32_t n_kv, const slot_info & sinfo) const {
|
|
||||||
+ const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;
|
|
||||||
+ for (uint32_t j = 0; j < ns; ++j) {
|
|
||||||
+ const auto & cells = v_cells[sinfo.s0 + j];
|
|
||||||
+ const uint32_t n = std::min<uint32_t>(n_kv, cells.size());
|
|
||||||
+ std::vector<std::pair<llama_pos, int32_t>> pc;
|
|
||||||
+ pc.reserve(n);
|
|
||||||
+ int32_t pad = -1;
|
|
||||||
+ for (uint32_t i = 0; i < n; ++i) {
|
|
||||||
+ if (!cells.is_empty(i)) {
|
|
||||||
+ pc.emplace_back(cells.pos_get(i), (int32_t) i);
|
|
||||||
+ } else if (pad < 0) {
|
|
||||||
+ pad = (int32_t) i;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ std::sort(pc.begin(), pc.end());
|
|
||||||
+ int32_t * col = dst + (size_t) j * n_blk;
|
|
||||||
+ for (size_t k = 0; k < pc.size(); ++k) {
|
|
||||||
+ col[k] = pc[k].second;
|
|
||||||
+ }
|
|
||||||
+ const int32_t padv = (pad >= 0) ? pad : (pc.empty() ? 0 : pc.back().second);
|
|
||||||
+ for (uint32_t k = (uint32_t) pc.size(); k < n_blk; ++k) {
|
|
||||||
+ col[k] = padv;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
|
|
||||||
GGML_UNUSED(sinfo);
|
|
||||||
|
|
||||||
@@ -2773,6 +2800,10 @@ void llama_kv_cache_context::get_gather_idxs(int32_t * dst) const {
|
|
||||||
kv->get_gather_idxs(dst, n_kv, sinfos[i_cur]);
|
|
||||||
}
|
|
||||||
|
|
||||||
+void llama_kv_cache_context::get_block_table(int32_t * dst, uint32_t n_blk) const {
|
|
||||||
+ kv->get_block_table(dst, n_blk, n_kv, sinfos[i_cur]);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
ggml_tensor * llama_kv_cache_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il) const {
|
|
||||||
return kv->cpy_k(ctx, k_cur, k_idxs, il, sinfos[i_cur]);
|
|
||||||
}
|
|
||||||
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
|
|
||||||
index f374ac6..e9980b6 100644
|
|
||||||
--- a/src/llama-kv-cache.h
|
|
||||||
+++ b/src/llama-kv-cache.h
|
|
||||||
@@ -176,6 +176,9 @@ public:
|
|
||||||
// gather-read. get_n_gather returns the max count across streams.
|
|
||||||
uint32_t get_n_gather(uint32_t n_kv, const slot_info & sinfo) const;
|
|
||||||
void get_gather_idxs(int32_t * dst, uint32_t n_kv, const slot_info & sinfo) const;
|
|
||||||
+ // [paged inc1] block table [n_blk, n_stream] (position order, padded to n_blk
|
|
||||||
+ // per column with a masked empty cell) for the in-kernel paged read.
|
|
||||||
+ void get_block_table(int32_t * dst, uint32_t n_blk, uint32_t n_kv, const slot_info & sinfo) const;
|
|
||||||
|
|
||||||
// store k_cur and v_cur in the cache based on the provided head location
|
|
||||||
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
|
|
||||||
@@ -386,6 +389,7 @@ public:
|
|
||||||
// current ubatch's stream).
|
|
||||||
uint32_t get_n_gather() const;
|
|
||||||
void get_gather_idxs(int32_t * dst) const;
|
|
||||||
+ void get_block_table(int32_t * dst, uint32_t n_blk) const;
|
|
||||||
|
|
||||||
// store k_cur and v_cur in the cache based on the provided head location
|
|
||||||
// note: the heads in k_cur and v_cur should be laid out contiguously in memory
|
|
||||||
diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
|
|
||||||
index ade75e8..8eebeaa 100644
|
|
||||||
--- a/src/paged-attn.cpp
|
|
||||||
+++ b/src/paged-attn.cpp
|
|
||||||
@@ -43,6 +43,25 @@ public:
|
|
||||||
ggml_tensor * idxs;
|
|
||||||
};
|
|
||||||
|
|
||||||
+// Block table filler for the in-kernel paged read: fills an I32 [n_blk, n_stream]
|
|
||||||
+// tensor with each stream's position-ordered cells, padded to n_blk (per column)
|
|
||||||
+// with a masked empty cell, by delegating to the kv-cache context.
|
|
||||||
+class input_block_table : public llm_graph_input_i {
|
|
||||||
+public:
|
|
||||||
+ input_block_table(const llama_kv_cache_context * mctx, ggml_tensor * idxs, uint32_t n_blk)
|
|
||||||
+ : mctx(mctx), idxs(idxs), n_blk(n_blk) {}
|
|
||||||
+
|
|
||||||
+ void set_input(const llama_ubatch * ubatch) override {
|
|
||||||
+ GGML_UNUSED(ubatch);
|
|
||||||
+ GGML_ASSERT(idxs && ggml_backend_buffer_is_host(idxs->buffer));
|
|
||||||
+ mctx->get_block_table((int32_t *) idxs->data, n_blk);
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ const llama_kv_cache_context * mctx;
|
|
||||||
+ ggml_tensor * idxs;
|
|
||||||
+ uint32_t n_blk;
|
|
||||||
+};
|
|
||||||
+
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
void gather(ggml_context * ctx0,
|
|
||||||
@@ -125,4 +144,92 @@ void gather(ggml_context * ctx0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
+bool in_kernel_decode(ggml_context * ctx0,
|
|
||||||
+ llm_graph_result * res,
|
|
||||||
+ const llama_kv_cache_context * mctx,
|
|
||||||
+ ggml_tensor ** k,
|
|
||||||
+ ggml_tensor ** v,
|
|
||||||
+ ggml_tensor ** kq_mask,
|
|
||||||
+ ggml_tensor ** block_table) {
|
|
||||||
+ if (!active()) {
|
|
||||||
+ return false;
|
|
||||||
+ }
|
|
||||||
+ // Bench escape hatch: LLAMA_KV_PAGED_GATHER=1 forces the old gather-read decode
|
|
||||||
+ // path (for a same-build BEFORE/AFTER decode-step comparison). Dev-only.
|
|
||||||
+ static const bool force_gather = (std::getenv("LLAMA_KV_PAGED_GATHER") != nullptr);
|
|
||||||
+ if (force_gather) {
|
|
||||||
+ return false;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ ggml_tensor * K = *k;
|
|
||||||
+ ggml_tensor * V = *v;
|
|
||||||
+ ggml_tensor * M = *kq_mask;
|
|
||||||
+
|
|
||||||
+ const int64_t n_stream = K->ne[3];
|
|
||||||
+ GGML_ASSERT(M->ne[3] == n_stream);
|
|
||||||
+
|
|
||||||
+ const int64_t n_gather = (int64_t) mctx->get_n_gather();
|
|
||||||
+ if (n_gather <= 0) {
|
|
||||||
+ // Worst-case reserve / nothing placed yet: keep the dense [0,n_kv) read.
|
|
||||||
+ return false;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // The in-kernel read addresses V along its d-major (non-transposed) axis. If
|
|
||||||
+ // the cache stores V transposed, fall back to gather() (which normalizes it).
|
|
||||||
+ if (V->nb[1] > V->nb[2]) {
|
|
||||||
+ return false;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ if (debug()) {
|
|
||||||
+ static int64_t once = 0;
|
|
||||||
+ if (once++ < 2) {
|
|
||||||
+ fprintf(stderr, "[paged-attn] in-kernel decode n_stream=%lld n_kv=%lld n_gather=%lld\n",
|
|
||||||
+ (long long) n_stream, (long long) K->ne[2], (long long) n_gather);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // Block table [n_view, n_stream] (n_view = padded n_gather): column s holds stream s's non-empty cells
|
|
||||||
+ // in token-POSITION order (identical to the gather index, so the reduction
|
|
||||||
+ // order matches stock bit-for-bit), padded with a masked empty cell. Filled
|
|
||||||
+ // at set_input from the kv-cache (get_block_table), analogous to the gather.
|
|
||||||
+ // Pad the logical length to FATTN_KQ_STRIDE (256) so the CUDA fattn vec kernel
|
|
||||||
+ // reads fixed 128-wide KV blocks without overrun and the KV_max mask scan
|
|
||||||
+ // engages; padded entries point at a masked empty cell (0 contribution). Stays
|
|
||||||
+ // <= n_kv since n_kv is itself padded to 256 and n_gather <= n_kv.
|
|
||||||
+ int64_t n_view = GGML_PAD(n_gather, 256);
|
|
||||||
+ if (n_view > K->ne[2]) {
|
|
||||||
+ n_view = K->ne[2];
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ ggml_tensor * idx = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_view, n_stream);
|
|
||||||
+ ggml_set_input(idx);
|
|
||||||
+ res->add_input(llm_graph_input_ptr(new input_block_table(mctx, idx, (uint32_t) n_view)));
|
|
||||||
+
|
|
||||||
+ // Present K and V as [d, h, n_view, ns] VIEWS of the full physical window:
|
|
||||||
+ // identical per-cell (nb1,nb2) and per-stream (nb3) strides, only the cell
|
|
||||||
+ // dim shrinks to n_view. NOT materialized - the kernel reads in place.
|
|
||||||
+ *k = ggml_view_4d(ctx0, K, K->ne[0], K->ne[1], n_view, n_stream,
|
|
||||||
+ K->nb[1], K->nb[2], K->nb[3], 0);
|
|
||||||
+ *v = ggml_view_4d(ctx0, V, V->ne[0], V->ne[1], n_view, n_stream,
|
|
||||||
+ V->nb[1], V->nb[2], V->nb[3], 0);
|
|
||||||
+
|
|
||||||
+ // Compact the mask to [n_view, n_tps, 1, ns] in the same position order so
|
|
||||||
+ // the kernel's logical mask index aligns with the block table. Cheap: the
|
|
||||||
+ // mask is ~(d*h) smaller than K/V, which is why only its get_rows remains.
|
|
||||||
+ {
|
|
||||||
+ ggml_tensor * m = ggml_reshape_3d(ctx0, M, M->ne[0], M->ne[1], n_stream);
|
|
||||||
+ m = ggml_cont(ctx0, ggml_transpose(ctx0, m));
|
|
||||||
+ m = ggml_get_rows(ctx0, m, idx);
|
|
||||||
+ m = ggml_cont(ctx0, ggml_transpose(ctx0, m));
|
|
||||||
+ m = ggml_reshape_4d(ctx0, m, n_view, M->ne[1], 1, n_stream);
|
|
||||||
+ if (M->type != m->type) {
|
|
||||||
+ m = ggml_cast(ctx0, m, M->type);
|
|
||||||
+ }
|
|
||||||
+ *kq_mask = m;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ *block_table = idx;
|
|
||||||
+ return true;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
} // namespace paged_attn
|
|
||||||
diff --git a/src/paged-attn.h b/src/paged-attn.h
|
|
||||||
index c5b7bd7..23e2184 100644
|
|
||||||
--- a/src/paged-attn.h
|
|
||||||
+++ b/src/paged-attn.h
|
|
||||||
@@ -37,4 +37,22 @@ void gather(ggml_context * ctx0,
|
|
||||||
ggml_tensor ** v,
|
|
||||||
ggml_tensor ** kq_mask);
|
|
||||||
|
|
||||||
+// [paged inc1] In-kernel paged decode read. Instead of materializing the
|
|
||||||
+// sequence's cells (gather()), present K and V as padded-n_gather-length VIEWS of the
|
|
||||||
+// full physical window and return the position-ordered physical-cell index list
|
|
||||||
+// as a block table (src[5] of ggml_flash_attn_ext). The fattn kernel/op then
|
|
||||||
+// reads K_base + block_table[j]*nb in-kernel, removing the get_rows of K and V
|
|
||||||
+// (the bulk of the gather). On return (true): *k,*v point at the views, *kq_mask
|
|
||||||
+// at the compacted mask, *block_table at the I32 [n_gather padded to 256, n_stream] index.
|
|
||||||
+// Returns false (leaving *k,*v,*kq_mask untouched) when the in-kernel path does
|
|
||||||
+// not apply - env off, nothing placed, or a transposed V cache - so the caller
|
|
||||||
+// keeps the dense gather()/contiguous read.
|
|
||||||
+bool in_kernel_decode(ggml_context * ctx0,
|
|
||||||
+ llm_graph_result * res,
|
|
||||||
+ const llama_kv_cache_context * mctx,
|
|
||||||
+ ggml_tensor ** k,
|
|
||||||
+ ggml_tensor ** v,
|
|
||||||
+ ggml_tensor ** kq_mask,
|
|
||||||
+ ggml_tensor ** block_table);
|
|
||||||
+
|
|
||||||
} // namespace paged_attn
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,269 +0,0 @@
|
|||||||
From 9ac56933abd5de4a1f349c811c2d74aab09f7ab1 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Mon, 22 Jun 2026 22:36:09 +0200
|
|
||||||
Subject: [PATCH] paged tile in-kernel decode read + dispatch guard (env
|
|
||||||
LLAMA_KV_PAGED) - patch 0010
|
|
||||||
|
|
||||||
Increment 2 (robustness, ~0 headline ms): make the paged in-kernel decode read
|
|
||||||
safe against silent mis-routing, and plumb the same read into the tile kernel
|
|
||||||
for the increment-3 GQA head-group work.
|
|
||||||
|
|
||||||
fattn-tile.cuh: graft the patch-0009 phys(j) block-table read (mirror of
|
|
||||||
fattn-vec.cuh). Both flash_attn_tile_load_tile overloads, flash_attn_tile_iter_KQ
|
|
||||||
(K) and flash_attn_tile_iter (V) take an optional per-sequence block table; a row
|
|
||||||
i is read from base + block_table[row_base + i]*stride instead of base + i*stride.
|
|
||||||
The table defaults to nullptr (default args + a null bt_seq when src[5] is unset),
|
|
||||||
so every existing non-paged caller is byte-identical to stock. The mask / KV_max
|
|
||||||
stay logical (token-position order), as in vec.
|
|
||||||
|
|
||||||
fattn.cu: DISPATCH GUARD. When the block table (src[5]) is present, route ONLY to
|
|
||||||
the vec or tile kernel and never fall through to the best-kernel switch. The
|
|
||||||
mma/wmma kernels GGML_UNUSED the table and would silently read the wrong
|
|
||||||
(contiguous physical) cells; the guard makes that unreachable. The vec dispatcher
|
|
||||||
GGML_ABORTs for an unsupported D/type rather than mis-reading. Default route is vec
|
|
||||||
(the inc-1 byte-validated path). LLAMA_KV_PAGED_DISPATCH_LOG=1 prints the routed
|
|
||||||
kernel once.
|
|
||||||
|
|
||||||
Gates: CPU byte-identical paged-on vs off (Qwen3-0.6B, build-cpu) PASS. GPU
|
|
||||||
vec-paged == stock at -s 1 PASS. Dispatch confirmed VEC for the real decode shape:
|
|
||||||
Qwen3-0.6B Q ne=[128,1,16,1] and Qwen3-32B NVFP4 Q ne=[128,1,64,N] both route to
|
|
||||||
vec, matching the nsys profile (flash_attn_ext_vec).
|
|
||||||
|
|
||||||
The tile graft is plumbed for increment-3 GQA head-group reuse but is EXPERIMENTAL
|
|
||||||
and NOT yet byte-validated (LLAMA_KV_PAGED_TILE=1). A tile-vs-tile gate shows
|
|
||||||
tile-paged diverging from tile-stock at the first cross-tile KV depth: the
|
|
||||||
GQA-grouped (ncols2>1) tile path reads a full nbatch_fa-row tile with
|
|
||||||
oob_check=false while the compacted paged mask is not padded to cover the tile, so
|
|
||||||
past-end rows leak. vec bounds its KV walk by KV_max and is unaffected. Bounding
|
|
||||||
the tile path is increment-3 work; the default vec route and all stock paths are
|
|
||||||
untouched.
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
---
|
|
||||||
ggml/src/ggml-cuda/fattn-tile.cuh | 45 ++++++++++++++++++++-----------
|
|
||||||
ggml/src/ggml-cuda/fattn.cu | 38 +++++++++++++++++++++++---
|
|
||||||
2 files changed, 64 insertions(+), 19 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
|
|
||||||
index 0ff14e6..bb84d61 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
|
|
||||||
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
|
|
||||||
@@ -373,7 +373,8 @@ static constexpr __device__ int ggml_cuda_fattn_tile_get_nbatch_K(const int DKQ,
|
|
||||||
// TODO: deduplicate with mma-f16
|
|
||||||
template<int warp_size, int nwarps, int I, int J, int J_padding, bool oob_check>
|
|
||||||
static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
|
||||||
- const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int stride_KV, const int i_sup) {
|
|
||||||
+ const half2 * const __restrict__ KV, half2 * const __restrict__ tile_KV, const int stride_KV, const int i_sup,
|
|
||||||
+ const int * const __restrict__ block_table = nullptr, const int row_base = 0) {
|
|
||||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
|
||||||
constexpr int cpy_ne = cpy_nb / 4;
|
|
||||||
|
|
||||||
@@ -402,9 +403,11 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
|
||||||
const int j = j0*cpy_ne + (stride_j == warp_size ? threadIdx.x : threadIdx.x % stride_j)*cpy_ne;
|
|
||||||
|
|
||||||
const __align__(16) half2 zero[cpy_ne] = {{0.0f, 0.0f}};
|
|
||||||
+ // [paged] remap the row through the block table (nullptr => stock contiguous read).
|
|
||||||
+ const half2 * const KV_row = block_table ? KV + (int64_t) block_table[row_base + i]*stride_KV : KV + i*stride_KV;
|
|
||||||
ggml_cuda_memcpy_1<cpy_nb>(
|
|
||||||
tile_KV + i*(J/2 + J_padding) + j,
|
|
||||||
- !oob_check || i < i_sup ? KV + i*stride_KV + j : zero);
|
|
||||||
+ !oob_check || i < i_sup ? KV_row + j : zero);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -423,7 +426,8 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
|
||||||
|
|
||||||
template<int warp_size, int nwarps, int I, int J, int J_padding, bool oob_check>
|
|
||||||
static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
|
||||||
- const half2 * const __restrict__ KV, float * const __restrict__ tile_KV, const int stride_KV, const int i_sup) {
|
|
||||||
+ const half2 * const __restrict__ KV, float * const __restrict__ tile_KV, const int stride_KV, const int i_sup,
|
|
||||||
+ const int * const __restrict__ block_table = nullptr, const int row_base = 0) {
|
|
||||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
|
||||||
constexpr int cpy_ne = cpy_nb / 4;
|
|
||||||
|
|
||||||
@@ -453,8 +457,10 @@ static __device__ __forceinline__ void flash_attn_tile_load_tile(
|
|
||||||
|
|
||||||
const half2 zero[cpy_ne/2] = {{0.0f, 0.0f}};
|
|
||||||
__align__(16) half2 tmp_h2[cpy_ne/2];
|
|
||||||
+ // [paged] remap the row through the block table (nullptr => stock contiguous read).
|
|
||||||
+ const half2 * const KV_row = block_table ? KV + (int64_t) block_table[row_base + i]*stride_KV : KV + i*stride_KV;
|
|
||||||
ggml_cuda_memcpy_1<sizeof(tmp_h2)>(
|
|
||||||
- tmp_h2, !oob_check || i < i_sup ? KV + i*stride_KV + j : zero);
|
|
||||||
+ tmp_h2, !oob_check || i < i_sup ? KV_row + j : zero);
|
|
||||||
|
|
||||||
__align__(16) float2 tmp_f2[cpy_ne/2];
|
|
||||||
#pragma unroll
|
|
||||||
@@ -487,6 +493,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter_KQ(
|
|
||||||
const int k_VKQ_0,
|
|
||||||
const int k_VKQ_sup,
|
|
||||||
const int k_KQ_0,
|
|
||||||
+ const int * const __restrict__ block_table,
|
|
||||||
float * KQ_acc) {
|
|
||||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
|
||||||
constexpr int cpy_ne = cpy_nb / 4;
|
|
||||||
@@ -495,8 +502,10 @@ static __device__ __forceinline__ void flash_attn_tile_iter_KQ(
|
|
||||||
constexpr int cpw = ncols > nwarps ? ncols/nwarps : 1; // Q columns per warp
|
|
||||||
constexpr int np = nwarps > ncols ? nwarps/ncols : 1; // number of parallel warps per Q column
|
|
||||||
|
|
||||||
+ // [paged] when block_table is set K_h2 is the un-offset base; the table supplies the row.
|
|
||||||
+ const half2 * const K_base = block_table ? (K_h2 + k_KQ_0/2) : (K_h2 + int64_t(k_VKQ_0)*stride_K2 + k_KQ_0/2);
|
|
||||||
flash_attn_tile_load_tile<warp_size, nwarps, nbatch_fa, nbatch_K, cpy_ne, oob_check>
|
|
||||||
- (K_h2 + int64_t(k_VKQ_0)*stride_K2 + k_KQ_0/2, KV_tmp, stride_K2, k_VKQ_sup);
|
|
||||||
+ (K_base, KV_tmp, stride_K2, k_VKQ_sup, block_table, k_VKQ_0);
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
#ifdef FAST_FP16_AVAILABLE
|
|
||||||
@@ -572,7 +581,8 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
|
||||||
T_acc * const VKQ,
|
|
||||||
const int k_VKQ_0,
|
|
||||||
const int k_VKQ_max,
|
|
||||||
- const int col_Q_0) {
|
|
||||||
+ const int col_Q_0,
|
|
||||||
+ const int * const __restrict__ block_table) {
|
|
||||||
constexpr int cpy_nb = ggml_cuda_get_max_cpy_bytes();
|
|
||||||
constexpr int cpy_ne = cpy_nb / 4;
|
|
||||||
|
|
||||||
@@ -605,12 +615,12 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
|
||||||
#pragma unroll
|
|
||||||
for (int k_KQ_0 = 0; k_KQ_0 < DKQ - nbatch_K_last; k_KQ_0 += nbatch_K) {
|
|
||||||
flash_attn_tile_iter_KQ<warp_size, nwarps, ncols1, ncols2, DKQ, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>(
|
|
||||||
- Q_tmp, K_h2, KV_tmp, stride_K2, k_VKQ_0, k_VKQ_sup, k_KQ_0, KQ_acc);
|
|
||||||
+ Q_tmp, K_h2, KV_tmp, stride_K2, k_VKQ_0, k_VKQ_sup, k_KQ_0, block_table, KQ_acc);
|
|
||||||
}
|
|
||||||
if (nbatch_K_last > 0) {
|
|
||||||
constexpr int k_KQ_0 = DKQ - nbatch_K_last;
|
|
||||||
flash_attn_tile_iter_KQ<warp_size, nwarps, ncols1, ncols2, DKQ, nbatch_fa, nbatch_K_last, use_logit_softcap, oob_check>(
|
|
||||||
- Q_tmp, K_h2, KV_tmp, stride_K2, k_VKQ_0, k_VKQ_sup, k_KQ_0, KQ_acc);
|
|
||||||
+ Q_tmp, K_h2, KV_tmp, stride_K2, k_VKQ_0, k_VKQ_sup, k_KQ_0, block_table, KQ_acc);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Apply logit softcap + mask, update KQ_max:
|
|
||||||
@@ -715,8 +725,10 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
|
|
||||||
static_assert(nbatch_V % np == 0, "bad nbatch_V");
|
|
||||||
#pragma unroll
|
|
||||||
for (int k0 = 0; k0 < nbatch_fa; k0 += nbatch_V) {
|
|
||||||
+ // [paged] when block_table is set V_h2 is the un-offset base; the table supplies the row.
|
|
||||||
+ const half2 * const V_base = block_table ? V_h2 : (V_h2 + int64_t(k_VKQ_0 + k0)*stride_V2);
|
|
||||||
flash_attn_tile_load_tile<warp_size, nwarps, nbatch_V, DV, 0, oob_check>
|
|
||||||
- (V_h2 + int64_t(k_VKQ_0 + k0)*stride_V2, KV_tmp, stride_V2, k_VKQ_sup - k0);
|
|
||||||
+ (V_base, KV_tmp, stride_V2, k_VKQ_sup - k0, block_table, k_VKQ_0 + k0);
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
#ifdef FAST_FP16_AVAILABLE
|
|
||||||
@@ -810,7 +822,6 @@ static __global__ void flash_attn_tile(
|
|
||||||
const int32_t ne31, const int32_t ne32, const int32_t ne33,
|
|
||||||
const int32_t nb31, const int32_t nb32, const int64_t nb33,
|
|
||||||
const int * __restrict__ block_table) {
|
|
||||||
- GGML_UNUSED(block_table); // [paged] block table is honored only by the vec kernel
|
|
||||||
#ifdef FLASH_ATTN_AVAILABLE
|
|
||||||
const char * GGML_CUDA_RESTRICT Q = Q_ptr;
|
|
||||||
const char * GGML_CUDA_RESTRICT K = K_ptr;
|
|
||||||
@@ -837,7 +848,7 @@ static __global__ void flash_attn_tile(
|
|
||||||
nb11, nb12, nb13,
|
|
||||||
nb21, nb22, nb23,
|
|
||||||
ne31, ne32, ne33,
|
|
||||||
- nb31, nb32, nb33);
|
|
||||||
+ nb31, nb32, nb33, block_table);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
@@ -861,6 +872,10 @@ static __global__ void flash_attn_tile(
|
|
||||||
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
|
|
||||||
const half2 * V_h2 = (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); // K and V have same shape
|
|
||||||
|
|
||||||
+ // [paged] per-sequence logical->physical block table in token-position order
|
|
||||||
+ // (mask/KV_max stay logical); nullptr => the stock contiguous read.
|
|
||||||
+ const int * const __restrict__ bt_seq = block_table ? block_table + (size_t) sequence*ne11 : nullptr;
|
|
||||||
+
|
|
||||||
const half * maskh = mask ? (const half *) (mask + nb33*(sequence % ne33)) : nullptr;
|
|
||||||
|
|
||||||
const int stride_K2 = nb11 / sizeof(half2);
|
|
||||||
@@ -963,14 +978,14 @@ static __global__ void flash_attn_tile(
|
|
||||||
constexpr bool oob_check = false;
|
|
||||||
flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
|
|
||||||
(Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
|
|
||||||
- stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
|
|
||||||
+ stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0, bt_seq);
|
|
||||||
k_VKQ_0 += gridDim.y*nbatch_fa;
|
|
||||||
}
|
|
||||||
if (k_VKQ_0 < k_VKQ_max) {
|
|
||||||
constexpr bool oob_check = true;
|
|
||||||
flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
|
|
||||||
(Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
|
|
||||||
- stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
|
|
||||||
+ stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0, bt_seq);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Branch without out-of-bounds checks.
|
|
||||||
@@ -978,7 +993,7 @@ static __global__ void flash_attn_tile(
|
|
||||||
constexpr bool oob_check = false;
|
|
||||||
flash_attn_tile_iter<warp_size, nwarps, ncols1, ncols2, DKQ, DV, nbatch_fa, nbatch_K, use_logit_softcap, oob_check>
|
|
||||||
(Q_tmp, K_h2, V_h2, maskh, ne01, logit_softcap, slope, KQ, KV_tmp,
|
|
||||||
- stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0);
|
|
||||||
+ stride_K2, stride_V2, stride_mask, KQ_max, KQ_sum, VKQ, k_VKQ_0, k_VKQ_max, col_Q_0, bt_seq);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -1144,7 +1159,7 @@ static __global__ void flash_attn_tile(
|
|
||||||
nb11, nb12, nb13,
|
|
||||||
nb21, nb22, nb23,
|
|
||||||
ne31, ne32, ne33,
|
|
||||||
- nb31, nb32, nb33);
|
|
||||||
+ nb31, nb32, nb33, block_table);
|
|
||||||
NO_DEVICE_CODE;
|
|
||||||
#endif // FLASH_ATTN_AVAILABLE
|
|
||||||
}
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
|
|
||||||
index e3771ee..afcafa2 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/fattn.cu
|
|
||||||
+++ b/ggml/src/ggml-cuda/fattn.cu
|
|
||||||
@@ -575,11 +575,41 @@ size_t ggml_cuda_flash_attn_ext_get_alloc_size(int device, const ggml_tensor * d
|
|
||||||
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|
||||||
ggml_cuda_set_device(ctx.device);
|
|
||||||
|
|
||||||
- // [paged] the block table (src[5]) is only honored by the vec kernel's
|
|
||||||
- // in-kernel read; force it. build_attn only sets it for a vec-supported
|
|
||||||
- // 1-token-per-stream decode shape.
|
|
||||||
+ // [paged] DISPATCH GUARD. The block table (src[5]) is read in-kernel ONLY by
|
|
||||||
+ // the vec and tile kernels; the mma/wmma kernels GGML_UNUSED it and would
|
|
||||||
+ // silently read the wrong (contiguous physical) cells. So when a block table
|
|
||||||
+ // is present we route here and NEVER fall through to the best-kernel switch
|
|
||||||
+ // below - no decode shape can silently reach an mma/wmma misread. build_attn
|
|
||||||
+ // only sets src[5] for the 1-token-per-stream decode shape; the vec
|
|
||||||
+ // dispatcher GGML_ABORTs for an unsupported D/type rather than mis-reading,
|
|
||||||
+ // and any shape that should not be paged must take the host-side gather path
|
|
||||||
+ // (LLAMA_KV_PAGED_GATHER=1) instead.
|
|
||||||
+ //
|
|
||||||
+ // Default route = vec (inc-1, byte-validated: vec-paged == stock at -s 1 and
|
|
||||||
+ // CPU byte-identical). LLAMA_KV_PAGED_TILE=1 routes the same shape to the
|
|
||||||
+ // tile kernel; the tile in-kernel read is plumbed (fattn-tile.cuh) for the
|
|
||||||
+ // increment-3 GQA head-group reuse, but is EXPERIMENTAL / NOT yet byte-
|
|
||||||
+ // validated: the GQA-grouped (ncols2>1) tile path reads a full nbatch_fa tile
|
|
||||||
+ // with oob_check=false while the compacted paged mask is not padded to cover
|
|
||||||
+ // it, so it diverges from stock. Not for production paged decode until
|
|
||||||
+ // increment-3 bounds that path; the default vec route is unaffected.
|
|
||||||
if (dst->src[5] != nullptr) {
|
|
||||||
- ggml_cuda_flash_attn_ext_vec(ctx, dst);
|
|
||||||
+ static const bool paged_tile = getenv("LLAMA_KV_PAGED_TILE") != nullptr;
|
|
||||||
+ if (getenv("LLAMA_KV_PAGED_DISPATCH_LOG") != nullptr) {
|
|
||||||
+ static bool logged = false;
|
|
||||||
+ if (!logged) {
|
|
||||||
+ logged = true;
|
|
||||||
+ fprintf(stderr, "[paged] decode src[5] set -> routing to %s (Q ne=[%ld,%ld,%ld,%ld])\n",
|
|
||||||
+ paged_tile ? "TILE(experimental)" : "VEC",
|
|
||||||
+ (long) dst->src[0]->ne[0], (long) dst->src[0]->ne[1],
|
|
||||||
+ (long) dst->src[0]->ne[2], (long) dst->src[0]->ne[3]);
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ if (paged_tile) {
|
|
||||||
+ ggml_cuda_flash_attn_ext_tile(ctx, dst);
|
|
||||||
+ } else {
|
|
||||||
+ ggml_cuda_flash_attn_ext_vec(ctx, dst);
|
|
||||||
+ }
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,147 +0,0 @@
|
|||||||
From d5ca5cd756e42214d0003bca815ca91943679b0d Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Tue, 23 Jun 2026 00:18:35 +0200
|
|
||||||
Subject: [PATCH] paged decode: route GQA-grouped tile kernel by default (F16,
|
|
||||||
gqa>=2) - patch 0011
|
|
||||||
|
|
||||||
Increment 3 (the attention lever). In fattn.cu's paged dispatch guard, route the
|
|
||||||
in-kernel decode to the tile kernel for the common grouped-query F16 case, and
|
|
||||||
keep the inc-1 vec kernel for everything else.
|
|
||||||
|
|
||||||
The tile kernel carries native GQA head-group reuse: its ncols2 axis groups the
|
|
||||||
q-heads that share one kv-head, so each K/V row is loaded once for the whole
|
|
||||||
group instead of once per q-head. vec re-streams each kv-head's K/V once per
|
|
||||||
q-head (8x for Qwen3-32B's n_head 64 / n_head_kv 8) and runs at 168 regs ->
|
|
||||||
3 blocks/SM = 25% occupancy on GB10; tile is 108-128 regs with native grouping.
|
|
||||||
The inc-2 phys(j) block-table read was already plumbed into tile (patch 0010);
|
|
||||||
this patch makes it the default for {F16 K and V, gqa_ratio >= 2}.
|
|
||||||
|
|
||||||
Routing guard (why conditional): the tile kernel has no K/V type template - it
|
|
||||||
loads half2 - so a non-F16 cache (BF16 / quantized) would be converted by
|
|
||||||
launch_fattn to a contiguous F16 copy, which breaks the in-kernel block-table
|
|
||||||
read (the table indexes the original paged layout, not the copy). So tile is
|
|
||||||
correct only for an F16 cache; non-F16 caches and the non-grouped gqa==1 shape
|
|
||||||
fall back to the inc-1 vec path, exactly as before this change. The head-group
|
|
||||||
reuse also only helps at gqa_ratio >= 2. LLAMA_KV_PAGED_VEC=1 forces vec for A/B.
|
|
||||||
Note: paged decode is currently exercised with an F16 cache only; quantized +
|
|
||||||
paged is a separate pre-existing limitation, independent of this change
|
|
||||||
(verified: stock + q8_0 cache works, but paged + q8_0 aborts both before and
|
|
||||||
after this patch, since both route the non-F16 cache to vec).
|
|
||||||
|
|
||||||
Measured GB10 (sm_121, 48 SM), Qwen3-32B NVFP4 dense, F16 cache, gqa 8, batch 32,
|
|
||||||
1024 ctx, llama-batched-bench npp=1024 ntg=128 npl=32, GGML_CUDA_DISABLE_GRAPHS=1,
|
|
||||||
same build, env-toggled:
|
|
||||||
STOCK (mma) 174.8 ms/step 183.1 t/s
|
|
||||||
PAGED-VEC (inc-1) 186.3 ms/step 171.8 t/s (+6.6% vs stock)
|
|
||||||
PAGED-TILE (inc-3) 177.9 ms/step 179.8 t/s (+1.8% vs stock)
|
|
||||||
GQA grouping recovers 8.4 ms/step (-4.5%) over the inc-1 vec default and brings
|
|
||||||
paged decode to within 1.8% of stock. The win grows with context (npl=8, tile vs
|
|
||||||
vec decode step): 1024 -2.3%, 4096 -3.3%, 8192 and 16384 wider, as attention
|
|
||||||
takes a larger share of the step.
|
|
||||||
|
|
||||||
Why not the split-K tune: the vec decode grid is already block-saturated
|
|
||||||
(1 x parallel_blocks 3 x 2048 = 6144 blocks ~ 43 waves over 144 resident on 48
|
|
||||||
SM), so raising parallel_blocks / KV_max adds no SM fill. The under-saturation is
|
|
||||||
intra-SM (occupancy + the 8x KV re-streaming), which GQA grouping attacks
|
|
||||||
directly; more split-K does not.
|
|
||||||
|
|
||||||
Correctness (greedy, GGML_CUDA_DISABLE_GRAPHS=1):
|
|
||||||
- CPU plumbing gate (Qwen3-0.6B, build-cpu, paged-on vs off): BYTE-IDENTICAL.
|
|
||||||
- GPU 0.6B gqa=2, 8 seq x 48 tok: tile is token-identical to the inc-1 vec path
|
|
||||||
in 7/8 sequences; the 8th diverges at token 5, within the same kernel-noise
|
|
||||||
band where vec also drifts from stock. Stock uses the mma kernel for this
|
|
||||||
multi-stream GQA shape, so a different kernel = different rounding =
|
|
||||||
autoregressive token drift; vec and tile agree with each other while both
|
|
||||||
differ from stock (both pick 15678 where stock picks 38835), confirming the
|
|
||||||
drift is kernel choice, not a paging error.
|
|
||||||
- GPU 32B gqa=8, 4 seq x 40 tok: tile tracks stock at least as well as vec
|
|
||||||
(seq3: tile == stock == 624 at the token where vec picked 13).
|
|
||||||
|
|
||||||
Stock is byte-identical: the dispatch guard only diverts when the block table
|
|
||||||
(src[5]) is set; the non-paged best-kernel switch is untouched. The ncols2>1 tile
|
|
||||||
path reads the last nbatch_fa tile with oob_check=false and relies on the mask
|
|
||||||
-inf padding - the same pattern stock uses for ncols2>1 - and the compacted paged
|
|
||||||
mask is gathered to the n_view (GGML_PAD 256) width so it carries that padding.
|
|
||||||
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
---
|
|
||||||
ggml/src/ggml-cuda/fattn.cu | 51 ++++++++++++++++++++++++++-----------
|
|
||||||
1 file changed, 36 insertions(+), 15 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
|
|
||||||
index afcafa2..6b15810 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/fattn.cu
|
|
||||||
+++ b/ggml/src/ggml-cuda/fattn.cu
|
|
||||||
@@ -580,32 +580,53 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|
||||||
// silently read the wrong (contiguous physical) cells. So when a block table
|
|
||||||
// is present we route here and NEVER fall through to the best-kernel switch
|
|
||||||
// below - no decode shape can silently reach an mma/wmma misread. build_attn
|
|
||||||
- // only sets src[5] for the 1-token-per-stream decode shape; the vec
|
|
||||||
+ // only sets src[5] for the 1-token-per-stream decode shape; the vec/tile
|
|
||||||
// dispatcher GGML_ABORTs for an unsupported D/type rather than mis-reading,
|
|
||||||
// and any shape that should not be paged must take the host-side gather path
|
|
||||||
// (LLAMA_KV_PAGED_GATHER=1) instead.
|
|
||||||
//
|
|
||||||
- // Default route = vec (inc-1, byte-validated: vec-paged == stock at -s 1 and
|
|
||||||
- // CPU byte-identical). LLAMA_KV_PAGED_TILE=1 routes the same shape to the
|
|
||||||
- // tile kernel; the tile in-kernel read is plumbed (fattn-tile.cuh) for the
|
|
||||||
- // increment-3 GQA head-group reuse, but is EXPERIMENTAL / NOT yet byte-
|
|
||||||
- // validated: the GQA-grouped (ncols2>1) tile path reads a full nbatch_fa tile
|
|
||||||
- // with oob_check=false while the compacted paged mask is not padded to cover
|
|
||||||
- // it, so it diverges from stock. Not for production paged decode until
|
|
||||||
- // increment-3 bounds that path; the default vec route is unaffected.
|
|
||||||
+ // Default route = the GQA-grouped TILE kernel (inc-3) WHEN it is both correct
|
|
||||||
+ // and a win, else the inc-1 vec path. Tile groups the q-heads that share one
|
|
||||||
+ // kv-head (ncols2), loading each K/V row once for the whole group instead of
|
|
||||||
+ // once per q-head, and runs at higher occupancy than vec (108-128 regs vs 168).
|
|
||||||
+ // Two constraints make this conditional: (1) the tile kernel has no K/V type
|
|
||||||
+ // template - it loads half2 - so a non-F16 cache (BF16/quantized) would be
|
|
||||||
+ // converted by launch_fattn to a contiguous F16 copy, which breaks the
|
|
||||||
+ // in-kernel block-table read (the table indexes the original paged layout, not
|
|
||||||
+ // the copy); vec instead reads the original cache with in-kernel dequant, so it
|
|
||||||
+ // is the only correct paged path for non-F16 caches. (2) the head-group reuse
|
|
||||||
+ // only helps when gqa_ratio>=2. So route to tile only for {F16 K and V,
|
|
||||||
+ // gqa_ratio>=2}; everything else stays on vec, matching stock (which also sends
|
|
||||||
+ // quantized-cache decode to the vector kernel). Measured on GB10 (Qwen3-32B
|
|
||||||
+ // nvfp4, F16 cache, gqa 8, batch 32, 1024 ctx): tile 177.9 ms/step vs vec 186.3
|
|
||||||
+ // vs stock 174.8 - GQA grouping recovers ~4.5% over the inc-1 vec default and
|
|
||||||
+ // brings paged decode to within ~1.8% of stock. Validated token-coherent with vec:
|
|
||||||
+ // 0.6B 8-seq 7/8 identical (8th within the kernel-noise band where vec also
|
|
||||||
+ // drifts from stock), 32B gqa=8 tile tracks stock at least as well as vec, CPU
|
|
||||||
+ // plumbing gate byte-identical. The ncols2>1 tile path reads the last nbatch_fa
|
|
||||||
+ // tile with oob_check=false relying on mask -inf padding (the SAME pattern stock
|
|
||||||
+ // uses for ncols2>1); the compacted paged mask is gathered to the n_view
|
|
||||||
+ // (GGML_PAD 256) width so it carries that padding. LLAMA_KV_PAGED_VEC=1 forces
|
|
||||||
+ // the inc-1 vec path for A/B.
|
|
||||||
if (dst->src[5] != nullptr) {
|
|
||||||
- static const bool paged_tile = getenv("LLAMA_KV_PAGED_TILE") != nullptr;
|
|
||||||
+ const ggml_tensor * Qp = dst->src[0];
|
|
||||||
+ const ggml_tensor * Kp = dst->src[1];
|
|
||||||
+ const ggml_tensor * Vp = dst->src[2];
|
|
||||||
+ const bool kv_f16 = Kp->type == GGML_TYPE_F16 && Vp->type == GGML_TYPE_F16;
|
|
||||||
+ const int64_t gqa_ratio = Kp->ne[2] > 0 ? Qp->ne[2] / Kp->ne[2] : 1;
|
|
||||||
+ const bool force_vec = getenv("LLAMA_KV_PAGED_VEC") != nullptr;
|
|
||||||
+ const bool use_tile = !force_vec && kv_f16 && gqa_ratio >= 2;
|
|
||||||
if (getenv("LLAMA_KV_PAGED_DISPATCH_LOG") != nullptr) {
|
|
||||||
static bool logged = false;
|
|
||||||
if (!logged) {
|
|
||||||
logged = true;
|
|
||||||
- fprintf(stderr, "[paged] decode src[5] set -> routing to %s (Q ne=[%ld,%ld,%ld,%ld])\n",
|
|
||||||
- paged_tile ? "TILE(experimental)" : "VEC",
|
|
||||||
- (long) dst->src[0]->ne[0], (long) dst->src[0]->ne[1],
|
|
||||||
- (long) dst->src[0]->ne[2], (long) dst->src[0]->ne[3]);
|
|
||||||
+ fprintf(stderr, "[paged] decode src[5] set -> routing to %s (Q ne=[%ld,%ld,%ld,%ld] gqa=%ld kv_f16=%d)\n",
|
|
||||||
+ use_tile ? "TILE(gqa)" : "VEC",
|
|
||||||
+ (long) Qp->ne[0], (long) Qp->ne[1], (long) Qp->ne[2], (long) Qp->ne[3],
|
|
||||||
+ (long) gqa_ratio, (int) kv_f16);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
- if (paged_tile) {
|
|
||||||
+ if (use_tile) {
|
|
||||||
ggml_cuda_flash_attn_ext_tile(ctx, dst);
|
|
||||||
} else {
|
|
||||||
ggml_cuda_flash_attn_ext_vec(ctx, dst);
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,50 +0,0 @@
|
|||||||
From 6e3e976e2b11adb05519f31dd5aad0c204678f5c Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Tue, 23 Jun 2026 11:12:05 +0200
|
|
||||||
Subject: [PATCH] feat(paged): assert mask-pad invariant for the paged tile
|
|
||||||
route (patch 0012)
|
|
||||||
|
|
||||||
The now-default paged decode route (GQA-grouped fattn-tile kernel) does not
|
|
||||||
leak past-end KV rows only because the compacted mask/block-table length is
|
|
||||||
padded to a whole number of flash-attn KV tiles: n_view = GGML_PAD(n_gather,
|
|
||||||
256), and the tile (nbatch_fa = 64 for head_dim 128) divides 256, so the last
|
|
||||||
tile sits entirely inside the -inf pad window. That invariant was implicit.
|
|
||||||
|
|
||||||
Add a defensive GGML_ASSERT(n_view % 64 == 0) right after the pad/clamp so a
|
|
||||||
future change to the pad (e.g. < 256) or the tile (> 256) that broke the
|
|
||||||
whole-tile property cannot silently reintroduce the leak. Additive only, no
|
|
||||||
behaviour change.
|
|
||||||
|
|
||||||
Verified: build-cpu compiles, and the paged CPU byte gate (LLAMA_KV_PAGED off
|
|
||||||
vs on, Qwen3-0.6B-Q8_0, greedy, -ngl 0) stays byte-identical while the assert
|
|
||||||
stays silent (n_view remains a whole number of tiles across all decode steps).
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
---
|
|
||||||
src/paged-attn.cpp | 9 +++++++++
|
|
||||||
1 file changed, 9 insertions(+)
|
|
||||||
|
|
||||||
diff --git a/src/paged-attn.cpp b/src/paged-attn.cpp
|
|
||||||
index 8eebeaa..fed8ca9 100644
|
|
||||||
--- a/src/paged-attn.cpp
|
|
||||||
+++ b/src/paged-attn.cpp
|
|
||||||
@@ -201,6 +201,15 @@ bool in_kernel_decode(ggml_context * ctx0,
|
|
||||||
n_view = K->ne[2];
|
|
||||||
}
|
|
||||||
|
|
||||||
+ // The flash-attn KV tile is 64 rows wide (nbatch_fa for head_dim 128). n_view must be
|
|
||||||
+ // a whole number of such tiles so the in-kernel decode never reads past the gathered
|
|
||||||
+ // rows: the trailing pad cells [n_gather, n_view) are all -inf, so any tile straddling
|
|
||||||
+ // the boundary still contributes zero. This holds today only because the pad (256) is a
|
|
||||||
+ // multiple of the tile; a future pad < 256 (or nbatch_fa > 256) that broke it would
|
|
||||||
+ // silently reintroduce a past-end KV leak, so assert it rather than trust it.
|
|
||||||
+ // pad must be a multiple of the flash-attn KV tile so the last tile is fully inside the -inf pad
|
|
||||||
+ GGML_ASSERT(n_view % 64 == 0);
|
|
||||||
+
|
|
||||||
ggml_tensor * idx = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_view, n_stream);
|
|
||||||
ggml_set_input(idx);
|
|
||||||
res->add_input(llm_graph_input_ptr(new input_block_table(mctx, idx, (uint32_t) n_view)));
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,137 +0,0 @@
|
|||||||
From 17d97cb74e3e8c93751afd33f5c183e57056fde9 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Tue, 23 Jun 2026 11:52:45 +0200
|
|
||||||
Subject: [PATCH] feat(paged): decoupled per-step prefill-token budget (patch
|
|
||||||
0013)
|
|
||||||
|
|
||||||
llama-server already co-batches decode with chunked prefill: update_slots()
|
|
||||||
appends every generating slot's sampled token first, then fills the rest of the
|
|
||||||
n_batch budget with prompt tokens, deferring the overflow to the next step. But
|
|
||||||
the prefill chunk size is hard-wired to n_batch (default 2048): one slot's
|
|
||||||
~2048-token prefill chunk lands in a single compute-heavy step, and every decode
|
|
||||||
co-batched into that step sees a multi-second inter-token-latency (ITL) spike.
|
|
||||||
Lowering n_batch shrinks the chunk but also caps decode-concurrency width and
|
|
||||||
prefill throughput, because they are coupled.
|
|
||||||
|
|
||||||
Add LLAMA_PREFILL_BUDGET: a per-step prefill-token budget decoupled from n_batch
|
|
||||||
(the analogue of vLLM's --max-num-batched-tokens / long_prefill_token_threshold).
|
|
||||||
The prompt-fill loop and the outer slot loop now also stop once this many prompt
|
|
||||||
tokens have been added in the current update_slots() step, so a long prefill is
|
|
||||||
split across more steps that each still advance in-flight decode. Default (env
|
|
||||||
unset or <= 0) = disabled, so stock behaviour is byte-identical. Orthogonal to
|
|
||||||
LLAMA_KV_PAGED: this is a pure scheduler knob and works with paged off.
|
|
||||||
|
|
||||||
Measured on GB10 (sm_121), dense Qwen3-32B-NVFP4, paged build, 8 steady decode
|
|
||||||
streams with one 6000-token prefill injected mid-stream; same binary, only
|
|
||||||
LLAMA_PREFILL_BUDGET differs:
|
|
||||||
|
|
||||||
metric stock(off) budget=256 budget=512
|
|
||||||
worst decode freeze (ms) 3380 482 (7.0x) 778 (4.3x)
|
|
||||||
median decode ITL in window 2264 411 (5.5x) 689
|
|
||||||
decode_stall (ms) 3285 387 (8.5x) 684 (4.8x)
|
|
||||||
decode steps during prefill 38 201 (5.3x) 108
|
|
||||||
injected-req TTFT (ms) 8493 10172 (+20%) 8432 (~0%)
|
|
||||||
steady-state baseline ITL 94 95 94
|
|
||||||
|
|
||||||
This is a LATENCY/fairness lever, not an aggregate-throughput lever: it flattens
|
|
||||||
the decode ITL spike a long prefill inflicts on co-batched decoders (8.5x smaller
|
|
||||||
decode stall and 5.3x more decode progress during the prefill at budget=256), in
|
|
||||||
exchange for a modest TTFT rise on the long request (the classic chunked-prefill
|
|
||||||
trade-off; budget=512 buys 4.8x with ~no TTFT cost). Steady aggregate decode is
|
|
||||||
unchanged: it is bandwidth/weight-capped on GB10 (the NVFP4 weight-read floor),
|
|
||||||
which the scheduler cannot lift.
|
|
||||||
|
|
||||||
Correctness (same model, greedy temp 0, fa on):
|
|
||||||
- budget unset or >= n_batch: byte-identical to stock (the added break never
|
|
||||||
fires before the existing n_batch break; the off-path is a no-op by
|
|
||||||
construction).
|
|
||||||
- short prompt (<= budget): byte-identical to stock.
|
|
||||||
- the knob is exactly equivalent to stock's native -b chunking: budget=512 ==
|
|
||||||
stock -b512 and budget=256 == stock -b256, both BYTE-IDENTICAL, while keeping
|
|
||||||
n_batch=2048 for decode width.
|
|
||||||
- on a prompt larger than the budget the chunked greedy output diverges from the
|
|
||||||
single n_batch chunk only by intrinsic flash-attn chunk-size FP grouping: PURE
|
|
||||||
stock -b256 diverges from stock -b2048 the same way with the patch inactive,
|
|
||||||
and the output stays coherent and answers correctly.
|
|
||||||
|
|
||||||
Productisation (LocalAI): surface as a model options knob (max_prefill_tokens /
|
|
||||||
mpt) parsed in grpc-server.cpp, default 0 = disabled, per CHUNKED_PREFILL_PLAN
|
|
||||||
Phase B; the vendored update_slots() hunk here is that plan's scheduler patch and
|
|
||||||
stays disjoint from the paged allocation hunks.
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
---
|
|
||||||
tools/server/server-context.cpp | 35 ++++++++++++++++++++++++++++++++-
|
|
||||||
1 file changed, 34 insertions(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
|
|
||||||
index 04c6361..5d83b30 100644
|
|
||||||
--- a/tools/server/server-context.cpp
|
|
||||||
+++ b/tools/server/server-context.cpp
|
|
||||||
@@ -2723,6 +2723,29 @@ private:
|
|
||||||
int32_t n_batch = llama_n_batch(ctx_tgt);
|
|
||||||
int32_t n_ubatch = llama_n_ubatch(ctx_tgt);
|
|
||||||
|
|
||||||
+ // PAGED serving lever (patch 0013): decoupled per-step prefill-token budget.
|
|
||||||
+ // Analogue of vLLM's --max-num-batched-tokens. Stock llama-server caps the prompt
|
|
||||||
+ // tokens ingested per update_slots() step at n_batch only; with cont_batching the
|
|
||||||
+ // sampled decode tokens of every generating slot are appended FIRST, then prompt
|
|
||||||
+ // tokens fill the batch up to n_batch. A long prompt therefore grabs an ~n_batch
|
|
||||||
+ // chunk in a SINGLE compute-heavy step, spiking the inter-token latency of every
|
|
||||||
+ // co-batched decoder (head-of-line jitter). LLAMA_PREFILL_BUDGET caps the prompt
|
|
||||||
+ // tokens added per step independently of n_batch, splitting a long prefill across
|
|
||||||
+ // more steps so in-flight decode keeps advancing smoothly. Default (env unset or
|
|
||||||
+ // <=0) = disabled => stock behavior is byte-identical. Orthogonal to LLAMA_KV_PAGED
|
|
||||||
+ // (this is a pure scheduler knob; works with paged off).
|
|
||||||
+ int32_t n_prefill_budget = 0; // 0 = disabled (stock n_batch-only chunking)
|
|
||||||
+ {
|
|
||||||
+ const char * env_pb = getenv("LLAMA_PREFILL_BUDGET");
|
|
||||||
+ if (env_pb) {
|
|
||||||
+ const int v = atoi(env_pb);
|
|
||||||
+ if (v > 0) {
|
|
||||||
+ n_prefill_budget = std::min(n_batch, std::max(1, v));
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+ int32_t n_prompt_budgeted = 0; // prompt tokens added to the batch this step (across slots)
|
|
||||||
+
|
|
||||||
float alora_scale = -1.0f;
|
|
||||||
size_t alora_disabled_id = 0;
|
|
||||||
|
|
||||||
@@ -3159,7 +3182,10 @@ private:
|
|
||||||
const bool n_before_user_known = n_before_user > 0;
|
|
||||||
|
|
||||||
// add prompt tokens for processing in the current batch
|
|
||||||
- while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) {
|
|
||||||
+ // (patch 0013) also stop once the per-step prefill budget is spent, so a long
|
|
||||||
+ // prompt is split across more steps and leaves batch room for co-batched decode
|
|
||||||
+ while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch &&
|
|
||||||
+ (n_prefill_budget == 0 || n_prompt_budgeted < n_prefill_budget)) {
|
|
||||||
// get next token to process
|
|
||||||
llama_token cur_tok = input_tokens[slot.prompt.n_tokens()];
|
|
||||||
if (cur_tok == LLAMA_TOKEN_NULL) {
|
|
||||||
@@ -3185,6 +3211,7 @@ private:
|
|
||||||
slot.prompt.tokens.push_back(cur_tok);
|
|
||||||
|
|
||||||
slot.n_prompt_tokens_processed++;
|
|
||||||
+ n_prompt_budgeted++; // (patch 0013) count toward the per-step prefill budget
|
|
||||||
|
|
||||||
// stop the prompt batch exactly before the latest user input, so a checkpoint
|
|
||||||
// can be created after the previous messages
|
|
||||||
@@ -3293,6 +3320,12 @@ private:
|
|
||||||
if (batch.n_tokens >= n_batch) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
+
|
|
||||||
+ // (patch 0013) stop adding prompts once the per-step prefill budget is spent,
|
|
||||||
+ // leaving the remaining batch capacity for co-batched decode of other slots
|
|
||||||
+ if (n_prefill_budget > 0 && n_prompt_budgeted >= n_prefill_budget) {
|
|
||||||
+ break;
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,140 +0,0 @@
|
|||||||
From 652b858252b354f4d4fb49e5ed7468eeee8e32fc Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Tue, 23 Jun 2026 15:47:06 +0200
|
|
||||||
Subject: [PATCH] feat(paged): expert-aware MoE token-tile cap (patch 0014)
|
|
||||||
|
|
||||||
On GB10 (sm_121) the Qwen3-30B-A3B-class mxfp4 MoE decode path already uses the
|
|
||||||
sorted grouped FP4-MMA GEMM (MUL_MAT_ID -> ggml_cuda_mul_mat_q ids branch:
|
|
||||||
mm_ids_helper moe_align/scatter + one persistent stream-k mul_mat_q), so the
|
|
||||||
originally reported npl128 throughput cliff does NOT reproduce on this build.
|
|
||||||
llama-batched-bench decode (S_TG t/s) is monotonic across batch:
|
|
||||||
|
|
||||||
npl 1 8 32 64 128 256
|
|
||||||
S_TG 85 282 629 935 1295 1779 (stock, mxfp4 MoE, -fa on)
|
|
||||||
|
|
||||||
There is no knee to erase; the old cliff (a real high-batch regression, 620 t/s
|
|
||||||
at npl128) was fixed upstream by grouped-mmq + MoE stream-k load balancing.
|
|
||||||
|
|
||||||
What remains is a pure tile-shape micro-inefficiency. In mul_mat_q_case the
|
|
||||||
token-tile width mmq_x is chosen to cover ncols_max (= ne12, the per-expert
|
|
||||||
column upper bound = token count, up to 128) in one column-tile. At MoE decode
|
|
||||||
the per-expert token density is ~ne12*k/n_experts (top-8 of 128 => ~1/16 of
|
|
||||||
ne12, e.g. ~8 tokens/expert at npl128), so each expert's single mmq_x-wide
|
|
||||||
col-tile is only ~6% filled: the MMA accumulator tile is mmq_x-wide at compile
|
|
||||||
time and burns throughput on the padding columns while the larger y-tile lowers
|
|
||||||
occupancy. Stock picks the LARGEST tile (128) where the SMALLEST tile that still
|
|
||||||
covers the density would raise fill + occupancy at no extra weight read (at
|
|
||||||
tokens/expert <= mmq_x there is exactly one non-empty col-tile per expert; the
|
|
||||||
emptier tiles are skipped by the jt*mmq_x >= col_diff guard in the stream-k
|
|
||||||
kernel) - the inverse of vLLM's small per-expert BLOCK_SIZE_M.
|
|
||||||
|
|
||||||
Add LLAMA_MOE_MMQ_X: an env cap on mmq_x for the MUL_MAT_ID path only
|
|
||||||
(expert_bounds != nullptr). Default (unset or <= 0) = disabled, so the mmq_x
|
|
||||||
selection, and therefore every kernel launched, is byte-identical to stock. The
|
|
||||||
cap only ever lowers the loop's upper bound and still selects from the same
|
|
||||||
granularity- and shared-memory-validated mmq_x set stock already uses for
|
|
||||||
smaller batches, so no new kernel configuration is exercised.
|
|
||||||
|
|
||||||
Measured on GB10, qwen3coder-mxfp4.gguf, -fa on, -npp 128 -ntg 128, same binary,
|
|
||||||
only LLAMA_MOE_MMQ_X differs (decode S_TG t/s / prefill S_PP t/s):
|
|
||||||
|
|
||||||
npl stock S_TG cap64 S_TG d% stock S_PP cap64 S_PP
|
|
||||||
64 936 938 +0.1 2924 2883
|
|
||||||
128 1295 1357 +4.8 3075 3038
|
|
||||||
256 1784 1825 +2.3 3085 3046
|
|
||||||
|
|
||||||
(reproduced across interleaved reps; cap64 npl128 = 1357.5/1357.0, very stable)
|
|
||||||
|
|
||||||
cap64 lifts high-batch decode +4.8% (npl128) / +2.3% (npl256), neutral at
|
|
||||||
npl <= 64, for a consistent ~1.3% prefill cost. Smaller caps are net-negative:
|
|
||||||
cap16 / cap32 crater prefill -41% / -17% (a 512-token prefill ubatch has ~32
|
|
||||||
tokens/expert, which overflows a 16/32-wide tile into extra col-tiles + weight
|
|
||||||
re-reads), so 64 is the recommended value and the only one that helps net.
|
|
||||||
|
|
||||||
Honest framing: this is NOT a cliff fix (no cliff exists) and not a real-server
|
|
||||||
throughput unlock (llama-server continuous batching already scales). It is a
|
|
||||||
modest high-effective-batch DECODE micro-optimization that matches vLLM's
|
|
||||||
smaller per-expert M-tiling, surfaced as an opt-in, default-off knob. The
|
|
||||||
durable density-aware auto-select (drop the blunt global cap, choose mmq_x from
|
|
||||||
ne_get_rows / n_active_experts so prefill keeps its large tile) is scoped in
|
|
||||||
patches/paged/MOE_GROUPED_GEMM_SCOPE.md.
|
|
||||||
|
|
||||||
Correctness: greedy temp-0 llama-server output with cap64 is byte-identical to
|
|
||||||
stock for single-stream generation (fibonacci / capital-of-France / photosynthesis
|
|
||||||
prompts) and stays coherent; batched-bench ran thousands of capped MoE matmuls at
|
|
||||||
npl128/256 (mmq_x forced 128 -> 64) with no CUDA error / NaN and stable output.
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
---
|
|
||||||
ggml/src/ggml-cuda/mmq.cuh | 37 ++++++++++++++++++++++++++++++++++++-
|
|
||||||
1 file changed, 36 insertions(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
|
||||||
index edf546d..cff608e 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
|
||||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
|
||||||
@@ -6,6 +6,7 @@
|
|
||||||
|
|
||||||
#include <climits>
|
|
||||||
#include <cstdint>
|
|
||||||
+#include <cstdlib>
|
|
||||||
|
|
||||||
using namespace ggml_cuda_mma;
|
|
||||||
|
|
||||||
@@ -4052,6 +4053,18 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
+// [paged patch 0014] MoE token-tile (mmq_x) cap, read once from env LLAMA_MOE_MMQ_X.
|
|
||||||
+// Returns 0 when unset / non-positive => disabled (stock mmq_x selection, byte-identical).
|
|
||||||
+// On the MUL_MAT_ID grouped-GEMM path this caps the per-expert column-tile width toward the
|
|
||||||
+// low MoE-decode per-expert token density, raising tile fill + occupancy (see mul_mat_q_case).
|
|
||||||
+static inline int ggml_cuda_moe_mmq_x_cap() {
|
|
||||||
+ static const int cap = []() -> int {
|
|
||||||
+ const char * s = getenv("LLAMA_MOE_MMQ_X");
|
|
||||||
+ return s ? atoi(s) : 0;
|
|
||||||
+ }();
|
|
||||||
+ return cap;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
template <ggml_type type>
|
|
||||||
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
|
||||||
const int id = ggml_cuda_get_device();
|
|
||||||
@@ -4063,10 +4076,32 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
|
||||||
const int mmq_x_max = get_mmq_x_max_host(cc);
|
|
||||||
const int mmq_y = get_mmq_y_host(cc);
|
|
||||||
|
|
||||||
+ // [paged patch 0014] expert-aware MoE token-tile (mmq_x) cap.
|
|
||||||
+ // On the MUL_MAT_ID grouped-GEMM path (expert_bounds != nullptr) the GEMM columns are
|
|
||||||
+ // tokens sorted by expert; stock picks mmq_x to cover ncols_max (= ne12, the token count,
|
|
||||||
+ // up to 128) in a single column-tile. At MoE decode the per-expert token density is low
|
|
||||||
+ // (top-k of many experts: ~ne12*k/n_experts tokens/expert, e.g. ~8 at npl128 for
|
|
||||||
+ // Qwen3-30B-A3B top-8/128), so each expert's single mmq_x-wide col-tile is mostly empty:
|
|
||||||
+ // the MMA accumulator tile is mmq_x-wide at compile time and wastes throughput on the
|
|
||||||
+ // padding columns while the larger y-tile lowers occupancy. Capping mmq_x toward the
|
|
||||||
+ // per-expert density raises tile fill + occupancy with no extra weight reads (at
|
|
||||||
+ // tokens/expert <= mmq_x there is still exactly one non-empty col-tile per expert; the
|
|
||||||
+ // emptier tiles are skipped by the jt*mmq_x >= col_diff guard in the stream-k kernel).
|
|
||||||
+ // Default (env unset or <= 0) = disabled => mmq_x selection is byte-identical to stock;
|
|
||||||
+ // off the ids path the cap never applies.
|
|
||||||
+ int mmq_x_lim = mmq_x_max;
|
|
||||||
+ if (args.expert_bounds != nullptr) {
|
|
||||||
+ const int moe_cap = ggml_cuda_moe_mmq_x_cap();
|
|
||||||
+ if (moe_cap > 0) {
|
|
||||||
+ const int cap = moe_cap < 8 ? 8 : moe_cap;
|
|
||||||
+ mmq_x_lim = cap < mmq_x_max ? cap : mmq_x_max;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
int mmq_x_best = 0;
|
|
||||||
int ntiles_x_best = INT_MAX;
|
|
||||||
|
|
||||||
- for (int mmq_x = 8; mmq_x <= mmq_x_max && ntiles_x_best > 1; mmq_x += 8) {
|
|
||||||
+ for (int mmq_x = 8; mmq_x <= mmq_x_lim && ntiles_x_best > 1; mmq_x += 8) {
|
|
||||||
const int granularity = mmq_get_granularity_host(mmq_x, cc);
|
|
||||||
|
|
||||||
if (mmq_x % granularity != 0 || mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc, warp_size, nwarps) > smpbo) {
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,238 +0,0 @@
|
|||||||
From 151343bc8c7b956c99eafc855704b70d44637a3b Mon Sep 17 00:00:00 2001
|
|
||||||
From: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
Date: Tue, 23 Jun 2026 21:03:00 +0200
|
|
||||||
Subject: [PATCH] feat(paged): expert-density-aware MoE token-tile auto-select
|
|
||||||
(patch 0015)
|
|
||||||
|
|
||||||
The durable follow-up to patch 0014's blunt LLAMA_MOE_MMQ_X global cap (which the
|
|
||||||
0014 doc itself scoped): replace the manual env cap with a host-side, default-on
|
|
||||||
auto-select inside mul_mat_q_case that picks a small token-tile (mmq_x) for the
|
|
||||||
MUL_MAT_ID grouped FP4-MMA GEMM only when the per-expert token density is low
|
|
||||||
(decode), and keeps the large 128-wide tile when density is high (prefill). No new
|
|
||||||
kernel: the selection only lowers the loop's upper bound to an already-compiled,
|
|
||||||
granularity- and shared-memory-validated mmq_x.
|
|
||||||
|
|
||||||
Density is estimated host-side from the args the ids path already passes:
|
|
||||||
ne_get_rows = ncols_dst = ne12 * n_expert_used (token-expert assignments)
|
|
||||||
n_experts = nchannels_x = ne02
|
|
||||||
density = ceil(ne_get_rows / min(ne_get_rows, n_experts)) (tokens/expert)
|
|
||||||
Cap to the small tile (default 64) only when density <= density_max. Unlike 0014's
|
|
||||||
global cap, the high-density prefill ubatch stays on the big tile, so S_PP does not
|
|
||||||
regress by construction.
|
|
||||||
|
|
||||||
density_max default = 8 (not tile/4 = 16). The cap must fire for decode but not for
|
|
||||||
a prefill ubatch, and each has per-expert density n_tokens*n_used/n_experts. At the
|
|
||||||
standard n_ubatch=512, n_used=8: prefill density = 4096/n_experts (32 at 128 experts,
|
|
||||||
16 at 256), decode at npl<=128 is <= 1024/n_experts (8 at 128, 4 at 256). Default 8
|
|
||||||
sits between the two (>= decode, < prefill) for every n_experts in [128,511], so it caps decode and leaves
|
|
||||||
prefill on the big tile. tile/4 (=16) equalled the 256-expert prefill density and
|
|
||||||
cratered its S_PP by ~2%, the regression this threshold exists to avoid.
|
|
||||||
|
|
||||||
Measured on GB10 (sm_121), Qwen3.6-35B-A3B NVFP4 (256 experts, top-8, GDN linear
|
|
||||||
attention), llama-batched-bench -fa on -npp 128 -ntg 128, default-on vs stock
|
|
||||||
(LLAMA_MOE_AUTO_TILE=0), median of 5 reps:
|
|
||||||
|
|
||||||
npl S_TG stock S_TG 0015 dTG% S_PP stock S_PP 0015 dPP%
|
|
||||||
8 183.59 183.18 -0.22% 1489.2 1500.1 +0.73%
|
|
||||||
32 264.02 263.44 -0.22% 2034.5 2033.5 -0.05%
|
|
||||||
64 311.76 310.41 -0.43% 2028.3 2027.6 -0.03%
|
|
||||||
128 336.10 337.32 +0.36% 2025.0 2027.7 +0.13%
|
|
||||||
|
|
||||||
Honest read: on THIS model the decode effect is within run-to-run noise (neutral)
|
|
||||||
and prefill is neutral. Qwen3.6-35B-A3B decode is bound by the GDN/SSM recurrence and
|
|
||||||
256 tiny-expert weight bandwidth, not the MoE col-tile occupancy, so the col-tile
|
|
||||||
lever (worth +4.8% @npl128 on Qwen3-Coder-30B, 128 larger experts, patch 0014
|
|
||||||
cap64) does not move it. An npl128 tile sweep on this model confirms 64 is the only
|
|
||||||
useful width (TILE8 -6.3%, TILE16 -3.2%, TILE32 -0.2%, TILE64 +0.7%, TILE96 -0.8%):
|
|
||||||
smaller tiles lose to grid/scheduling overhead and the FP4-MMA minimum width.
|
|
||||||
|
|
||||||
Value banked default-on: (1) removes 0014's ~1.3% prefill cost by construction
|
|
||||||
(density-gated, not global); (2) auto-selects the small tile for col-tile-bound MoE
|
|
||||||
decode, reproducing 0014 cap64's tile=64 at npl128 by construction, so it preserves
|
|
||||||
the +4.8% on Qwen3-Coder-30B without the prefill cost; (3) prefill-safe and decode-
|
|
||||||
neutral on the SSM model, harmless where it does not help. Conservative by design:
|
|
||||||
at npl256 the qwen3coder decode density (16) equals the 256-expert prefill density
|
|
||||||
(16), indistinguishable to a pure-density gate, so density_max=8 forgoes 0014's
|
|
||||||
+2.3% @npl256 to keep 256-expert prefill safe; an ne12-aware refinement is future
|
|
||||||
work.
|
|
||||||
|
|
||||||
LLAMA_MOE_MMQ_X (patch 0014) is KEPT as a manual override that, when > 0, forces the
|
|
||||||
old blunt global cap and bypasses the auto-select (explicit A/B knob). The auto-
|
|
||||||
select is the default; LLAMA_MOE_AUTO_TILE=0 restores exact stock mmq_x selection.
|
|
||||||
LLAMA_MOE_DECODE_TILE / LLAMA_MOE_DENSITY_MAX tune the small tile / threshold.
|
|
||||||
|
|
||||||
Correctness: extends tests/test-backend-ops test_mul_mat_id with a ragged small-M
|
|
||||||
NVFP4/MXFP4 MoE decode-density gate (128 experts, top-8, m=768, k=2048, n in
|
|
||||||
{16,33,64,128,130,200,256,512} spanning the cap boundary and ragged token counts).
|
|
||||||
All 16 shapes pass CUDA-vs-CPU oracle on GB10 both default-on and with
|
|
||||||
LLAMA_MOE_AUTO_TILE=0; full MUL_MAT_ID suite 2/2 backends OK. Off the ids path
|
|
||||||
nothing changes (non-MoE mul_mat byte-identical to stock).
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
|
|
||||||
---
|
|
||||||
ggml/src/ggml-cuda/mmq.cuh | 100 ++++++++++++++++++++++++++++++-------
|
|
||||||
tests/test-backend-ops.cpp | 16 ++++++
|
|
||||||
2 files changed, 99 insertions(+), 17 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
|
|
||||||
index cff608e..9718b12 100644
|
|
||||||
--- a/ggml/src/ggml-cuda/mmq.cuh
|
|
||||||
+++ b/ggml/src/ggml-cuda/mmq.cuh
|
|
||||||
@@ -4053,10 +4053,11 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
-// [paged patch 0014] MoE token-tile (mmq_x) cap, read once from env LLAMA_MOE_MMQ_X.
|
|
||||||
-// Returns 0 when unset / non-positive => disabled (stock mmq_x selection, byte-identical).
|
|
||||||
-// On the MUL_MAT_ID grouped-GEMM path this caps the per-expert column-tile width toward the
|
|
||||||
-// low MoE-decode per-expert token density, raising tile fill + occupancy (see mul_mat_q_case).
|
|
||||||
+// [paged patch 0014] MoE token-tile (mmq_x) MANUAL cap, read once from env LLAMA_MOE_MMQ_X.
|
|
||||||
+// Returns 0 when unset / non-positive => disabled (fall through to the patch-0015 auto-select).
|
|
||||||
+// When > 0 it forces a blunt GLOBAL cap on the per-expert column-tile width for the MUL_MAT_ID
|
|
||||||
+// grouped-GEMM path (decode AND prefill), overriding the density-aware auto-select below. Kept
|
|
||||||
+// as an explicit override / A-B knob; the default path is now the auto-select.
|
|
||||||
static inline int ggml_cuda_moe_mmq_x_cap() {
|
|
||||||
static const int cap = []() -> int {
|
|
||||||
const char * s = getenv("LLAMA_MOE_MMQ_X");
|
|
||||||
@@ -4065,6 +4066,43 @@ static inline int ggml_cuda_moe_mmq_x_cap() {
|
|
||||||
return cap;
|
|
||||||
}
|
|
||||||
|
|
||||||
+// [paged patch 0015] expert-density-aware MoE token-tile (mmq_x) auto-select knobs (DEFAULT-ON).
|
|
||||||
+// LLAMA_MOE_AUTO_TILE=0 disables the auto-select => exact stock mmq_x selection.
|
|
||||||
+static inline bool ggml_cuda_moe_auto_tile_enabled() {
|
|
||||||
+ static const bool en = []() -> bool {
|
|
||||||
+ const char * s = getenv("LLAMA_MOE_AUTO_TILE");
|
|
||||||
+ return !(s && atoi(s) == 0);
|
|
||||||
+ }();
|
|
||||||
+ return en;
|
|
||||||
+}
|
|
||||||
+// The small high-occupancy token-tile chosen for low-density (decode) MoE matmuls. Default 64:
|
|
||||||
+// the measured GB10 sweet spot (full per-expert fill with >=4x routing-imbalance headroom).
|
|
||||||
+static inline int ggml_cuda_moe_decode_tile() {
|
|
||||||
+ static const int t = []() -> int {
|
|
||||||
+ const char * s = getenv("LLAMA_MOE_DECODE_TILE");
|
|
||||||
+ const int v = s ? atoi(s) : 0;
|
|
||||||
+ return v >= 8 ? v : 64;
|
|
||||||
+ }();
|
|
||||||
+ return t;
|
|
||||||
+}
|
|
||||||
+// Per-expert token-density ceiling under which the small tile is selected. Default 8: the cap must
|
|
||||||
+// fire for decode but NOT for a prefill ubatch, and the per-expert density of each is
|
|
||||||
+// n_tokens*n_used/n_experts. For the standard n_ubatch=512, n_used=8 the prefill density is
|
|
||||||
+// 4096/n_experts (= 32 at 128 experts, 16 at 256 experts); decode at npl<=128 is <=1024/n_experts
|
|
||||||
+// (= 8 at 128 experts, 4 at 256). Default 8 sits between the two (>= decode, < prefill) for every n_experts in
|
|
||||||
+// [128,511], so it caps decode and leaves the prefill ubatch on the big 128 tile - whereas the old
|
|
||||||
+// tile/4 (=16) equalled the 256-expert prefill density and cratered its S_PP by ~2% (measured on
|
|
||||||
+// Qwen3.6-35B-A3B NVFP4). 8 also keeps >=8x fill headroom at tile 64 so an imbalanced expert
|
|
||||||
+// segment never splits into an extra col-tile.
|
|
||||||
+static inline int ggml_cuda_moe_density_max() {
|
|
||||||
+ static const int d = []() -> int {
|
|
||||||
+ const char * s = getenv("LLAMA_MOE_DENSITY_MAX");
|
|
||||||
+ const int v = s ? atoi(s) : 0;
|
|
||||||
+ return v > 0 ? v : 8;
|
|
||||||
+ }();
|
|
||||||
+ return d;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
template <ggml_type type>
|
|
||||||
void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
|
|
||||||
const int id = ggml_cuda_get_device();
|
|
||||||
@@ -4076,25 +4114,53 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
|
|
||||||
const int mmq_x_max = get_mmq_x_max_host(cc);
|
|
||||||
const int mmq_y = get_mmq_y_host(cc);
|
|
||||||
|
|
||||||
- // [paged patch 0014] expert-aware MoE token-tile (mmq_x) cap.
|
|
||||||
- // On the MUL_MAT_ID grouped-GEMM path (expert_bounds != nullptr) the GEMM columns are
|
|
||||||
- // tokens sorted by expert; stock picks mmq_x to cover ncols_max (= ne12, the token count,
|
|
||||||
- // up to 128) in a single column-tile. At MoE decode the per-expert token density is low
|
|
||||||
- // (top-k of many experts: ~ne12*k/n_experts tokens/expert, e.g. ~8 at npl128 for
|
|
||||||
- // Qwen3-30B-A3B top-8/128), so each expert's single mmq_x-wide col-tile is mostly empty:
|
|
||||||
- // the MMA accumulator tile is mmq_x-wide at compile time and wastes throughput on the
|
|
||||||
- // padding columns while the larger y-tile lowers occupancy. Capping mmq_x toward the
|
|
||||||
- // per-expert density raises tile fill + occupancy with no extra weight reads (at
|
|
||||||
- // tokens/expert <= mmq_x there is still exactly one non-empty col-tile per expert; the
|
|
||||||
- // emptier tiles are skipped by the jt*mmq_x >= col_diff guard in the stream-k kernel).
|
|
||||||
- // Default (env unset or <= 0) = disabled => mmq_x selection is byte-identical to stock;
|
|
||||||
- // off the ids path the cap never applies.
|
|
||||||
+ // [paged patch 0015] expert-density-aware MoE token-tile (mmq_x) auto-select (DEFAULT-ON).
|
|
||||||
+ // On the MUL_MAT_ID grouped-GEMM path (expert_bounds != nullptr) the GEMM columns are tokens
|
|
||||||
+ // sorted by expert; stock picks mmq_x to cover ncols_max (= ne12, the token count, up to 128)
|
|
||||||
+ // in a single column-tile, i.e. it MAXIMIZES the tile (128 on Blackwell) for the aggregate
|
|
||||||
+ // batch. But the tile is then applied PER EXPERT, and at MoE decode the per-expert token
|
|
||||||
+ // density is tiny (top-k of many experts), so each expert's single 128-wide col-tile is mostly
|
|
||||||
+ // empty: the MMA accumulator tile is mmq_x-wide at compile time and burns throughput on the
|
|
||||||
+ // padding columns while the larger y-tile lowers occupancy. vLLM's fused-MoE does the opposite
|
|
||||||
+ // (a small per-expert BLOCK_SIZE_M). We reproduce that here, host-side only, by picking a
|
|
||||||
+ // SMALLER mmq_x when - and only when - the per-expert density is low:
|
|
||||||
+ //
|
|
||||||
+ // ne_get_rows = args.ncols_dst = ne12 * n_expert_used (total token-expert assignments)
|
|
||||||
+ // n_experts = args.nchannels_x = ne02
|
|
||||||
+ // n_active_est = min(n_experts, ne_get_rows) (upper bound on active experts)
|
|
||||||
+ // density = ceil(ne_get_rows / n_active_est) (avg tokens per active expert)
|
|
||||||
+ //
|
|
||||||
+ // Cap to the small tile (default 64) only when density <= density_max (default 8). 8 sits below
|
|
||||||
+ // every prefill-ubatch density and at or above every decode density for n_experts in [128,511] at the
|
|
||||||
+ // standard n_ubatch=512 (prefill 4096/n_experts, decode <=1024/n_experts), with >=8x fill headroom
|
|
||||||
+ // so a capped expert segment never splits a col-tile. Decode (per-expert density 4 at 256 experts,
|
|
||||||
+ // 8 at 128 experts @npl128) gets the fuller high-occupancy tile; the prefill ubatch (density 16 at
|
|
||||||
+ // 256 / 32 at 128 experts) stays ABOVE the threshold and keeps the big
|
|
||||||
+ // 128 compute tile - so unlike the blunt global cap (LLAMA_MOE_MMQ_X / patch 0014) this is
|
|
||||||
+ // prefill-safe by construction. The selection only ever picks an already-compiled, granularity-
|
|
||||||
+ // and shared-memory-validated mmq_x that the loop below would consider for a smaller batch; no
|
|
||||||
+ // new kernel. Off the ids path (expert_bounds == nullptr) nothing changes => non-MoE mul_mat
|
|
||||||
+ // and the gated f16/bf16 host-loop fallback stay byte-identical to stock.
|
|
||||||
+ // - LLAMA_MOE_MMQ_X=<n> : manual blunt global cap, overrides the auto-select (patch 0014).
|
|
||||||
+ // - LLAMA_MOE_AUTO_TILE=0 : disable the auto-select (exact stock selection).
|
|
||||||
+ // - LLAMA_MOE_DECODE_TILE=<n>, LLAMA_MOE_DENSITY_MAX=<n> : tune the tile / threshold.
|
|
||||||
int mmq_x_lim = mmq_x_max;
|
|
||||||
if (args.expert_bounds != nullptr) {
|
|
||||||
const int moe_cap = ggml_cuda_moe_mmq_x_cap();
|
|
||||||
if (moe_cap > 0) {
|
|
||||||
const int cap = moe_cap < 8 ? 8 : moe_cap;
|
|
||||||
mmq_x_lim = cap < mmq_x_max ? cap : mmq_x_max;
|
|
||||||
+ } else if (ggml_cuda_moe_auto_tile_enabled()) {
|
|
||||||
+ const int64_t ne_get_rows = args.ncols_dst;
|
|
||||||
+ const int64_t n_experts = args.nchannels_x;
|
|
||||||
+ if (ne_get_rows > 0 && n_experts > 0) {
|
|
||||||
+ const int64_t n_active = ne_get_rows < n_experts ? ne_get_rows : n_experts;
|
|
||||||
+ const int64_t density = (ne_get_rows + n_active - 1) / n_active;
|
|
||||||
+ const int tile = ggml_cuda_moe_decode_tile();
|
|
||||||
+ if (density <= (int64_t) ggml_cuda_moe_density_max() && tile < mmq_x_max) {
|
|
||||||
+ mmq_x_lim = tile;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
|
|
||||||
index 15ae389..f219309 100644
|
|
||||||
--- a/tests/test-backend-ops.cpp
|
|
||||||
+++ b/tests/test-backend-ops.cpp
|
|
||||||
@@ -8575,6 +8575,22 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
|
||||||
// gpt-oss issue with Vulkan mmq_id
|
|
||||||
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880));
|
|
||||||
|
|
||||||
+ // [paged P0] MXFP4/NVFP4 qwen3-30b-a3b MoE decode-density regression gate for the expert-
|
|
||||||
+ // density-aware mmq_x auto-select (patch 0015). Real expert-FFN slice (128 experts, top-8,
|
|
||||||
+ // m=768, k=2048) so this exercises the exact grouped FP4-MMA mmq kernel the model runs.
|
|
||||||
+ // Per-expert token density = n*n_used/n_mats = n/16; cover the decode band (density 1/4/8/16
|
|
||||||
+ // at n 16/64/128/256), ragged token counts (n 33/130/200: experts with 0/1/2 tokens, n not a
|
|
||||||
+ // multiple of the tile) where the tiny-M col-tiles change geometry and any masking can leak,
|
|
||||||
+ // and a prefill-density shape (n 512 => density 32) the auto-select must leave on the large
|
|
||||||
+ // 128 tile. n=128 (density 8) is where stock picks mmq_x=128 and the auto-select picks 64, so the
|
|
||||||
+ // op-test (CPU oracle vs CUDA, deterministic) is the bit-exact regression gate for P1: it must
|
|
||||||
+ // pass with the auto-select on (default) and with LLAMA_MOE_AUTO_TILE=0 (stock selection).
|
|
||||||
+ for (ggml_type type_a : {GGML_TYPE_MXFP4, GGML_TYPE_NVFP4}) {
|
|
||||||
+ for (int n : {16, 33, 64, 128, 130, 200, 256, 512}) {
|
|
||||||
+ test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 128, 8, false, 768, n, 2048));
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
for (ggml_type type_a : all_types) {
|
|
||||||
test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 4, 2, false, 64, 16, 3*ggml_blck_size(type_a)));
|
|
||||||
}
|
|
||||||
--
|
|
||||||
2.43.0
|
|
||||||
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
# Additive layout for the paged-KV patch series - "hook, don't edit"
|
|
||||||
|
|
||||||
Goal: ship paged KV as a vendored patch series that **survives llama.cpp pin bumps with
|
|
||||||
minimal rebase pain**. PR #22569 (the upstream draft) was rejected by maintainers as
|
|
||||||
"slop" and is far too invasive to vendor - it rewrites core attention. Our series must be
|
|
||||||
the opposite: **additive**. This document is the design rule and the per-patch core-touch
|
|
||||||
budget.
|
|
||||||
|
|
||||||
## The rule
|
|
||||||
|
|
||||||
> Every change is either (a) **new code in a new vendored file** under `src/`, or (b) a
|
|
||||||
> **single, env-gated hook** at one call site in a core file that delegates to the new
|
|
||||||
> file. No logic lives in a core file. No core struct/signature is edited.
|
|
||||||
|
|
||||||
Why it works: a hook is a 1-3 line diff against a core file. When upstream churns that file,
|
|
||||||
`git apply` either still lands the hook (context unchanged) or fails *only on that tiny
|
|
||||||
hunk*, which is trivial to re-place. Logic embedded inside a core function (the PR #22569 /
|
|
||||||
old-0003 approach) conflicts on every bump and must be re-understood each time.
|
|
||||||
|
|
||||||
This is enforceable as a **core-touch budget**: each patch declares the core files it
|
|
||||||
touches and the line count; review rejects anything that grows logic in core.
|
|
||||||
|
|
||||||
## Why it's achievable here (grounded in the pinned source)
|
|
||||||
|
|
||||||
The two seams paged KV needs are both already abstract in llama.cpp at the pin
|
|
||||||
(`LLAMA_VERSION=f3e1828`), so new behavior plugs in without editing core types:
|
|
||||||
|
|
||||||
- **KV placement** - `llama_kv_cache::find_slot` already returns a `slot_info` of physical
|
|
||||||
cell indices. Paged placement is just *different indices*. 0002 already does this as one
|
|
||||||
gated block (`if (paged_mode) { ... continue; }`, 41 lines, one file). Ideal.
|
|
||||||
- **Graph inputs** - `llm_graph_input_i` is a pure-virtual base (`set_input()`), and
|
|
||||||
`llm_graph_result::add_input(llm_graph_input_ptr)` lets *any* code register a new input
|
|
||||||
subclass. So a paged graph input (the gather index) can be **a new class in a new file**,
|
|
||||||
added from a one-line hook - no edit to `llm_graph_input_attn_kv` or `llama-graph.h`.
|
|
||||||
|
|
||||||
## Per-patch core-touch budget
|
|
||||||
|
|
||||||
| # | Patch | New files (additive) | Core hooks (gated, minimal) | Core lines |
|
|
||||||
|---|-------|----------------------|------------------------------|-----------:|
|
|
||||||
| 0001 | vendor manager | `paged-kv-manager.{h,cpp}` | `CMakeLists.txt` +1 | 1 |
|
|
||||||
| 0002 | block placement | - | one `if(paged_mode){...continue;}` in `find_slot` | ~41 |
|
|
||||||
| 0003 | gather-read | `paged-attn.{h,cpp}` | `CMakeLists.txt` +1; **one** hook in `build_attn`; 2 tiny accessors on `llama_kv_cache_context` | ~8 |
|
|
||||||
| 0004 | on-demand alloc | (uses 0001 manager) | one branch in `find_slot` calling the manager | ~10 |
|
|
||||||
| 0005 | continuous batching | - | **LocalAI `grpc-server.cpp`** (already a LocalAI override, not a core patch) | 0 core |
|
|
||||||
| 0006 | prefix caching | (uses 0001 manager) | one hash-lookup hook in the 0004 alloc branch | ~6 |
|
|
||||||
|
|
||||||
Net core surface for the *entire* engine: `find_slot` (placement/alloc - where physical
|
|
||||||
cells are already chosen) + **one** line in `build_attn` + two accessors. Everything else
|
|
||||||
is new files or the LocalAI-side server loop.
|
|
||||||
|
|
||||||
## 0003 redesigned to the rule (replaces the 4-file-surgery plan)
|
|
||||||
|
|
||||||
The old `0003-gather-read-plan.md` edited `llama-kv-cache.{h,cpp}` + `llama-graph.{h,cpp}`
|
|
||||||
(including a field added to `llm_graph_input_attn_kv` and fill logic in its `set_input`).
|
|
||||||
The additive form removes the core-struct and core-`set_input` edits entirely:
|
|
||||||
|
|
||||||
**New file `src/paged-attn.{h,cpp}`** holds *all* logic:
|
|
||||||
- `class llm_graph_input_paged_gather : public llm_graph_input_i` - owns the `I32 [n_gather]`
|
|
||||||
gather-index tensor and a `const llama_kv_cache_context * mctx`. Its `set_input()` fills
|
|
||||||
the index with the sequence's used cells (`{ i in [0,n_kv) : !cells.is_empty(i) }`, the
|
|
||||||
same set the `kq_mask` keeps), in the canonical order.
|
|
||||||
- `paged_attn::gather(ctx0, res, mctx, v_trans, &k, &v, &kq_mask)` - when paged is active,
|
|
||||||
constructs that input via `res->add_input(...)`, and applies `ggml_get_rows` to `k`, `v`,
|
|
||||||
and the transposed `kq_mask` by the shared index (mask: `transpose -> get_rows ->
|
|
||||||
transpose`). When not active it returns immediately -> **stock path byte-identical**.
|
|
||||||
|
|
||||||
**Core hooks (the whole core diff for 0003):**
|
|
||||||
1. `src/llama-graph.cpp`, in `build_attn` right before `build_attn_mha` (~line 2357):
|
|
||||||
```cpp
|
|
||||||
paged_attn::gather(ctx0, res, mctx_cur, v_trans, &k, &v, &kq_mask); // no-op unless LLAMA_KV_PAGED
|
|
||||||
```
|
|
||||||
One line. No new field on `llm_graph_input_attn_kv`; the gather input is a *separate*
|
|
||||||
registered input, so `llama-graph.h` is untouched.
|
|
||||||
2. `src/llama-kv-cache.{h,cpp}`: two thin accessors on `llama_kv_cache_context` so the new
|
|
||||||
file can read the used-cell set without reaching into internals -
|
|
||||||
`uint32_t get_n_gather() const;` and `void get_gather_idxs(int32_t * dst) const;`
|
|
||||||
(delegate to `kv`/`sinfos[i_cur]`, mirroring the existing `get_n_kv` / `set_input_k_idxs`
|
|
||||||
pattern). ~8 lines total, no signature changes to existing methods.
|
|
||||||
3. `src/CMakeLists.txt`: `+ paged-attn.cpp`.
|
|
||||||
|
|
||||||
First cut: gate to **flash-attn + single-stream** (`GGML_ASSERT` otherwise) - the V-transposed
|
|
||||||
(non-FA) and multi-stream gathers are a localized follow-up entirely inside `paged-attn.cpp`,
|
|
||||||
no new core touch. Gate 0 stays the same: `diff` of greedy `llama-simple` output, stock vs
|
|
||||||
`LLAMA_KV_PAGED=1`, must be identical (attention is permutation-invariant over the gathered
|
|
||||||
KV set; `n_gather < n_kv` proves compaction, not identity).
|
|
||||||
|
|
||||||
## Anti-drift practices (already in `README.md`, restated as policy)
|
|
||||||
|
|
||||||
- **Stacking patches, one concern each**, exported 1:1 from a dev branch via
|
|
||||||
`git format-patch`. On a pin bump, rebase the branch; only the conflicting small patch
|
|
||||||
needs a touch, and the failure names the exact step.
|
|
||||||
- **Default-off (`LLAMA_KV_PAGED`)** until each gate is green, so a partial series never
|
|
||||||
changes stock behavior - and the hooks compile to a no-op branch when the env is unset.
|
|
||||||
- **Dev tree:** `git worktree add <dev> <LLAMA_VERSION>` off any checkout that has the pin
|
|
||||||
(e.g. the existing llama.cpp clone), `git apply` the series, develop the next patch as one
|
|
||||||
commit, re-export. (Set up and verified for this pin during this work.)
|
|
||||||
|
|
||||||
## Status / next step
|
|
||||||
|
|
||||||
- 0001, 0002: done, additive, verified token-identical.
|
|
||||||
- 0003: **redesigned to the additive form above** (this doc). Dev tree at the pin with
|
|
||||||
0001+0002 applied is ready (`paged` branch). Remaining work is the focused
|
|
||||||
implement-and-verify block for `paged-attn.{h,cpp}` + the one `build_attn` hook, driven to
|
|
||||||
the token-identical Gate 0. That is a numerical-correctness task (mask/gather alignment,
|
|
||||||
FA-first), not a structural one - the structure is settled here.
|
|
||||||
- 0004-0006: follow the budget above; 0005 lands in LocalAI's `grpc-server.cpp` (no core
|
|
||||||
patch at all).
|
|
||||||
@@ -1,185 +0,0 @@
|
|||||||
# llama-server vs vLLM: decode-step gap decomposition (DGX Spark, GB10 / sm_121)
|
|
||||||
|
|
||||||
Profiling study (no engine changes). Question: matched apples-to-apples (both
|
|
||||||
batched servers, NVFP4-class weights, prefix caching on, both eager), why is
|
|
||||||
`llama-server` ~4-6x slower **per decode step** than vLLM on Qwen3-32B at a
|
|
||||||
1024-token shared-prefix / batch-32 fan-out, and what is closable vs structural.
|
|
||||||
|
|
||||||
Hardware: NVIDIA GB10 (sm_121), unified LPDDR5X. Model: Qwen3-32B, 64 layers.
|
|
||||||
llama side: `~/llama-paged-dev/build-cuda/bin/llama-server`, `q3-32b-nvfp4-dense.gguf`
|
|
||||||
(NVFP4 weights, type-40 FP4-MMA path), `-ngl 99 --parallel 32 -c 40960 -fa on`,
|
|
||||||
`GGML_CUDA_DISABLE_GRAPHS=1` (eager). vLLM 0.23.0 NVFP4A16 (W4A16/Marlin),
|
|
||||||
`--enforce-eager`. Workload: 1024-token shared prefix + unique 32-token suffix,
|
|
||||||
K=32 concurrent, generate 64. All profiling scripts are dev-tree only
|
|
||||||
(`~/bench/decode_study/`); minimal in-code timers were not needed (server already
|
|
||||||
reports per-slot `eval time`, which excludes prompt-eval = pure decode).
|
|
||||||
|
|
||||||
## TL;DR
|
|
||||||
|
|
||||||
1. **The real-server decode is GPU-BOUND, not host-bound.** During steady decode
|
|
||||||
the GPU is **~94.6% utilized** (nvidia-smi, real run) / 85-95% busy (nsys).
|
|
||||||
Per-slot CPU sampling, detokenize, and `update_slots` are fully hidden: a 5-stage
|
|
||||||
sampler chain gives essentially the *same* step time as greedy (1346 vs 1343 ms). The
|
|
||||||
"GPU stalls on the CPU serving loop" hypothesis is **refuted** for this workload.
|
|
||||||
2. **At 1024 context the decode step is ~84% KV/attention, ~16% weight GEMM** - the
|
|
||||||
opposite of the thin-batch-GEMM story. Attention scaling with context length, not
|
|
||||||
the matmul, is the load-bearing cost.
|
|
||||||
3. **The worktree's paged KV engine is a decode REGRESSION: ~1.85x slower than
|
|
||||||
stock** at 1024 ctx (paged 1279-1343 ms/step vs stock 650-729 ms/step). It
|
|
||||||
gathers K/V/mask into a contiguous buffer (`ggml_get_rows`) every layer every
|
|
||||||
step, then runs a dense FA kernel - paying a full extra KV read+copy that vLLM's
|
|
||||||
in-kernel PagedAttention never pays. Paging helps prefix-prefill memory; it hurts
|
|
||||||
decode latency.
|
|
||||||
4. Even **stock** llama-server (~650-729 ms/step) is **~4-5x slower than vLLM**
|
|
||||||
(~120-185 ms/step). The residual gap is the **long-context decode-attention
|
|
||||||
kernel** and, secondarily, the **thin-batch FP4 weight GEMM** - both kernel-maturity
|
|
||||||
gaps vs vLLM's FlashInfer/FA paged-decode + Marlin, not serving-loop gaps.
|
|
||||||
|
|
||||||
## The measured numbers (batch 32, server-reported pure-decode step time)
|
|
||||||
|
|
||||||
`server_decode_step_ms` = max / mean-of-top-8 of per-slot `eval time ms-per-token`
|
|
||||||
(the most-contended, full-batch-32 slots; excludes prompt eval).
|
|
||||||
|
|
||||||
| config | decode step ms (max / top8) | client wall ms/step |
|
|
||||||
|------------------------------------------|-----------------------------|---------------------|
|
|
||||||
| paged, ctx 1024, greedy | 1343 / 1279 | 1468 |
|
|
||||||
| paged, ctx 1024, **heavy 5-sampler** | 1346 / 1280 | 1470 |
|
|
||||||
| **stock** (no paging), ctx 1024, greedy | **729 / 650** | 768 |
|
|
||||||
| paged, **ctx 64** (short), greedy | **215 / 215** | 253 |
|
|
||||||
| vLLM NVFP4A16, ctx 1024 (K=32) | **~120-185** (270 tok/s) | - |
|
|
||||||
|
|
||||||
The brief's reference ~828 ms/step sits between the stock (650-729) and paged
|
|
||||||
(1279-1343) numbers measured here; the decomposition below is what is robust. Our
|
|
||||||
fan-out shares no prefix across the 32 slots (each slot independently prefills 1056
|
|
||||||
tokens - confirmed in the log), so the 32 sequences are genuinely concurrent and the
|
|
||||||
"max" slot is maximally contended, which is why our paged max (1343) runs well above 828.
|
|
||||||
|
|
||||||
### Context sweep - decode step is attention-scaling, not fixed overhead
|
|
||||||
|
|
||||||
Pure-decode step vs shared-prefix length (paged, batch 32):
|
|
||||||
|
|
||||||
| prefix ctx | decode step ms |
|
|
||||||
|-----------|----------------|
|
|
||||||
| 64 | 215 |
|
|
||||||
| 128 | ~290 |
|
|
||||||
| 256 | ~410 |
|
|
||||||
| 512 | ~660 |
|
|
||||||
| 1024 | ~1280 |
|
|
||||||
|
|
||||||
Roughly linear in context length: ~1 ms of added step time per added context token.
|
|
||||||
The **215 ms at ctx 64 is the fixed floor** (weight GEMM + activations + norm/rope +
|
|
||||||
loop + sampling, attention negligible). Everything above it scales with KV length =
|
|
||||||
attention + KV plumbing. At 1024 ctx the fixed floor is only ~16% of the step.
|
|
||||||
|
|
||||||
## Where the ~1280 ms paged decode step goes (nsys, pure-decode window)
|
|
||||||
|
|
||||||
`nsys profile --delay=70 --duration=25 --trace=cuda` windowed onto steady 32-way
|
|
||||||
decode (`srv_decode2.nsys-rep`; an earlier 25-60s window was discarded because nsys's
|
|
||||||
own slowdown stretched the 32 prefills into it, inflating GEMM to a misleading 58%).
|
|
||||||
GPU busy in-window 85.5% (nsys adds gaps; the real run is ~94.6% by nvidia-smi).
|
|
||||||
|
|
||||||
| bucket | % GPU time | abs (of ~1280 ms) | what it is |
|
|
||||||
|--------------------------------|-----------:|------------------:|------------|
|
|
||||||
| `flash_attn_ext_f16` ATTENTION | **47.7%** | ~610 ms | decode attention over the 1056-cell KV |
|
|
||||||
| `cpy_scalar` KV copy/cast | 18.3% | ~234 ms | KV write + f32->f16 casts |
|
|
||||||
| `get_rows/set_rows` KV gather | 17.8% | ~228 ms | **paged** gather of K/V/mask to contiguous |
|
|
||||||
| `mul_mat_q` + `quantize_mmq` | 15.7% | ~201 ms | NVFP4 weight GEMM (+ activation requant) |
|
|
||||||
| rmsnorm / silu / rope / add | ~0.6% | ~8 ms | elementwise |
|
|
||||||
|
|
||||||
Cross-check: the GEMM bucket (~201 ms) matches the ctx-64 floor (215 ms) - i.e. the
|
|
||||||
weight matmul is ~the entire short-context step, and is context-independent, as
|
|
||||||
expected. KV/attention buckets (47.7+18.3+17.8 = **83.8%**) match the context-sweep
|
|
||||||
finding that ~84% of the step scales with context.
|
|
||||||
|
|
||||||
Power signature: ~33-36 W at 94% "utilization" (GB10 can pull far more). High util%
|
|
||||||
+ low power = the kernels are **memory/latency-bound, not compute-saturated** - the
|
|
||||||
classic decode signature (stream 19 GB of NVFP4 weights + a growing KV every step).
|
|
||||||
|
|
||||||
### Stock vs paged decomposition
|
|
||||||
|
|
||||||
- **Stock** (~650 ms): ~215 ms GEMM floor + ~435 ms attention/KV (contiguous KV read
|
|
||||||
directly by the FA kernel, **no gather**).
|
|
||||||
- **Paged** (~1280 ms): same ~215 ms floor + ~610 ms attention + **~455 ms paged
|
|
||||||
gather/copy overhead** (the `get_rows` of K/V/mask plus the extra KV copy that
|
|
||||||
feeds the dense FA kernel). That ~455 ms (~36% of the step) is the paged engine's
|
|
||||||
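self-inflicted cost and is most of the ~1.85x stock->paged regression (~455 of the ~630 ms delta; the remaining ~175 ms is the slower attention bucket, ~610 vs ~435 ms).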
|
|
||||||
|
|
||||||
## vLLM decode architecture mapped onto each llama bucket
|
|
||||||
|
|
||||||
vLLM at ~120-185 ms/step is faster on **every** bucket:
|
|
||||||
|
|
||||||
| llama bucket (paged) | ms | vLLM equivalent | does vLLM avoid it? |
|
|
||||||
|-----------------------------|-------|-----------------|---------------------|
|
|
||||||
| paged KV gather (get_rows) | ~228 | PagedAttention reads blocks **in-kernel** via a block table | **Yes - entirely.** No gather op exists. |
|
|
||||||
| KV copy/cast | ~234 | KV written once into block pool; FA reads it in place | Mostly - no per-step recopy |
|
|
||||||
| decode attention | ~610 | FlashInfer / FA paged-decode GQA kernel, split over KV | Same op, far faster kernel on sm_121 |
|
|
||||||
| weight GEMM + act quant | ~201 | fused Marlin/Machete W4A16 dequant+MMA, no separate quant pass | Faster + removes the requant kernel |
|
|
||||||
| CPU sampling / loop | ~0 (hidden) | on-GPU batched sampling | N/A here - already hidden on llama side too |
|
|
||||||
|
|
||||||
vLLM's whole-step (~150 ms) is **less than llama's GEMM floor alone (~215 ms)**, so
|
|
||||||
vLLM is ahead on the matmul *and* the attention *and* avoids the gather. The gap is a
|
|
||||||
stack of kernel-efficiency wins, not one silver bullet.
|
|
||||||
|
|
||||||
## Ranked levers - closable vs structural
|
|
||||||
|
|
||||||
1. **Remove the paged gather regression. [Tractable, ~455 ms / ~36% on the paged
|
|
||||||
path; net-zero risk - it is a regression]** The worktree's paged engine makes
|
|
||||||
decode 1.85x slower than stock by gathering K/V/mask to contiguous every layer
|
|
||||||
every step (patch 0003 `ggml_get_rows`). For latency-bound decode, **do not enable
|
|
||||||
paged KV** - it only ever helps prefix-prefill *memory*, never decode latency.
|
|
||||||
Fully recovering this *and* keeping paging requires reading paged blocks
|
|
||||||
in-kernel like vLLM (a from-scratch paged-attention CUDA kernel) - see lever 2.
|
|
||||||
|
|
||||||
2. **Long-context decode-attention kernel. [Biggest real lever, ~435 ms of stock /
|
|
||||||
~610 ms of paged; partly structural]** Even stock is attention-bound at 1024 ctx.
|
|
||||||
llama.cpp's `flash_attn_ext_f16` decode path is ~4-5x slower than vLLM's
|
|
||||||
FlashInfer/FA paged-decode GQA kernel on this Blackwell-class part. This is the
|
|
||||||
cost that *grows with context* - exactly the regime the brief targets. Tractable in
|
|
||||||
principle (a proper flash-decoding / split-K-over-KV kernel, and a true in-kernel
|
|
||||||
paged read that also kills lever 1's gather), but it is deep CUDA work on a new
|
|
||||||
arch and partly gated by kernel maturity on sm_121. **Highest-impact, hardest.**
|
|
||||||
|
|
||||||
3. **Thin-batch FP4 weight GEMM floor. [Tractable, ~201-215 ms / 15-30%; bounded]**
|
|
||||||
The NVFP4 `mul_mat_q` + separate `quantize_mmq` activation pass is memory-bound and
|
|
||||||
less efficient than vLLM's fused Marlin/Machete W4A16. Fusing dequant into the MMA
|
|
||||||
and folding the activation quant into the GEMM is tractable kernel work. Bounded
|
|
||||||
impact: the floor cannot drop below weight-read-bound (~19 GB / memory BW per step; GB10 is unified LPDDR5X, not HBM).
|
|
||||||
|
|
||||||
4. **Host serving loop / per-slot sampling. [NOT a lever]** Measured zero: greedy ==
|
|
||||||
heavy-sampler step time; GPU 94.6% busy. On-GPU/batched sampling buys nothing until
|
|
||||||
the kernels (levers 1-3) get fast enough to expose host overhead. Refutes the
|
|
||||||
"host-bound serving loop" hypothesis for this decode-bound workload.
|
|
||||||
|
|
||||||
5. **Continuous-batch scheduler. [NOT the gap / structural elsewhere]** llama-server
|
|
||||||
already fuses all 32 slots into one decode step (one set of kernels per step over
|
|
||||||
batch 32 - confirmed in the trace). vLLM's continuous/chunked-prefill batching wins
|
|
||||||
on *mixed* prefill+decode overlap, but the steady decode-step gap measured here is
|
|
||||||
kernel-bound, not scheduler-bound.
|
|
||||||
|
|
||||||
## Honest bottom line
|
|
||||||
|
|
||||||
The ~4-6x per-step gap is **GPU-kernel-bound**, and it decomposes as:
|
|
||||||
|
|
||||||
- ~36% of the *paged* step is a **self-inflicted gather regression** - remove it
|
|
||||||
(don't run paged for decode-latency workloads).
|
|
||||||
- The remaining ~4-5x vs vLLM (true even for stock) is **kernel efficiency**:
|
|
||||||
llama.cpp's long-context decode-attention and thin-batch FP4 GEMM are slower than
|
|
||||||
vLLM's PagedAttention + Marlin on GB10. That is a **kernel project** (in-kernel
|
|
||||||
paged attention + flash-decoding + fused W4A16 GEMM), not a serving-loop project.
|
|
||||||
- Sampling, detokenize, `update_slots`, and the continuous-batch scheduler are **not**
|
|
||||||
the gap; the GPU is ~95% busy on memory-bound kernels the whole step.
|
|
||||||
|
|
||||||
What is closable: lever 1 (immediately, by not paging), lever 3 (bounded, with kernel
|
|
||||||
work). What is structural / hard: lever 2 (the decode-attention kernel + a real
|
|
||||||
in-kernel paged read), which is where the context-scaling gap actually lives and where
|
|
||||||
any serious effort to approach vLLM on GB10 must go.
|
|
||||||
|
|
||||||
## Reproduction (dev-tree only, `~/bench/decode_study/`)
|
|
||||||
|
|
||||||
- `launch_srv.sh` / `runcfg.sh` - launch llama-server (paged on/off) and a config.
|
|
||||||
- `client.py` - K=32 token-id fan-out (1024 prefix + 32 suffix), `SAMP=greedy|heavy`.
|
|
||||||
- `d2drv.sh` - nsys pure-decode window (delay 70s past prefill) -> `srv_decode2.nsys-rep`.
|
|
||||||
- `cat2.py` - kernel-time categorization from the sqlite export.
|
|
||||||
- vLLM side: `~/bench/run_vllm.sh` + `vllm_prefix.py` (K=32, ~270 tok/s).
|
|
||||||
</content>
|
|
||||||
</invoke>
|
|
||||||
@@ -1,143 +0,0 @@
|
|||||||
# Patch 0015 findings: expert-density-aware MoE token-tile auto-select
|
|
||||||
|
|
||||||
The durable follow-up to patch 0014 (`MOE_TOKEN_TILE_CAP.md`): replace the blunt,
|
|
||||||
opt-in `LLAMA_MOE_MMQ_X` global cap with a host-side, **default-on** density-aware
|
|
||||||
`mmq_x` auto-select in `mul_mat_q_case`. Companion to
|
|
||||||
`0015-paged-expert-density-aware-moe-token-tile-auto-select.patch`. Dev tree
|
|
||||||
`~/llama-paged-dev` (branch `paged`), `build-cuda` sm_121.
|
|
||||||
|
|
||||||
Primary model: **Qwen3.6-35B-A3B NVFP4** (`~/bench/q36-35b-a3b-nvfp4.gguf`),
|
|
||||||
**256 experts, top-8**, expert FFN 512, GDN linear attention (SSM inner 4096),
|
|
||||||
41 layers. This is a different beast from 0014's Qwen3-Coder-30B-A3B (128 experts,
|
|
||||||
larger expert FFN, standard attention).
|
|
||||||
|
|
||||||
## What it does (vs 0014)
|
|
||||||
|
|
||||||
`mul_mat_q_case` picks the token-tile width `mmq_x` to cover `ncols_max` (= `ne12`,
|
|
||||||
the per-expert column upper bound = token count) in one column-tile, i.e. stock
|
|
||||||
**maximizes** the tile (128 on Blackwell). Applied per expert at MoE decode, where
|
|
||||||
per-expert density is tiny, that 128-wide tile is mostly padding.
|
|
||||||
|
|
||||||
Patch 0014 capped `mmq_x` globally on the ids path via `LLAMA_MOE_MMQ_X` (decode
|
|
||||||
**and** prefill), which cost ~1.3% prefill. Patch 0015 instead estimates the
|
|
||||||
per-expert density host-side, from args the ids path already passes:
|
|
||||||
|
|
||||||
```
|
|
||||||
ne_get_rows = ncols_dst = ne12 * n_expert_used (token-expert assignments)
|
|
||||||
n_experts = nchannels_x = ne02
|
|
||||||
density = ceil(ne_get_rows / min(ne_get_rows, n_experts)) (tokens/expert)
|
|
||||||
```
|
|
||||||
|
|
||||||
and caps to the small tile (default 64) **only when `density <= density_max`**, so
|
|
||||||
the high-density prefill ubatch keeps the big 128 tile. Prefill-safe by construction.
|
|
||||||
No new kernel: the selection only lowers the loop's upper bound to an
|
|
||||||
already-compiled, granularity- and shared-memory-validated `mmq_x`.
|
|
||||||
|
|
||||||
## The threshold matters: `density_max = 8`, not `tile/4 = 16`
|
|
||||||
|
|
||||||
The cap must fire for decode but not for a prefill ubatch. Each has per-expert
|
|
||||||
density `n_tokens * n_used / n_experts`. At the standard `n_ubatch=512`, `n_used=8`:
|
|
||||||
|
|
||||||
```
|
|
||||||
128 experts 256 experts
|
|
||||||
prefill ubatch (512) 32 16
|
|
||||||
decode npl128 (128) 8 4
|
|
||||||
```
|
|
||||||
|
|
||||||
`tile/4 = 16` (0014's first auto-select draft default) **equals the 256-expert
|
|
||||||
prefill density** and caps prefill: measured -2.0% to -2.9% S_PP on q36-35b-a3b.
|
|
||||||
`density_max = 8` separates npl128 decode (density <= 8) from prefill (density >= 9) for every `n_experts` in
|
|
||||||
`[128, 511]`, so it caps decode and leaves prefill on the big tile. This single
|
|
||||||
default change is what makes the patch prefill-safe on the 256-expert model.
|
|
||||||
|
|
||||||
## Measurements (default-on vs stock, median of 5 reps)
|
|
||||||
|
|
||||||
`llama-batched-bench`, q36-35b-a3b-nvfp4.gguf, `-fa on -npp 128 -ntg 128`, GB10
|
|
||||||
sm_121. STOCK = `LLAMA_MOE_AUTO_TILE=0` (exact stock selection); 0015 = default.
|
|
||||||
|
|
||||||
```
|
|
||||||
npl S_TG stock S_TG 0015 dTG% S_PP stock S_PP 0015 dPP%
|
|
||||||
8 183.59 183.18 -0.22% 1489.2 1500.1 +0.73%
|
|
||||||
32 264.02 263.44 -0.22% 2034.5 2033.5 -0.05%
|
|
||||||
64 311.76 310.41 -0.43% 2028.3 2027.6 -0.03%
|
|
||||||
128 336.10 337.32 +0.36% 2025.0 2027.7 +0.13%
|
|
||||||
```
|
|
||||||
|
|
||||||
Raw npl128 reps: S_TG 0015 `[337.3, 336.9, 336.4, 338.9, 338.1]` vs stock
|
|
||||||
`[336.2, 336.1, 335.9, 336.9, 335.8]` (distributions overlap); S_PP 0015
|
|
||||||
`[2028.6, 2023.0, 2024.9, 2028.0, 2027.7]` vs stock `[2024.9, 2025.0, 2023.2,
|
|
||||||
2029.4, 2029.0]`.
|
|
||||||
|
|
||||||
### Honest read: neutral on this model
|
|
||||||
|
|
||||||
On q36-35b-a3b the decode delta is **within run-to-run noise** (npl128 +0.36%,
|
|
||||||
npl<=64 slightly negative) and prefill is **neutral** (within +/-0.7%, well inside
|
|
||||||
the 1% target). The `+5%` decode target from the localmaxxing reference does **not**
|
|
||||||
materialize here. q36-35b-a3b decode is bound by the GDN/SSM recurrence and
|
|
||||||
256-tiny-expert weight bandwidth, not the MoE col-tile occupancy, so the col-tile
|
|
||||||
lever has nothing to bite on.
|
|
||||||
|
|
||||||
### npl128 decode tile sweep confirms 64 is the only useful width
|
|
||||||
|
|
||||||
`density_max=8` fixed, varying `LLAMA_MOE_DECODE_TILE`, S_TG @ npl128 vs stock:
|
|
||||||
|
|
||||||
```
|
|
||||||
TILE8 TILE16 TILE32 TILE64 TILE96
|
|
||||||
-6.31% -3.18% -0.17% +0.70% -0.76%
|
|
||||||
```
|
|
||||||
|
|
||||||
Smaller tiles are **worse**, not better: more column-tiles per expert = more
|
|
||||||
grid/scheduling overhead, and the FP4-MMA has a minimum efficient width. So matching
|
|
||||||
the tile to the literal density (4) is counterproductive; 64 is the sweet spot,
|
|
||||||
same as 0014.
|
|
||||||
|
|
||||||
## Why ship it default-on anyway
|
|
||||||
|
|
||||||
1. **Removes 0014's prefill cost by construction.** The cap is density-gated, not
|
|
||||||
global, so prefill keeps its 128 tile (S_PP neutral above).
|
|
||||||
2. **Banks the col-tile-bound gain for free.** At npl128 the auto-select picks
|
|
||||||
`tile=64` for a 128-expert model (decode density 8 <= 8), i.e. exactly 0014's
|
|
||||||
`cap64`, so it reproduces 0014's **+4.8% @npl128 on Qwen3-Coder-30B** without the
|
|
||||||
-1.3% prefill cost. (That model was unavailable to re-bench here; the tile choice
|
|
||||||
is identical by construction.)
|
|
||||||
3. **Prefill-safe and decode-neutral on the SSM model**, so it is harmless where it
|
|
||||||
does not help.
|
|
||||||
4. **Correctness-gated** by the P0 harness (below).
|
|
||||||
|
|
||||||
## Conservative by design (known limitation)
|
|
||||||
|
|
||||||
A pure-density gate cannot separate two cases with the **same** per-expert density:
|
|
||||||
Qwen3-Coder npl256 decode (density 16) and the 256-expert prefill ubatch (density
|
|
||||||
16) are identical to the estimator. `density_max=8` therefore **forgoes 0014's
|
|
||||||
+2.3% @npl256** on the 128-expert model to keep 256-expert prefill safe. Recovering
|
|
||||||
it needs an `ne12`-aware (absolute token count) gate in addition to density; scoped
|
|
||||||
as future work, not implemented.
|
|
||||||
|
|
||||||
## Knobs
|
|
||||||
|
|
||||||
- `LLAMA_MOE_AUTO_TILE=0` : disable the auto-select, exact stock `mmq_x` selection.
|
|
||||||
- `LLAMA_MOE_MMQ_X=<n>` (patch 0014) : **kept** as a manual override; when > 0 it
|
|
||||||
forces the old blunt global cap and bypasses the auto-select (explicit A/B knob).
|
|
||||||
- `LLAMA_MOE_DECODE_TILE=<n>` : the small tile (default 64).
|
|
||||||
- `LLAMA_MOE_DENSITY_MAX=<n>` : the density ceiling (default 8).
|
|
||||||
|
|
||||||
## P0 correctness gate
|
|
||||||
|
|
||||||
`tests/test-backend-ops` `test_mul_mat_id` is extended with a ragged small-M
|
|
||||||
NVFP4/MXFP4 MoE decode-density block: 128 experts, top-8, m=768, k=2048, n in
|
|
||||||
`{16,33,64,128,130,200,256,512}` spanning the cap boundary (n>=130 keeps the 128
|
|
||||||
tile at `density_max=8`, n<=128 takes tile 64) and ragged token counts (experts with
|
|
||||||
0/1/2 tokens, n not a multiple of the tile). All 16 shapes pass the CUDA-vs-CPU
|
|
||||||
oracle on GB10 both default-on and with `LLAMA_MOE_AUTO_TILE=0`; full `MUL_MAT_ID`
|
|
||||||
suite 2/2 backends OK. Off the ids path nothing changes (non-MoE `mul_mat`
|
|
||||||
byte-identical to stock).
|
|
||||||
|
|
||||||
## Verdict
|
|
||||||
|
|
||||||
- Correct, prefill-safe, default-on density-aware tile select; the durable design
|
|
||||||
0014's own doc scoped. Supersedes 0014's global cap as the default path; the
|
|
||||||
`LLAMA_MOE_MMQ_X` knob is retained as a manual override.
|
|
||||||
- **Net effect on q36-35b-a3b NVFP4: neutral** (decode within noise, prefill neutral)
|
|
||||||
because the model is SSM/bandwidth-bound, not col-tile-bound. The lever's real win
|
|
||||||
lives on col-tile-bound MoE (Qwen3-Coder-30B, +4.8% @npl128), banked here at zero
|
|
||||||
prefill cost.
|
|
||||||
@@ -1,220 +0,0 @@
|
|||||||
# Durable scope: grouped FP4-MMA MoE GEMM for ggml CUDA on GB10 (sm_121)
|
|
||||||
|
|
||||||
Build-ready plan. **Not implemented in this workflow** (large kernel work). This
|
|
||||||
document scopes the durable path to match or beat vLLM MoE grouped-GEMM efficiency
|
|
||||||
on GB10 for the Qwen3-30B-A3B-class mxfp4 MoE, and records the single honest
|
|
||||||
finding that re-shapes the whole effort.
|
|
||||||
|
|
||||||
Hardware: NVIDIA GB10 (sm_121, CC=1210 = `GGML_CUDA_CC_DGX_SPARK`), unified
|
|
||||||
LPDDR5X ~273 GB/s. Model: Qwen3-Coder-30B-A3B, 128 experts, top-8, mxfp4 experts
|
|
||||||
(`~/bench/qwen3coder-mxfp4.gguf`). Dev tree `~/llama-paged-dev` (branch `paged`,
|
|
||||||
HEAD at patch 0013), `build-cuda` sm_121.
|
|
||||||
|
|
||||||
## TL;DR (the honest reframe)
|
|
||||||
|
|
||||||
**The grouped GEMM the mission scoped to build from scratch already exists in
|
|
||||||
upstream ggml, and it already runs on GB10 for mxfp4.** For mxfp4 experts on
|
|
||||||
sm_121 `ggml_cuda_should_use_mmq()` returns true (`turing_mma_available`), so
|
|
||||||
MUL_MAT_ID takes the **grouped mmq path**, which already contains both vLLM
|
|
||||||
building blocks:
|
|
||||||
|
|
||||||
1. a moe_align / token-sort-by-expert (`mmid.cu` `mm_ids_helper`:
|
|
||||||
count -> warp-scan/cumsum -> scatter into expert-sorted contiguous buffers),
|
|
||||||
2. a **single persistent stream-k grouped FP4-MMA GEMM** (one `mul_mat_q` launch;
|
|
||||||
grid flattened into kbc-continuous space over expert x col-tile x row-tile x
|
|
||||||
k-block; native FP4 MMA via `block_fp4_mmq` under `BLACKWELL_MMA_AVAILABLE`).
|
|
||||||
|
|
||||||
The per-expert host-side row-gather loop in `ggml-cuda.cu`
|
|
||||||
`ggml_cuda_mul_mat_id()` (~L2632-2790) - the path the mission's root-cause
|
|
||||||
analysis describes as "the cliff" - is a **fallback only reached when
|
|
||||||
`should_use_mmq()==false`** (f16/bf16 experts, non-Blackwell). It is **never the
|
|
||||||
GB10 mxfp4 path.**
|
|
||||||
|
|
||||||
Consequence: the "npl128 MoE cliff" does not exist on the current dev HEAD.
|
|
||||||
Re-measured batched-bench decode (`S_TG` t/s) on the mxfp4 MoE rises monotonically
|
|
||||||
`85 / 278 / 637 / 950 / 1306 / 1771` at npl `1 / 8 / 32 / 64 / 128 / 256`. The
|
|
||||||
original `253/505/830/620` cliff was a real high-batch regression that has since
|
|
||||||
been **fixed upstream** (FP4-native grouped mmq + MoE stream-k balancing), not a
|
|
||||||
batched-bench artifact.
|
|
||||||
|
|
||||||
**Therefore the durable work is NOT "port moe_align + a grouped GEMM."** It is a
|
|
||||||
**surgical fix to the one place ggml diverges from vLLM: the M-tile (token-tile)
|
|
||||||
sizing heuristic.** This document scopes that delta, plus the optional
|
|
||||||
block-padded align, plus the parity gate and phased plan. It also records what is
|
|
||||||
intentionally NOT built and why (the W4A16 occupancy wall).
|
|
||||||
|
|
||||||
## The one structural gap: M-tile sizing
|
|
||||||
|
|
||||||
`mul_mat_q_case` / `launch_mul_mat_q` pick `mmq_x` (the token/M tile) by
|
|
||||||
**minimizing** `ntiles_x = ceil(ncols_max / mmq_x)` over the **aggregate** token
|
|
||||||
count (`ncols_max = ne12`). On Blackwell `get_mmq_x_max = 128`, so the heuristic
|
|
||||||
always selects the **largest** `mmq_x` that fits shared memory. vLLM's
|
|
||||||
CUTLASS/Triton fused_moe does the **opposite**: a small tuned `BLOCK_SIZE_M`
|
|
||||||
(typ. 16/32/64), padded **per expert**.
|
|
||||||
|
|
||||||
ggml then applies its over-large `mmq_x` **per expert**. In MoE decode the number of tokens
|
|
||||||
per expert is tiny - Qwen3-30B-A3B top-8 of 128: at npl64 ~512 assignments over
|
|
||||||
~126 activated experts ~= 4 tok/expert; at npl128 ~1024 over ~128 ~= 8 tok/expert.
|
|
||||||
So each expert's single M-tile of width 128 is **3-6% filled** -> ragged tiny-M
|
|
||||||
tiles run a dense-GEMM-tuned config, wasting MMA M-throughput, and (with
|
|
||||||
`need_check`) every expert runs as a masked partial tail.
|
|
||||||
|
|
||||||
The FP4 MMA N-fragment (`tile_C::J`) is 8, so the **ideal M-tile ~= tokens/expert
|
|
||||||
(~8)**, 16x smaller than the 128 ggml picks. This mismatch is the durable gap.
|
|
||||||
|
|
||||||
Critically for GB10: at tokens/expert <= 8 there is exactly **one col-tile per
|
|
||||||
expert**, so a smaller `mmq_x` causes **no extra weight re-read** (weight rows are
|
|
||||||
re-read only across multiple col-tiles, of which there is one) while it **lowers
|
|
||||||
shared-mem footprint and raises occupancy** - strictly aligned with the GB10
|
|
||||||
occupancy lessons.
|
|
||||||
|
|
||||||
## What already exists (reuse, do NOT rebuild)
|
|
||||||
|
|
||||||
Engine files on DGX `~/llama-paged-dev/ggml/src/ggml-cuda/`:
|
|
||||||
|
|
||||||
- **[A] moe_align / scatter** = `mmid.cu` `mm_ids_helper`. One CUDA block per
|
|
||||||
expert (`gridDim.x = n_experts`); warp counts tokens routed to this expert,
|
|
||||||
warp-scan for the compaction index, scatters into `ids_src1` (column gather
|
|
||||||
permutation, expert-sorted contiguous), `ids_dst` (output scatter), and writes
|
|
||||||
`expert_bounds[expert] = prefix start`, `expert_bounds[n_experts] = total`.
|
|
||||||
This **is** count -> cumsum -> permute; `expert_bounds` is the analogue of
|
|
||||||
vLLM's `num_tokens_post_padded` boundaries. No `-1` pad today because segments
|
|
||||||
are exact (not block-padded).
|
|
||||||
- **[B] persistent grouped FP4 GEMM** = `mmq.cuh` `mul_mat_q` stream-k
|
|
||||||
(kernel ~L3542, `process_tile` ~L3447, launch ~L3943, case-select ~L4055).
|
|
||||||
Single launch, fixed grid (`nsm` CTAs, or `ntiles` when >=90% tile efficiency).
|
|
||||||
Each CTA walks a contiguous `kbc` slice of (expert `zt` via `expert_bounds`,
|
|
||||||
col-tile `jt`, row-tile `it`, k-block) space; the weight row-tile (`mmq_y=128`
|
|
||||||
x K) is loaded once per col-tile in the `process_tile` k-loop; empty col-tiles
|
|
||||||
past `col_diff` are SKIPPED by advancing `kbc += blocks_per_ne00`; a
|
|
||||||
`stream_k_fixup` pass recombines split tiles.
|
|
||||||
- **[C] native FP4-MMA expert weights** = `block_fp4_mmq` + `MMQ_MMA_TILE_X_K_FP4`
|
|
||||||
(== Q8_1 tile, skew-pad +4) under `BLACKWELL_MMA_AVAILABLE`;
|
|
||||||
`quantize_mmq_fp4_cuda` quantizes activations to the q8-style y-layout **with
|
|
||||||
the `ids_src1` gather fused** (one pass, no separate row-copy).
|
|
||||||
|
|
||||||
Dispatch seam: `ggml-cuda.cu` `ggml_cuda_mul_mat_id()` (~L2632-2790). For mxfp4
|
|
||||||
with `ne2`(tokens) > 7, `should_use_mmq()` -> true -> `ggml_cuda_mul_mat_q()`
|
|
||||||
(`mmq.cu` id-branch ~L162-225) -> `mm_ids_helper` then ONE
|
|
||||||
`mul_mat_q_switch_type`. The per-expert host loop below it is the gated fallback.
|
|
||||||
|
|
||||||
(Below npl8, MXFP4 mmid routes through `mmvq` - `MMVQ_MAX_BATCH_SIZE=8`, mmid max
|
|
||||||
7 for turing_plus - which is fine for thin batch and out of scope here.)
|
|
||||||
|
|
||||||
## What to add (the durable delta, priority order)
|
|
||||||
|
|
||||||
### [1] Expert-aware M-tile selection (host-side only, zero new kernel)
|
|
||||||
|
|
||||||
In `mul_mat_q_case` / `launch_mul_mat_q`, when `ids != null`, choose `mmq_x` from
|
|
||||||
**per-expert density** (~`ne_get_rows / n_active_experts`, derivable cheaply, or
|
|
||||||
capped via env) instead of minimizing `ntiles` over aggregate `ncols_max`.
|
|
||||||
|
|
||||||
- `mmq_x` is a **compile-time template** (switch 8..128 step 8), so this is a pure
|
|
||||||
host-side SELECTION change - it picks a different already-compiled instantiation.
|
|
||||||
**Zero new kernel. Very low risk, high leverage.** Matches vLLM `BLOCK_SIZE_M`.
|
|
||||||
- Doubles as near-term lever-1: env-gated `LLAMA_MOE_MMQ_X` cap at the knee.
|
|
||||||
- GB10-aligned: smaller `mmq_x` -> smaller shared mem -> higher occupancy, and at
|
|
||||||
tokens/expert <= 8 (one col-tile/expert) it costs no extra weight read.
|
|
||||||
|
|
||||||
This is the single highest-leverage change and the seed of the durable port.
|
|
||||||
|
|
||||||
### [2] Block-padded moe_align (the moe_align_block_size port proper)
|
|
||||||
|
|
||||||
Extend `mm_ids_helper` to pad each expert segment up to a multiple of the chosen
|
|
||||||
block: write a sentinel (`-1`) `ids_dst` for pad lanes, put `expert_bounds` on
|
|
||||||
block boundaries. Then every col-tile is **full**, which:
|
|
||||||
|
|
||||||
- drops the `need_check` masking + per-expert partial-tail MMA,
|
|
||||||
- makes the stream-k `kbc` space exact (no skipped tiles, cleaner persistent
|
|
||||||
schedule), removing the `col_diff` skip branch.
|
|
||||||
|
|
||||||
Medium risk: touches the scatter, the `col_diff`/`need_check` logic, and the
|
|
||||||
`write_back` masking (pad rows must not write output). This is the proper
|
|
||||||
`moe_align_block_size` analogue and the durable second step.
|
|
||||||
|
|
||||||
### [3] Bespoke masked-grouped FP4 kernel - ONLY if [1]+[2] insufficient
|
|
||||||
|
|
||||||
A CUTLASS/DeepGEMM-style masked-grouped FP4 kernel. **Largest risk, likely
|
|
||||||
unnecessary** given [B] is already a persistent stream-k grouped GEMM. Listed for
|
|
||||||
completeness; do not start without [1]+[2] measured as insufficient.
|
|
||||||
|
|
||||||
## Integration into ggml_mul_mat_id (dispatch seam + gated fallback)
|
|
||||||
|
|
||||||
- The seam is unchanged: `ggml_cuda_mul_mat_id()` -> `should_use_mmq()` ->
|
|
||||||
`ggml_cuda_mul_mat_q()`. [1] and [2] live entirely inside the mmq id-branch
|
|
||||||
(`mmq.cu` ~L162-225) and its callees (`mmq.cuh` selection/launch, `mmid.cu`
|
|
||||||
scatter). No change to the host dispatch decision.
|
|
||||||
- **Gated fallback preserved**: the existing per-expert host loop
|
|
||||||
(`should_use_mmq()==false` path) stays as-is for f16/bf16 experts and
|
|
||||||
non-Blackwell GPUs. The new selection only fires on the grouped path.
|
|
||||||
- **Env gates** (off = exact current behavior):
|
|
||||||
- `LLAMA_MOE_MMQ_X=<8..128>` - cap/override the token tile for the id-path
|
|
||||||
(lever-1 + [1] manual knob).
|
|
||||||
- `LLAMA_MOE_BLOCK_ALIGN=0|1` - enable block-padded scatter ([2]).
|
|
||||||
Default both off until parity + throughput proven, then flip [1]'s
|
|
||||||
auto-selection on by default.
|
|
||||||
|
|
||||||
## Correctness / parity gate
|
|
||||||
|
|
||||||
Primary: `tests/test-backend-ops.cpp` `test_mul_mat_id` (~L4181). The CPU
|
|
||||||
reference is **deterministic** - the op test must be **bit-exact**.
|
|
||||||
|
|
||||||
- Sweep `type_a` in {`MXFP4`, `NVFP4`}, `type_b = F32`, `n_mats = 128`,
|
|
||||||
`n_expert_used = 8`, `n_tokens` in {8, 32, 64, 128} (the decode-density band).
|
|
||||||
- **Add ragged small-M shapes** to the harness if absent (n_tokens not a multiple
|
|
||||||
of mmq_x; experts with 0/1/2 tokens) - these are exactly where [1]/[2] change
|
|
||||||
tile geometry and where block-pad masking can leak.
|
|
||||||
- Pass criterion: new `mmq_x` selection and padded-align produce dst **identical**
|
|
||||||
to current op-test output (op test is exact; the GB10 CUDA greedy-decode
|
|
||||||
non-determinism band applies only to end-to-end, never to the op test).
|
|
||||||
- End-to-end sanity: `llama-batched-bench` on `~/bench/qwen3coder-mxfp4.gguf`,
|
|
||||||
`-fa on -npp 128 -ntg 128`, npl 8/32/64/128/256; confirm `S_TG` stays monotonic
|
|
||||||
and `S_PP` flat ~3050-3090. Verify greedy-decode output within the documented
|
|
||||||
CUDA batch-shape non-determinism band (CPU is the deterministic oracle).
|
|
||||||
|
|
||||||
Bench/parity scripts stay **dev-tree-only** (`~/llama-paged-dev/benches/`).
|
|
||||||
|
|
||||||
## Phased plan, expected payoff, risk per phase
|
|
||||||
|
|
||||||
| Phase | Work | Expected payoff | Risk |
|
|
||||||
|-------|------|-----------------|------|
|
|
||||||
| **P0** harness | Add ragged small-M + MXFP4/NVFP4 mmid shapes to `test_mul_mat_id`; capture current bit-exact baseline + the monotonic batched-bench curve as the reference. | None (gate). Locks correctness + the 85->1771 t/s baseline so any regression is caught. | Low. |
|
|
||||||
| **P1** sort op | Confirm `mm_ids_helper` is the moe_align; if [2] is pursued, prototype the block-pad scatter behind `LLAMA_MOE_BLOCK_ALIGN`. | Enables exact stream-k schedule; removes `need_check` masking (P3 payoff). | Medium (scatter + write-back masking). |
|
|
||||||
| **P2** grouped GEMM ([1]) | Expert-aware `mmq_x` selection in `mul_mat_q_case`/launch, `LLAMA_MOE_MMQ_X` gate. | The headline: reclaim the 3-6% M-tile fill waste at npl64-128. Modeled as removing wasted MMA M-throughput on every activated expert; net throughput up at high batch with no extra weight read. | **Low** (host-side template selection, no new kernel). |
|
|
||||||
| **P3** tune ([2] + fixup) | Land block-padded align; tune `mmq_x` per density, profile stream-k `fixup` overhead and `mmq_x`/`mmq_y` tile choice with nsys on the grouped `mul_mat_q<MXFP4>` kernel. | Remove per-expert partial-tail MMA; tighten the persistent schedule. Diminishing vs P2; this is pure micro-efficiency toward/past vLLM's saturated grouped-GEMM. | Medium-high (kernel masking paths). |
|
|
||||||
|
|
||||||
**Honest payoff framing:** the npl128 "cliff" is already gone on HEAD, so there is
|
|
||||||
no broken path to unlock. The durable win is **matching vLLM's saturated
|
|
||||||
grouped-GEMM M-tiling** (small per-expert block) and erasing the dense-GEMM-tuned
|
|
||||||
M-tile mismatch - a micro-efficiency gain at large effective batch, not a
|
|
||||||
step-change. vLLM 0.23.0 cannot even serve this model on GB10 (bf16 MoE-warmup
|
|
||||||
hang + hard reboot; GGUF loader can't map fused qwen3moe experts), and llama
|
|
||||||
already uses the same sorted-grouped-GEMM algorithm, so structural parity is
|
|
||||||
**already met**; this closes the residual kernel micro-gap.
|
|
||||||
|
|
||||||
## The biggest risk: the GB10 W4A16 occupancy wall
|
|
||||||
|
|
||||||
The dominant risk is **repeating the W4A16 dead-end** that hit only ~9 TFLOPS /
|
|
||||||
178 t/s on GB10. GB10 is **occupancy-dominated**: deep `cp.async` pipelines and
|
|
||||||
XOR-swizzle shared layouts **collapse occupancy** there. Any P3 kernel work MUST:
|
|
||||||
|
|
||||||
- keep **small shared mem + high occupancy** (do NOT add deep `cp.async` stages
|
|
||||||
or XOR-swizzle - they are exactly what killed W4A16);
|
|
||||||
- preserve the **skew-pad (+4)** tile layout already in `MMQ_MMA_TILE_X_K_FP4`;
|
|
||||||
- stay on the **FP4-MMA path** (`block_fp4_mmq`), the only path that hits Blackwell
|
|
||||||
FP4 = 2x INT8/BF16 rate;
|
|
||||||
- respect the ~273 GB/s LPDDR5X weight-read floor (dense decode is already at it;
|
|
||||||
MoE wins come from occupancy/tile fit, not bandwidth).
|
|
||||||
|
|
||||||
Smaller `mmq_x` ([1]) is **strictly consistent** with these lessons: it reduces
|
|
||||||
shared-mem footprint, raises occupancy, and at tokens/expert <= 8 adds no weight
|
|
||||||
re-read. So the low-risk lever ([1]) is also the one most aligned with what GB10
|
|
||||||
rewards - which is why it leads the plan and [3] is gated behind it.
|
|
||||||
|
|
||||||
## Commit / hygiene
|
|
||||||
|
|
||||||
Scope doc only (this file). No engine change committed in this workflow. Bench and
|
|
||||||
parity scripts are dev-tree-only. Commit with `git commit -s`, trailer
|
|
||||||
`Assisted-by: Claude:opus-4.8 [Claude Code]`, no `Co-Authored-By`, no em-dashes.
|
|
||||||
Do not push (human pushes). When [1]/[2] are implemented they mirror to
|
|
||||||
`backend/cpp/llama-cpp/patches/paged/0014-*` (next free slot).
|
|
||||||
@@ -1,99 +0,0 @@
|
|||||||
# Patch 0014 findings: expert-aware MoE token-tile cap (LLAMA_MOE_MMQ_X)
|
|
||||||
|
|
||||||
Near-term lever for the MoE-vs-vLLM workflow on GB10 (sm_121). Companion to
|
|
||||||
`0014-paged-expert-aware-moe-token-tile-cap.patch`. Model:
|
|
||||||
Qwen3-Coder-30B-A3B, 128 experts, top-8, mxfp4 experts
|
|
||||||
(`~/bench/qwen3coder-mxfp4.gguf`). Dev tree `~/llama-paged-dev` (branch `paged`),
|
|
||||||
`build-cuda` sm_121.
|
|
||||||
|
|
||||||
## Headline (honest): there is no npl128 cliff to erase on this build
|
|
||||||
|
|
||||||
The mission premise was a 25% decode drop at npl128 (batched-bench 253/505/830/620
|
|
||||||
@ npl 8/32/64/128). It does **not** reproduce. Stock decode is monotonic:
|
|
||||||
|
|
||||||
```
|
|
||||||
llama-batched-bench, qwen3coder-mxfp4.gguf, -fa on, -npp 128 -ntg 128, S_TG t/s
|
|
||||||
npl 1 8 32 64 128 256
|
|
||||||
stock 85 282 629 935 1295 1779 <- monotonic, no knee
|
|
||||||
```
|
|
||||||
|
|
||||||
The old cliff was a real high-batch regression since fixed upstream: mxfp4 MoE
|
|
||||||
decode on GB10 already takes the sorted grouped FP4-MMA GEMM (MUL_MAT_ID ->
|
|
||||||
`ggml_cuda_mul_mat_q` ids branch: `mm_ids_helper` moe_align/scatter + one
|
|
||||||
persistent stream-k `mul_mat_q`), i.e. vLLM's algorithm. See
|
|
||||||
`MOE_GROUPED_GEMM_SCOPE.md`.
|
|
||||||
|
|
||||||
## What the knob does
|
|
||||||
|
|
||||||
`mul_mat_q_case` picks the token-tile width `mmq_x` to cover `ncols_max`
|
|
||||||
(= `ne12`, the per-expert column upper bound = token count, up to 128) in one
|
|
||||||
column-tile. At MoE decode the per-expert density is `~ne12*k/n_experts`
|
|
||||||
(top-8/128 => ~1/16 of `ne12`), so each expert's `mmq_x`-wide col-tile is only
|
|
||||||
~6% filled: the MMA accumulator tile is `mmq_x`-wide at compile time and wastes
|
|
||||||
throughput on the padding columns, and the larger y-tile lowers occupancy.
|
|
||||||
|
|
||||||
`LLAMA_MOE_MMQ_X=<n>` caps `mmq_x` on the MUL_MAT_ID path only
|
|
||||||
(`expert_bounds != nullptr`). It only lowers the selection-loop upper bound and
|
|
||||||
still chooses from the same granularity/shared-memory-validated `mmq_x` set stock
|
|
||||||
already uses for smaller batches - no new kernel configuration. Default
|
|
||||||
(unset/<=0) = disabled => byte-identical to stock.
|
|
||||||
|
|
||||||
## Measurements (same binary, only LLAMA_MOE_MMQ_X differs)
|
|
||||||
|
|
||||||
Decode throughput, S_TG t/s:
|
|
||||||
|
|
||||||
```
|
|
||||||
npl stock cap16 cap32 cap64
|
|
||||||
1 85 85 85 85
|
|
||||||
8 282 280 282 282
|
|
||||||
32 629 623 629 628
|
|
||||||
64 935 915 949 934
|
|
||||||
128 1295 1204 1344 1357 <- cap64 +4.8% (cap16 -7%)
|
|
||||||
256 1779 1370 1723 1820 <- cap64 +2.3% (cap16 -23%)
|
|
||||||
```
|
|
||||||
|
|
||||||
Prefill throughput, S_PP t/s (the cost):
|
|
||||||
|
|
||||||
```
|
|
||||||
npl stock cap16 cap32 cap64
|
|
||||||
128 3083 1817 2559 3038
|
|
||||||
256 3084 1818 2560 3046
|
|
||||||
-41% -17% -1.3%
|
|
||||||
```
|
|
||||||
|
|
||||||
Reproducibility (interleaved off/cap64, two reps each):
|
|
||||||
|
|
||||||
```
|
|
||||||
npl off rep1/rep2 cap64 rep1/rep2
|
|
||||||
128 1300 / 1290 1357.5 / 1357.0
|
|
||||||
256 1786 / 1782 1826.3 / 1824.5
|
|
||||||
```
|
|
||||||
|
|
||||||
cap64 is stable to <0.1% and the gain sits well above the ~1% run-to-run band.
|
|
||||||
|
|
||||||
## Why 64 is the only value that helps net
|
|
||||||
|
|
||||||
A 512-token prefill ubatch routes ~32 tokens/expert. cap16/cap32 force those into
|
|
||||||
16/32-wide tiles, overflowing into extra col-tiles + weight re-reads -> prefill
|
|
||||||
craters (-41% / -17%). cap64 still holds the prefill density in one tile (32 < 64)
|
|
||||||
so prefill is near-neutral (-1.3%), while decode (~8 tokens/expert at npl128) gets
|
|
||||||
the fuller, higher-occupancy tile.
|
|
||||||
|
|
||||||
## Verdict
|
|
||||||
|
|
||||||
- Real but **modest** high-effective-batch DECODE micro-optimization
|
|
||||||
(+4.8% npl128, +2.3% npl256), neutral at npl<=64, ~1.3% prefill cost at cap64.
|
|
||||||
- **Not** a cliff fix (no cliff) and **not** a real-server unlock (llama-server
|
|
||||||
continuous batching already scales). Shipped as an opt-in, default-off knob;
|
|
||||||
recommended value 64 for decode-heavy high-concurrency deployments.
|
|
||||||
- Correctness: greedy temp-0 server output with cap64 is byte-identical to stock
|
|
||||||
for single-stream generation and stays coherent; thousands of capped MoE
|
|
||||||
matmuls at npl128/256 ran with no CUDA error / NaN.
|
|
||||||
|
|
||||||
## Durable follow-up (scoped, not implemented)
|
|
||||||
|
|
||||||
Replace the blunt global cap with a density-aware auto-select: choose `mmq_x`
|
|
||||||
from `ne_get_rows / n_active_experts` inside `mul_mat_q_case` so decode gets the
|
|
||||||
small tile while prefill keeps its large tile automatically (removes the ~1.3%
|
|
||||||
prefill cost). Plus the block-padded `moe_align` in `mm_ids_helper`. See
|
|
||||||
`MOE_GROUPED_GEMM_SCOPE.md`.
|
|
||||||
@@ -1,107 +0,0 @@
|
|||||||
# Paged-KV: GPU 0007 re-run + shared-prefix throughput benchmark
|
|
||||||
|
|
||||||
DGX Spark (NVIDIA GB10, sm_121 / cc 12.1), CUDA 13, dev tree `~/llama-paged-dev`
|
|
||||||
branch `paged`, base pin `f3e182816421c648188b5eab269853bf1531d950`, full paged
|
|
||||||
engine (0001-0004, 0006, 0007). All paged behaviour stays gated by
|
|
||||||
`LLAMA_KV_PAGED`; default-off is byte-identical to stock. Models:
|
|
||||||
`Qwen3-0.6B-Q8_0.gguf` and `Qwen3-32B-Q4_K_M.gguf`.
|
|
||||||
|
|
||||||
## Deliverable 1 - GPU run of the 0007 prefix-engine correctness driver
|
|
||||||
|
|
||||||
The committed driver `examples/simple/paged-prefix-engine.cpp` hardcodes
|
|
||||||
`n_gpu_layers = 0`. For this GPU run it was given a dev-only
|
|
||||||
`PAGED_NGL` env override (`mp.n_gpu_layers = getenv("PAGED_NGL") ? atoi(...) : 0`),
|
|
||||||
rebuilt in `build-cuda`, run, then the edit was **reverted** so the committed
|
|
||||||
driver stays byte-clean (it is dev scaffolding, never shipped in a patch).
|
|
||||||
|
|
||||||
Three runs of the same Gate-0 driver, Qwen3-0.6B, `LLAMA_KV_PAGED=1`:
|
|
||||||
|
|
||||||
| binary / offload | result |
|
|
||||||
|------------------------------------------|-------------------------|
|
|
||||||
| committed `build-cpu` driver | **ALL PASS (failures=0)** |
|
|
||||||
| `build-cuda`, `PAGED_NGL=99` (all layers)| GATE FAILED (failures=3)|
|
|
||||||
| `build-cuda`, `PAGED_NGL=0` (same binary)| GATE FAILED (failures=2)|
|
|
||||||
|
|
||||||
**The GPU run did NOT print ALL PASS - reported honestly.** But the failures are
|
|
||||||
narrow and are not a paged-engine bug:
|
|
||||||
|
|
||||||
- Every **structural / mechanical** paged invariant PASSES on GPU, in both
|
|
||||||
scenarios (boundary and mid-block): prefill computed ONLY the suffix (32 prefix
|
|
||||||
tokens skipped), shared prefix block-aligned, shared-block `ref_cnt == 2` while
|
|
||||||
both sequences hold it, ref drops `2 -> 1` on freeing one sharer, only the
|
|
||||||
private (suffix) blocks are returned, and the prefix block returns to the pool
|
|
||||||
once all sharers free. The cross-request KV reuse mechanism itself is GPU-clean.
|
|
||||||
- The only failures are the **exact greedy-token byte-identical** assertions
|
|
||||||
(e.g. boundary `B-shared` vs `B-from-scratch`). They diverge at a single near-tie
|
|
||||||
token (boundary: 2nd generated token `17971` vs `5671`) and then cascade
|
|
||||||
autoregressively.
|
|
||||||
|
|
||||||
Root cause is **CUDA float-kernel non-determinism, not the paged logic**: the
|
|
||||||
*same* CUDA binary fails the exact-token assertions even with `PAGED_NGL=0` (zero
|
|
||||||
layers offloaded), whereas the genuine `build-cpu` binary passes all 16/16. The
|
|
||||||
CUDA backend (loaded via `ggml_backend_load_all`) uses non-associative reductions
|
|
||||||
whose result differs between the full-prefill batch shape and the
|
|
||||||
incremental-suffix batch shape; under greedy decode a single logit near-tie flips
|
|
||||||
and the sequences cascade apart. This refines the earlier note in
|
|
||||||
`PAGED_GPU_VERIFY.md` (which framed it as "not GPU-specific" and had no CPU pass
|
|
||||||
to compare against): the CPU build now passes clean, so the divergence is a strict
|
|
||||||
test-assertion artefact of CUDA float ordering, not a defect in 0006/0007.
|
|
||||||
|
|
||||||
## Deliverable 2 - shared-prefix throughput benchmark (the real-win test)
|
|
||||||
|
|
||||||
Dev-only driver `examples/simple/paged-prefix-bench.cpp` (registered in
|
|
||||||
`examples/simple/CMakeLists.txt`, dev tree only - not in any shipped patch).
|
|
||||||
Workload: `K` sequences that all share a `P`-token common prefix (a system /
|
|
||||||
RAG preamble), each with a unique `S`-token suffix; prefill only (`G=0`,
|
|
||||||
generation is identical compute in both modes so it is excluded from the
|
|
||||||
headline). GPU, `-ngl 99`, `kv_unified = true`.
|
|
||||||
|
|
||||||
- **NO-SHARE (stock):** `LLAMA_KV_PAGED` unset; every sequence prefills the full
|
|
||||||
`P+S` tokens. Total prefill work `= K*(P+S)`.
|
|
||||||
- **PAGED-SHARE:** `LLAMA_KV_PAGED=1`; the prefix is computed ONCE on seq 0,
|
|
||||||
committed via `paged_prefix_api::commit`, then every other seq calls
|
|
||||||
`paged_prefix_api::share` to physically reuse the ref-counted prefix blocks and
|
|
||||||
prefills ONLY its suffix. Total prefill work `= P + K*S`.
|
|
||||||
|
|
||||||
**`kv_unified` note:** this engine's cross-request share is built around the
|
|
||||||
*unified* stream-0 pool (ref-counted shared cells), so `kv_unified = true` is what
|
|
||||||
makes the share engage - the same setting the committed 0007 driver uses. With
|
|
||||||
`kv_unified = true` the share engaged in every run (evidence below).
|
|
||||||
|
|
||||||
### Reuse actually engaged (share mode)
|
|
||||||
|
|
||||||
In every share run: `kshare(seq 1) = 1024` (the full block-aligned prefix is
|
|
||||||
reused, not recomputed), the shared prefix block's `ref_cnt == K` (all sharers
|
|
||||||
point at one physical copy), and `prefill_tokens_submitted` collapses from
|
|
||||||
`K*(P+S)` to `P + K*S`.
|
|
||||||
|
|
||||||
### Results (P=1024, S=32, prefill-only)
|
|
||||||
|
|
||||||
| model | K | mode | prefill tokens | prefill time | raw tok/s | shared ref_cnt |
|
|
||||||
|--------------|----|-----------|----------------|--------------|-----------|----------------|
|
|
||||||
| Qwen3-0.6B | 32 | no-share | 33792 | 4.659 s | 7253 | - |
|
|
||||||
| Qwen3-0.6B | 32 | **share** | 2048 | **0.554 s** | 3695 | 32 |
|
|
||||||
| Qwen3-32B | 16 | no-share | 16896 | 26.14 s | 647 | - |
|
|
||||||
| Qwen3-32B | 16 | **share** | 1536 | **3.64 s** | 422 | 16 |
|
|
||||||
| Qwen3-32B | 32 | no-share | 33792 | 61.91 s | 546 | - |
|
|
||||||
| Qwen3-32B | 32 | **share** | 2048 | **6.02 s** | 340 | 32 |
|
|
||||||
|
|
||||||
### Verdict: YES, a real and substantial win, and it grows with K
|
|
||||||
|
|
||||||
- Prefill wall-time speedup: **0.6B K=32 -> 8.4x**, **32B K=16 -> 7.2x**,
|
|
||||||
**32B K=32 -> 10.3x**. The win grows with the number of sharers because
|
|
||||||
no-share prefix recompute is `O(K)` while the shared prefix is `O(1)` plus
|
|
||||||
`K` tiny suffixes.
|
|
||||||
- Note the honest caveat in the raw-throughput column: share mode submits small
|
|
||||||
32-token suffix batches that are *less* GPU-efficient (340-422 tok/s on 32B, 3695 on 0.6B) than the
|
|
||||||
large no-share batches (546-647 tok/s on 32B, 7253 on 0.6B). The win is **not** higher tok/s - it is
|
|
||||||
computing ~11-16x **fewer** tokens. On a fast GB10 prefill that still nets a
|
|
||||||
7-10x wall-time reduction because prefill is compute-bound and the shared prefix
|
|
||||||
dominates the token count.
|
|
||||||
- This is exactly the many-users-one-system-prompt / RAG-preamble fan-out
|
|
||||||
scenario, and the paged cross-request prefix cache delivers there.
|
|
||||||
|
|
||||||
Scaffolding (`paged-prefix-bench.cpp`, the `PAGED_NGL` driver tweak) stays
|
|
||||||
dev-tree-only and is not part of any shipped patch.
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
@@ -1,81 +0,0 @@
|
|||||||
# Paged-KV GPU verification + full backend CUDA build
|
|
||||||
|
|
||||||
Verification run on a DGX Spark (NVIDIA GB10, compute capability 12.1 / sm_121),
|
|
||||||
CUDA 13.0, against pin `f3e182816421c648188b5eab269853bf1531d950`. Models:
|
|
||||||
`Qwen3-0.6B-Q8_0.gguf` (core gate) and `Qwen3-32B-Q4_K_M.gguf` (sanity).
|
|
||||||
|
|
||||||
All paged behaviour stays gated by `LLAMA_KV_PAGED` (env) / the `kv_paged`
|
|
||||||
server option; default-off is byte-identical to stock.
|
|
||||||
|
|
||||||
## Deliverable 1 - GPU-path correctness (all on GPU, `-ngl 99`)
|
|
||||||
|
|
||||||
CUDA build of the dev tree configured with
|
|
||||||
`-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=121 -DCMAKE_BUILD_TYPE=Release`;
|
|
||||||
all paged drivers (`llama-simple`, `llama-paged-multiseq`,
|
|
||||||
`llama-paged-prefix`, `llama-paged-prefix-engine`) compiled clean under sm_121.
|
|
||||||
|
|
||||||
1. Core token-identical gate - PASS. `llama-simple` greedy, Qwen3-0.6B, `-ngl 99`:
|
|
||||||
stock (env unset) vs `LLAMA_KV_PAGED=1` output is BYTE-IDENTICAL. The paged
|
|
||||||
path is genuinely engaged: `LLAMA_KV_PAGED_DEBUG=1` shows the device gather
|
|
||||||
firing (`[paged-attn] gather n_stream=1 ...`), per-token block placement
|
|
||||||
(`[paged-alloc] ... grew`), and the stock run uses CUDA Graphs while the paged
|
|
||||||
run takes the distinct gather path - yet output matches exactly.
|
|
||||||
|
|
||||||
2. Multi-stream - PASS. `llama-paged-multiseq -s 4 -ngl 99`, stock vs paged:
|
|
||||||
all 4 concurrent sequences BYTE-IDENTICAL on GPU (n_seqs=4, CUDA0 compute
|
|
||||||
buffer matches expectation). Same result reproduced on the CPU build.
|
|
||||||
|
|
||||||
Prefix recompute-skip (`llama-paged-prefix-engine`, patch 0007) - MIXED, and
|
|
||||||
this is a dev-scaffolding driver ("not shipped"); it was never built on CPU
|
|
||||||
(absent from the CPU Gate-0 set), so there is no prior CPU pass to match.
|
|
||||||
The driver hardcodes `n_gpu_layers = 0`; a reported test-harness-only env
|
|
||||||
override (`PAGED_NGL`) was added to run it at `-ngl 99` (29/29 layers
|
|
||||||
offloaded confirmed), then reverted. Results are IDENTICAL on CPU and GPU
|
|
||||||
(so not a GPU issue):
|
|
||||||
- PASS: measured recompute-skip (32 prefix tokens skipped, block-aligned),
|
|
||||||
ref-count == 2 on shared block, ref drop 2->1 on free, only-private-blocks
|
|
||||||
returned, block returned to pool.
|
|
||||||
- FAIL: 2 of ~16 greedy-token-equality assertions. `boundary` case diverges
|
|
||||||
from the from-scratch baseline at the 2nd generated token (`17971` vs
|
|
||||||
`5671`) and then completely; `mid-block` "A re-shareable after free, output
|
|
||||||
unchanged" also differs. Driver prints `GATE FAILED (failures=2)`.
|
|
||||||
This is a divergence in the prefix recompute-skip path (0006/0007), NOT in the
|
|
||||||
core gather gate, and not GPU-specific. Reported, not fixed (out of scope).
|
|
||||||
|
|
||||||
3. 32B GPU sanity - PASS. `LLAMA_KV_PAGED=1 llama-simple -ngl 99 -n 16` on
|
|
||||||
Qwen3-32B-Q4_K_M (65/65 layers offloaded): coherent output
|
|
||||||
("The capital of France is Paris..."), no crash, no OOM.
|
|
||||||
|
|
||||||
## Deliverable 2 - full backend build with the paged patches
|
|
||||||
|
|
||||||
Built in a nested LocalAI tree on the DGX; gRPC v1.59.0 built from source
|
|
||||||
(LocalAI bundle; the system protobuf ships no CMake CONFIG) in ~26 min.
|
|
||||||
|
|
||||||
- (2a) `make llama.cpp LLAMA_PAGED=on` - PASS. All 6 paged patches
|
|
||||||
(0001,0002,0003,0004,0006,0007) `git apply` cleanly to the pin (EXIT=0). The 8
|
|
||||||
vendored paged sources land in `llama.cpp/src/` and are BYTE-IDENTICAL to the
|
|
||||||
dev tree; `grpc-server.cpp` carries the `kv_paged`/`paged_attention` option
|
|
||||||
(patch 0005); `llama-kv-cache.cpp` has the env-gated hooks.
|
|
||||||
|
|
||||||
- (2b) grpc-server under CUDA sm_121 - PASS (with the single-application caveat
|
|
||||||
below). 89 MB ARM aarch64 executable, build ~139 s, linked against
|
|
||||||
libcudart.so.13 / libcublas.so.13; binary contains the paged option strings
|
|
||||||
and `paged_alloc`/`paged_attn`/gather symbols.
|
|
||||||
|
|
||||||
- (2c) `make llama.cpp LLAMA_PAGED=off` - PASS. "skipping paged-attention patch
|
|
||||||
series", EXIT=0, NO `paged-*` sources in the checkout (clean escape hatch).
|
|
||||||
|
|
||||||
### Build-flow finding: paged patches are applied TWICE in the on-flow
|
|
||||||
|
|
||||||
A plain `make grpc-server LLAMA_PAGED=on` FAILS to compile. The paged series is
|
|
||||||
applied by BOTH the Makefile `llama.cpp` target (`git apply`) AND `prepare.sh`
|
|
||||||
(`patch -p1`). On the already-git-applied tree, `prepare.sh` hits "Reversed (or
|
|
||||||
previously applied) patch detected! Assume -R? [n]", declines, and re-applies the
|
|
||||||
pure-addition hunks a second time. `llama_kv_cache::get_n_gather` etc. end up
|
|
||||||
defined twice -> redefinition errors in `llama-kv-cache.cpp` (`.rej`/`.orig`
|
|
||||||
litter `src/`). Single application (one of the two appliers) compiles clean -
|
|
||||||
the 2b build above used a single git-apply with `prepare.sh` patching suppressed.
|
|
||||||
Reported only; the fix (drop one of the two application sites for
|
|
||||||
`patches/paged/`) is out of scope for this verification.
|
|
||||||
|
|
||||||
Assisted-by: Claude:opus-4.8 [Claude Code]
|
|
||||||
@@ -1,111 +0,0 @@
|
|||||||
# Paged llama.cpp vs vLLM - apples-to-apples (batched + NVFP4 + prefix cache)
|
|
||||||
|
|
||||||
Definitive matched comparison on a DGX Spark (GB10, sm_121). Both engines batched,
|
|
||||||
both NVFP4-class weights, both with prefix caching on, both eager (no CUDA graphs).
|
|
||||||
Workload: shared 1024-token system prefix + unique 32-token suffix, generate 64
|
|
||||||
tokens, K requests fired concurrently (cold fan-out), one client hitting both
|
|
||||||
OpenAI-compatible servers with identical token-id prompts.
|
|
||||||
|
|
||||||
This run fixes the two confounders in the earlier comparison (a *serial* Q4_K dev
|
|
||||||
driver vs a *batched* FP4 vLLM server). Here both sides are batched and NVFP4.
|
|
||||||
|
|
||||||
## Setup
|
|
||||||
|
|
||||||
- llama.cpp: `llama-server` built from the paged dev tree (`~/llama-paged-dev`,
|
|
||||||
branch `paged`, patches 0001-0007), CUDA `build-cuda/` (sm_121).
|
|
||||||
`LLAMA_KV_PAGED=1`, `-ngl 99 --parallel 32 -c 40960`, model
|
|
||||||
`q3-32b-nvfp4-dense.gguf` (NVFP4 weights, FP4-MMA kernel). Endpoint: `/completion` (llama-server's native route, not the OpenAI-style `/v1/completions`).
|
|
||||||
- vLLM 0.23.0: `vllm serve q3-32b-nvfp4a16/` (compressed-tensors W4A16 / Marlin),
|
|
||||||
`--enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.9
|
|
||||||
--max-num-seqs 64`, APC on (default). OpenAI `/v1/completions`.
|
|
||||||
|
|
||||||
## Finding 1 - the paged cross-request prefix cache does NOT engage in llama-server
|
|
||||||
|
|
||||||
This is itself a key result. The paged engine has two distinct mechanisms:
|
|
||||||
|
|
||||||
1. Physical paged block placement (patches 0002/0004) - runs inside
|
|
||||||
`llama_kv_cache::find_slot`, gated only by `LLAMA_KV_PAGED`. This DOES engage in
|
|
||||||
the server: with `LLAMA_KV_PAGED_DEBUG=1`, 2 concurrent shared-prefix requests
|
|
||||||
produced 14 `[paged-alloc] ... grew` lines, one stream per `seq`.
|
|
||||||
|
|
||||||
2. Cross-request prefix recompute-skip (patch 0007) - the actual fan-out win
|
|
||||||
(`shares N prefix blocks ... prefix NOT recomputed`, ref-counted block sharing).
|
|
||||||
This is reachable ONLY through `paged_prefix_api::share/commit`
|
|
||||||
(`src/paged-prefix-api.cpp`), which only the standalone driver calls.
|
|
||||||
|
|
||||||
Evidence it does not reach the server:
|
|
||||||
- Static: `grep -rn "paged_prefix\|share_prefix\|LLAMA_KV_PAGED" tools/server/`
|
|
||||||
returns nothing; `nm` on the binary finds no `paged_prefix` symbol use from the
|
|
||||||
server path. Nothing in `llama_decode` or the server calls `share`/`commit`.
|
|
||||||
- Runtime: the 2-request verify run logged **0** `shares prefix blocks` /
|
|
||||||
`NOT recomputed` lines. Both `seq=0` and `seq=1` independently grew to 65 blocks,
|
|
||||||
each allocating and recomputing the full ~972-token prefix separately - no
|
|
||||||
cross-slot KV block sharing, no `ref_cnt>1`.
|
|
||||||
|
|
||||||
So the 0007 recompute-skip, proven in the driver, does **not** yet reach the
|
|
||||||
server. Closing it needs server-side wiring: when admitting a slot whose prompt
|
|
||||||
shares a prefix with another live/committed slot, the server would have to call
|
|
||||||
the `paged_prefix_api::share` / `commit` seam. That is a future patch.
|
|
||||||
|
|
||||||
Note: llama-server has its OWN native prefix reuse (the slot prompt cache /
|
|
||||||
"context checkpoints"). In the K=32 wave the server reused the prefix cached by the
|
|
||||||
earlier wave, so prefill was only the 32-token suffix (`prompt eval ... / 32
|
|
||||||
tokens`). But that is a separate mechanism, it only helps prefill, and prefill is
|
|
||||||
not the bottleneck here (see below), so it does not change the verdict.
|
|
||||||
|
|
||||||
## Finding 2 - the matched comparison
|
|
||||||
|
|
||||||
Both batched, both NVFP4, both prefix-cache on, both eager. Cold concurrent fan-out,
|
|
||||||
identical token-id prompts via one client.
|
|
||||||
|
|
||||||
| K | engine | wall (s) | aggregate gen tok/s | req/s | vLLM speedup |
|
|
||||||
|----|----------|----------|---------------------|-------|--------------|
|
|
||||||
| 16 | llama.cpp| 50.7 | 18.9 | 0.30 | - |
|
|
||||||
| 16 | vLLM | 8.57 | 119.5 | 1.87 | ~5.9x |
|
|
||||||
| 32 | llama.cpp| 58.3 | 34.0 | 0.53 | - |
|
|
||||||
| 32 | vLLM | 8.86 | 231.1 | 3.61 | ~6.6x |
|
|
||||||
|
|
||||||
vLLM APC confirmed engaged: prefix cache hit rate 90.9% (K=16), 95.5% (K=32),
|
|
||||||
enforce_eager (CUDA graphs disabled), `enable_prefix_caching=True`.
|
|
||||||
|
|
||||||
### Verdict: not competitive - vLLM ~6x faster, and prefix caching is not why
|
|
||||||
|
|
||||||
With every confounder removed (both batched, both NVFP4, both eager, both with
|
|
||||||
prefix caching on), vLLM is still ~6x faster end-to-end. The gap is decode-bound,
|
|
||||||
not prefill/cache-bound:
|
|
||||||
|
|
||||||
- The G=64 workload is dominated by decode. In the llama K=32 run, decode was
|
|
||||||
52.98s of the 58.3s wall; prefill was ~3.5s (and only the 32-token suffix, since
|
|
||||||
the server's native prompt cache already reused the prefix). So even perfect
|
|
||||||
prefix sharing - paged or native - cannot move the total much.
|
|
||||||
- llama.cpp batched decode: **~828 ms per decode step** at batch 32
|
|
||||||
(1.21 tok/s per sequence).
|
|
||||||
- vLLM batched decode: ~170 tok/s aggregate gen at 32 running reqs ->
|
|
||||||
**~185 ms per step**, roughly **4-5x faster per decode step**.
|
|
||||||
- CUDA graphs are NOT the differentiator: both sides are eager (llama
|
|
||||||
`graphs reused = 0`, vLLM `--enforce-eager`). The win is vLLM's batched-decode
|
|
||||||
efficiency: PagedAttention + fused W4A16 (Marlin) GEMMs + chunked-prefill
|
|
||||||
scheduler, versus llama.cpp's per-step eager graph and NVFP4-GGUF decode path on
|
|
||||||
this Blackwell-class part.
|
|
||||||
|
|
||||||
Because decode dominates, wiring the paged 0007 recompute-skip into the server
|
|
||||||
(Finding 1) would mainly remove redundant prefill across slots - a real saving for
|
|
||||||
short-generation / long-prefix RAG fan-out, but at G=64 it is a few seconds against
|
|
||||||
a decode floor that is already ~6x slower than vLLM. The fan-out win does not, on
|
|
||||||
its own, make llama.cpp competitive here; the decode kernel/batching gap is the
|
|
||||||
load-bearing factor.
|
|
||||||
|
|
||||||
## Caveats
|
|
||||||
|
|
||||||
- NVFP4-GGUF is double-quant and is speed-representative (it routes onto the
|
|
||||||
FP4-MMA kernel); output quality is not the subject of this run.
|
|
||||||
- vLLM side is NVFP4A16 (W4A16 / Marlin) - 4-bit weights, 16-bit activations;
|
|
||||||
llama side is NVFP4 weights on FP4-MMA. Both are NVFP4-weight class.
|
|
||||||
- One llama request per run hit an intermittent HTTP 500 ("output does not match
|
|
||||||
the expected Content-only format" - a Qwen3 thinking-output quirk on
|
|
||||||
`/completion`), so llama counts were 15/16 and 31/32. The failed request returns
|
|
||||||
early and reduces batch contention for the rest, so a clean 16/16 / 32/32 llama
|
|
||||||
run would be marginally slower - i.e. the ~6x gap reported here is conservative
|
|
||||||
(favorable to llama.cpp).
|
|
||||||
- Both servers cold-started; numbers are end-to-end wall from the concurrent
|
|
||||||
client. Disk healthy (~325 GB free), GPU otherwise idle.
|
|
||||||
@@ -1,165 +0,0 @@
|
|||||||
# Paged-attention closing measurements: stock GPU determinism + vLLM comparison
|
|
||||||
|
|
||||||
Two closing measurements for the paged-attention series, run on a DGX Spark
|
|
||||||
(NVIDIA GB10, compute capability 12.1 / sm_121), CUDA 13. Dev tree
|
|
||||||
`~/llama-paged-dev` branch `paged`, paged engine gated by env `LLAMA_KV_PAGED`
|
|
||||||
(default-off = stock). Models: `Qwen3-0.6B-Q8_0.gguf` and
|
|
||||||
`Qwen3-32B-Q4_K_M.gguf` (llama.cpp), `Qwen3-32B` nvfp4a16 / W4A16 HF safetensors
|
|
||||||
(vLLM 0.23.0). All dev drivers are dev-tree-only and not shipped.
|
|
||||||
|
|
||||||
## Deliverable 1: stock GPU determinism across batch shapes (no paging)
|
|
||||||
|
|
||||||
Question: is the patch-0007 GPU byte-identity "failure" (a near-tie greedy token
|
|
||||||
flips on CUDA, e.g. 17971 vs 5671) caused by paging, or is it inherent stock
|
|
||||||
CUDA non-determinism from running the same tokens in a different batch shape?
|
|
||||||
|
|
||||||
Method: a new dev-only driver `llama-paged-batchshape` (paging explicitly OFF:
|
|
||||||
`unsetenv("LLAMA_KV_PAGED")`). For a prompt `[P+S]` it greedy-decodes two ways,
|
|
||||||
both stock contiguous KV:
|
|
||||||
|
|
||||||
- (a) `full` - prefill the whole `[P+S]` in ONE `llama_decode`.
|
|
||||||
- (b) `split` - prefill `P` in one `llama_decode`, then `S` in a second.
|
|
||||||
|
|
||||||
The two paths write byte-for-byte identical token ids; the only difference is the
|
|
||||||
batch shape submitted to the kernels (full prefill vs P-then-S), which changes
|
|
||||||
the float reduction order in the GEMMs and therefore the KV values by tiny
|
|
||||||
amounts. 5 distinct prompts, suffix S=16.
|
|
||||||
|
|
||||||
### Single next token (the literal T_full vs T_split)
|
|
||||||
|
|
||||||
Both CPU and CUDA returned the SAME greedy next token for all 5 prompts
|
|
||||||
(0/5 flips). BUT the top-2 logit gap measurably changes with the batch shape on
|
|
||||||
CUDA, proving the float order does differ:
|
|
||||||
|
|
||||||
```
|
|
||||||
CUDA, S=8: prompt 1 T_full=1896 (gap 0.07072) T_split=1896 (gap 0.17986)
|
|
||||||
CUDA, S=8: prompt 4 T_full=49584 (gap 0.93304) T_split=49584 (gap 0.85785)
|
|
||||||
```
|
|
||||||
|
|
||||||
The argmax simply did not flip on the immediate next token for these prompts -
|
|
||||||
the gaps, while shifting, stayed wide enough.
|
|
||||||
|
|
||||||
### Generated stream (what 0007 actually byte-asserts)
|
|
||||||
|
|
||||||
0007 asserts byte-identity over a *generated* token stream, where the tiny
|
|
||||||
prefill-shape KV perturbation accumulates and eventually crosses a near-tie.
|
|
||||||
Generating G tokens greedily from `full` vs `split` and reporting first
|
|
||||||
divergence:
|
|
||||||
|
|
||||||
| gen length | CPU diverged | CUDA diverged |
|
|
||||||
|-----------|--------------|---------------|
|
|
||||||
| G=24 (0007 default) | 1/5 (prompt 0 @ step 5) | 2/5 (prompt 1 @ step 3, prompt 4 @ step 6) |
|
|
||||||
| G=64 | 2/5 (steps 5, 42) | 3/5 (steps 3, 6, 30) |
|
|
||||||
|
|
||||||
Example CUDA divergence, pure stock, zero paging:
|
|
||||||
`prompt 1: DIVERGES at gen step 3: full=1260 split=576`.
|
|
||||||
|
|
||||||
### Verdict (Deliverable 1): HYPOTHESIS HELD
|
|
||||||
|
|
||||||
The 0007 GPU byte-identity failure is **stock batch-shape non-determinism, not a
|
|
||||||
paged bug**. With paging entirely OFF, stock llama.cpp produces a different
|
|
||||||
greedy token stream when the same prompt is processed in a full-prefill batch vs
|
|
||||||
a split (prefix-then-suffix) batch - exactly the shape difference that 0007's
|
|
||||||
prefix-share path introduces (full B-from-scratch vs prefix-cached + suffix-only).
|
|
||||||
|
|
||||||
Refinement (reported honestly): it is **not strictly CUDA-only**. CPU exhibits
|
|
||||||
the same divergence, just less often and later (1/5 vs 2/5 at G=24, and CPU's
|
|
||||||
flips land at later generation steps). This is exactly why 0007's small, short
|
|
||||||
CPU scenarios happened to pass 16/16 while the CUDA run flipped: CUDA's larger
|
|
||||||
parallel reductions reorder more aggressively, so a near-tie crosses earlier and
|
|
||||||
more frequently. The phenomenon is floating-point GEMM-batching non-determinism,
|
|
||||||
inherent to both backends; paging is not the cause.
|
|
||||||
|
|
||||||
## Deliverable 2: vLLM vs llama.cpp+paged on a shared-prefix fan-out
|
|
||||||
|
|
||||||
Workload: K requests share a 1024-token system prefix, each with a unique
|
|
||||||
32-token suffix, then generate 64 tokens. Both engines cache the shared prefix
|
|
||||||
(vLLM automatic prefix caching ON by default; llama.cpp via the paged
|
|
||||||
cross-request prefix cache, `LLAMA_KV_PAGED=1`).
|
|
||||||
|
|
||||||
The quantization is a realistic apples-to-oranges comparison, reported honestly:
|
|
||||||
- llama.cpp: Qwen3-32B **Q4_K_M** (GGUF), `-ngl 99`, CUDA dequant kernels.
|
|
||||||
- vLLM: Qwen3-32B **nvfp4a16 (W4A16)**, served via the **Marlin FP4
|
|
||||||
weight-only** kernel because GB10 (sm_121) has **no native FP4 compute** -
|
|
||||||
i.e. vLLM is on a slower-than-ideal kernel path here. vLLM also ran
|
|
||||||
`enforce_eager=True` (no CUDA graphs / torch.compile; the env lacked a working
|
|
||||||
inductor/ninja toolchain), so the vLLM numbers are if anything **conservative**.
|
|
||||||
|
|
||||||
### vLLM (automatic prefix caching), end-to-end
|
|
||||||
|
|
||||||
APC hits confirmed in the engine log: **"Prefix cache hit rate: 97.0%"**,
|
|
||||||
`prefix_cache_hits 33040/34848` (K=16) and `99344/102432` (K=32).
|
|
||||||
|
|
||||||
| K | APC | prefill wall (G=1) | total wall (G=64) | throughput |
|
|
||||||
|---|-----|--------------------|--------------------|-----------|
|
|
||||||
| 16 | ON | 0.749 s | 6.63 s | 2.41 req/s |
|
|
||||||
| 16 | OFF | 20.19 s | 27.21 s | 0.59 req/s |
|
|
||||||
| 32 | ON | 1.13 s | 7.56 s | 4.23 req/s |
|
|
||||||
| 32 | OFF | 40.19 s | 48.71 s | 0.66 req/s |
|
|
||||||
|
|
||||||
vLLM's APC cuts the fan-out prefill ~27x (K=16) to ~36x (K=32) vs APC-off; the
|
|
||||||
huge ratio reflects how slow the FP4-emulation prefill is when forced to
|
|
||||||
recompute all K prefixes.
|
|
||||||
|
|
||||||
### llama.cpp + paged prefix cache (prefill phase)
|
|
||||||
|
|
||||||
The paged shared-prefix bench (`llama-paged-prefix-bench`, `BENCH_GEN=0`,
|
|
||||||
`PAGED_NGL=99`). Reuse confirmed: `kshare(seq1)=1024`, shared-block
|
|
||||||
`ref_cnt = K` (all sequences hold the one prefix), 15360 / 31744 prefix tokens
|
|
||||||
skipped.
|
|
||||||
|
|
||||||
| K | mode | prefill tokens submitted | prefill wall | vs no-share |
|
|
||||||
|---|------|--------------------------|--------------|-------------|
|
|
||||||
| 16 | PAGED-SHARE | 1536 | 3.66 s | 7.15x |
|
|
||||||
| 16 | NO-SHARE | 16896 | 26.17 s | 1.0x |
|
|
||||||
| 32 | PAGED-SHARE | 2048 | 6.04 s | 10.3x |
|
|
||||||
| 32 | NO-SHARE | 33792 | 62.17 s | 1.0x |
|
|
||||||
|
|
||||||
The paged prefix cache delivers the expected **7.15x (K=16) / 10.3x (K=32)**
|
|
||||||
prefill wall-time reduction - the headline cross-request prefix-skip win, on a
|
|
||||||
real 32B model on GPU.
|
|
||||||
|
|
||||||
### Head-to-head, both engines caching the shared prefix
|
|
||||||
|
|
||||||
Prefill of the cached fan-out (vLLM G=1, ~prefill; llama.cpp G=0, pure prefill):
|
|
||||||
|
|
||||||
| K | llama.cpp+paged prefill | vLLM APC prefill | vLLM faster by |
|
|
||||||
|---|-------------------------|------------------|----------------|
|
|
||||||
| 16 | 3.66 s | 0.749 s | ~4.9x |
|
|
||||||
| 32 | 6.04 s | 1.13 s | ~5.3x |
|
|
||||||
|
|
||||||
### Verdict (Deliverable 2): competitive in kind, behind in absolute terms
|
|
||||||
|
|
||||||
With both engines caching the shared prefix, **llama.cpp+paged is qualitatively
|
|
||||||
competitive but absolutely behind vLLM on this GB10 box**:
|
|
||||||
|
|
||||||
- **Same optimization, same order of magnitude.** llama.cpp's paged prefix cache
|
|
||||||
reproduces exactly the win vLLM's APC gives - skip the shared-prefix recompute -
|
|
||||||
and yields a 7-10x prefill reduction vs its own no-share baseline. On the
|
|
||||||
RAG/system-prompt fan-out the algorithmic gap is closed: llama.cpp no longer
|
|
||||||
pays K x prefix.
|
|
||||||
|
|
||||||
- **vLLM still wins head-to-head by ~5x on the cached prefill** (0.75s vs 3.66s
|
|
||||||
at K=16; 1.13s vs 6.04s at K=32), and by more end-to-end because it does
|
|
||||||
**continuous batched decode** (all K sequences decoded in one fused step)
|
|
||||||
while the llama.cpp paged *dev driver* decodes each sequence serially. That
|
|
||||||
decode-batching gap is a property of the serving stack, not of the paged
|
|
||||||
prefix cache. Notably vLLM wins here while handicapped (eager mode, FP4
|
|
||||||
weight-only emulation with no native FP4 on GB10); a tuned vLLM would lead by
|
|
||||||
more.
|
|
||||||
|
|
||||||
- **Honest caveats / blockers.** (1) Quant differs (Q4_K_M vs nvfp4a16). (2) The
|
|
||||||
comparison is prefill-vs-prefill plus vLLM end-to-end; a clean llama.cpp
|
|
||||||
end-to-end on this driver is blocked because its generation phase has a
|
|
||||||
stale-logits bug (`get_logits_ith` reads seq 0's prefill index after later
|
|
||||||
sequences' prefills overwrote the logits buffer -> segfault), and even fixed
|
|
||||||
its decode is serial, so it would not be apples-to-apples vs vLLM's batched
|
|
||||||
decode. The fair end-to-end llama.cpp number needs the grpc / llama-server
|
|
||||||
continuous-batching path, not this dev scaffold. (3) vLLM ran eager + FP4
|
|
||||||
emulation, making its numbers conservative.
|
|
||||||
|
|
||||||
Bottom line: paged gives llama.cpp the cross-request prefix-skip that vLLM's APC
|
|
||||||
provides, which is the categorical win and removes the K x prefix penalty on
|
|
||||||
RAG/system-prompt fan-out. On absolute wall-time on this hardware vLLM retains a
|
|
||||||
~5x prefill lead and a larger end-to-end lead from continuous batched decode and
|
|
||||||
a more optimized serving stack.
|
|
||||||
@@ -1,138 +0,0 @@
|
|||||||
# GB10 same-day head-to-head server sweep: llama-server (paged) vs vLLM
|
|
||||||
|
|
||||||
Date: 2026-06-23. Hardware: GB10 / DGX Spark (sm_121, 128 GB LPDDR5x unified, ~273 GB/s
|
|
||||||
weight-read floor). GPU otherwise idle (sibling vLLM had exited; LocalAI docker workers
|
|
||||||
stopped for the run).
|
|
||||||
|
|
||||||
This sweep **replaces** the stale carried "~75-80% of vLLM" figure (commit 07985ba4,
|
|
||||||
pre-co-batching, single-point). It measures *real serving* steady-state aggregate decode
|
|
||||||
throughput across the full concurrency curve, for three model classes, with one identical
|
|
||||||
client driving both engines.
|
|
||||||
|
|
||||||
## Method
|
|
||||||
|
|
||||||
- **llama**: `llama-server` from the paged dev tree (`~/llama-paged-dev/build-cuda`, HEAD =
|
|
||||||
patch 0013 / commit 17d97cb), `LLAMA_KV_PAGED=1`, `-fa on -ngl 999 --parallel 128 -c 65536`.
|
|
||||||
- **vLLM**: 0.23.0, `vllm serve --enforce-eager --enable-prefix-caching --max-num-seqs >=128
|
|
||||||
--max-model-len 4096` (APC on, eager per the GB10 no-CUDA-graphs edge).
|
|
||||||
- **Client** (`sweep_client2.py`): N concurrent **non-streaming** `/v1/completions`, short
|
|
||||||
shared prompt, `max_tokens=min_tokens=256`, `ignore_eos=true`. Aggregate decode tok/s =
|
|
||||||
total generated tokens / wall. Non-streaming keeps the Python client off the critical path
|
|
||||||
(one JSON parse per request, not per token), so the **server** is the bottleneck. Validated:
|
|
||||||
vLLM pushed 4227 tok/s through the exact same client where llama topped out at 2087, so the
|
|
||||||
client is not the cap. Both engines use the identical client + prompt -> apples-to-apples.
|
|
||||||
- npl (concurrency) sweep: 8 / 32 / 64 / 128.
|
|
||||||
|
|
||||||
Quant parity:
|
|
||||||
- Dense: llama **NVFP4-dense GGUF** (weight-only FP4, 16-bit compute) vs vLLM **NVFP4A16**
|
|
||||||
(weight FP4, 16-bit activation) -> matched precision class.
|
|
||||||
- Small: llama **Q8_0** vs vLLM **bf16** (closest loadable form).
|
|
||||||
- MoE: llama **mxfp4** GGUF. **vLLM could not serve this MoE on GB10 at all** (see below), so
|
|
||||||
there is no vLLM MoE column.
|
|
||||||
|
|
||||||
## Results: aggregate decode tok/s (higher is better)
|
|
||||||
|
|
||||||
### Dense 32B (llama NVFP4-dense vs vLLM NVFP4A16)
|
|
||||||
|
|
||||||
| npl | llama (NVFP4) | vLLM (NVFP4A16) | llama % of vLLM |
|
|
||||||
|----:|--------------:|----------------:|----------------:|
|
|
||||||
| 8 | 83.2 | 85.9 | **96.9%** |
|
|
||||||
| 32 | 228.9 | 301.3 | 76.0% |
|
|
||||||
| 64 | 367.1 | 507.8 | 72.3% |
|
|
||||||
| 128 | 520.6 | 604.0 | 86.2% |
|
|
||||||
|
|
||||||
Plateau: neither has plateaued at 128 (both still climbing, weight-read bound). llama is at
|
|
||||||
**parity at batch-8** (97%), dips to ~72% mid-curve (npl 32-64), recovers to 86% at 128.
|
|
||||||
|
|
||||||
### Small Qwen3-0.6B (llama Q8_0 vs vLLM bf16)
|
|
||||||
|
|
||||||
| npl | llama (Q8_0) | vLLM (bf16) | llama % of vLLM |
|
|
||||||
|----:|-------------:|------------:|----------------:|
|
|
||||||
| 8 | 911.3 | 923.0 | **98.7%** |
|
|
||||||
| 32 | 1701.6 | 2531.4 | 67.2% |
|
|
||||||
| 64 | 1911.7 | 3497.1 | 54.7% |
|
|
||||||
| 128 | 2087.6 | 4227.6 | 49.4% |
|
|
||||||
|
|
||||||
Plateau: **llama plateaus hard** at ~2.0-2.1k by npl 64-128 (+9% from 64->128). vLLM keeps
|
|
||||||
scaling (3497 -> 4227). For a tiny runtime-bound model, vLLM's scheduler/batching amortizes
|
|
||||||
better; llama-server's per-token host cost (sampling, detok, slot mgmt) caps it. This is the
|
|
||||||
worst llama-vs-vLLM ratio in the sweep (down to 49%).
|
|
||||||
|
|
||||||
### MoE Qwen3-Coder-30B-A3B (llama mxfp4; vLLM = NOT SERVABLE on GB10)
|
|
||||||
|
|
||||||
| npl | llama (mxfp4) | vLLM |
|
|
||||||
|----:|--------------:|-----:|
|
|
||||||
| 8 | 290.0 | n/a |
|
|
||||||
| 32 | 582.5 | n/a |
|
|
||||||
| 64 | 931.8 | n/a |
|
|
||||||
| 128 | 1041.3 | n/a |
|
|
||||||
|
|
||||||
llama-server scales cleanly to **1041 tok/s** at npl 128 with **no npl-128 expert-activation
|
|
||||||
cliff** (unlike the prior `llama-batched-bench` MoE numbers 253/505/830/620 that peaked at 64;
|
|
||||||
short-prompt continuous batching in the server avoids it).
|
|
||||||
|
|
||||||
**vLLM could not serve this MoE on GB10 (two independent failures):**
|
|
||||||
1. **bf16** (`Qwen/Qwen3-Coder-30B-A3B-Instruct`, the only HF form on the box): loads the
|
|
||||||
56.9 GB of weights, then **hangs at the MoE warmup** (`Using MoEPrepareAndFinalize
|
|
||||||
NoDPEPModular` -> `Model loading took ...`), GPU 0% util, and **takes the whole box down
|
|
||||||
(hard reboot)**. Reproduced twice. With tight `--gpu-memory-utilization` it still hangs at
|
|
||||||
the same step before the API server ever comes up.
|
|
||||||
2. **mxfp4 GGUF** (same weights llama uses): vLLM 0.23.0's GGUF loader **cannot map the fused
|
|
||||||
qwen3moe expert tensors** (`RuntimeError: Failed to map GGUF parameters (48):
|
|
||||||
['model.layers.N.mlp.experts.gate_up_proj', ...]`). Engine init fails outright.
|
|
||||||
|
|
||||||
So on GB10, llama.cpp is the *only* engine of the two that serves this 30B-A3B MoE at all -
|
|
||||||
an availability win, independent of throughput.
|
|
||||||
|
|
||||||
## Batch-8 anomaly triage (dense NVFP4) -- RESOLVED
|
|
||||||
|
|
||||||
The prior mixed-load run reported llama batch-8 steady decode at **471 ms/step (~19% of vLLM
|
|
||||||
aggregate, ~17 tok/s)**. This sweep does **not** reproduce it. Clean isolated batch-8 decode:
|
|
||||||
|
|
||||||
- `llama-server` batch-8 dense paged = **83.2 tok/s** aggregate = ~96 ms/step = **96.9% of
|
|
||||||
vLLM's 85.9** (parity, both at the LPDDR5x weight-read floor).
|
|
||||||
|
|
||||||
`llama-batched-bench` cross-check, dense NVFP4, `-npp 16 -ntg 128 -npl 1,8`, the three
|
|
||||||
hypotheses isolated (S_TG = decode tok/s aggregate at batch 8):
|
|
||||||
|
|
||||||
| config | batch-8 S_TG t/s | ms/decode-step |
|
|
||||||
|-----------------------|-----------------:|---------------:|
|
|
||||||
| paged, ctx 65536 | 90.32 | 88.6 |
|
|
||||||
| stock, ctx 65536 | 88.39 | 90.5 |
|
|
||||||
| paged, ctx 163840 | 89.33 | 89.6 |
|
|
||||||
| stock, ctx 163840 | 87.72 | 91.2 |
|
|
||||||
|
|
||||||
Conclusion: clean batch-8 dense decode is **~88-90 tok/s (~89 ms/step) regardless of all three
|
|
||||||
suspects**:
|
|
||||||
- **Paged overhead?** No -- paged is within 2% of stock, and at ctx 65k paged is *faster*
|
|
||||||
(90.3 vs 88.4). The decode path is not paying a paged penalty at batch-8.
|
|
||||||
- **The 163840-token ctx allocation?** No -- ctx 163840 == ctx 65536 within 1% (89.3 vs 90.3).
|
|
||||||
The large allocation does not slow steady-state decode.
|
|
||||||
- **NVFP4 decode cost?** This *is* the cost -- ~89 ms/step is the GB10 weight-read floor for a
|
|
||||||
32B at batch-8 (it matches vLLM's 86 tok/s server and exceeds it at the kernel level: 90 vs
|
|
||||||
86). It is the hardware ceiling, not a bug.
|
|
||||||
|
|
||||||
The 471 ms/step is ~5.3x slower than this clean floor and is explained by none of the three.
|
|
||||||
It was a **mixed-load artifact**: the 8 decoders were time-sharing the GPU with a concurrent
|
|
||||||
prefill (a large prompt / chunked prefill landing on the same steps). That decode-vs-prefill
|
|
||||||
contention is exactly the stall **patch 0013 (`LLAMA_PREFILL_BUDGET`)** bounds. In steady-state
|
|
||||||
isolated decode, batch-8 dense is at **parity with vLLM (97%)**, not 19%.
|
|
||||||
|
|
||||||
## Aggregate map (replaces the carried 75-80%)
|
|
||||||
|
|
||||||
llama-server (paged) as a fraction of vLLM, by regime:
|
|
||||||
|
|
||||||
- **Low concurrency (batch-8): parity, 97-99%** on both measurable classes. Both engines sit on
|
|
||||||
the LPDDR5x weight-read floor; there is nothing to win.
|
|
||||||
- **Dense 32B, mid-to-high concurrency: 72-86%.** Dips to ~72% at npl 32-64, recovers to 86% at
|
|
||||||
128. Both still climbing (weight-bound), neither plateaus by 128.
|
|
||||||
- **Small 0.6B, mid-to-high concurrency: 49-67%.** llama plateaus ~2.0k; vLLM scales to 4.2k.
|
|
||||||
Runtime/scheduler-bound regime -- vLLM's batching wins; this is llama's weakest ratio.
|
|
||||||
- **MoE 30B-A3B: llama-only.** vLLM cannot serve it on GB10 (bf16 reboots the box at MoE
|
|
||||||
warmup; GGUF expert tensors unmappable). llama serves it at 290 -> 1041 tok/s, scaling
|
|
||||||
cleanly with no npl-128 cliff.
|
|
||||||
|
|
||||||
Net: the single "75-80%" number is replaced by a curve. It is *roughly* right only for the
|
|
||||||
dense mid-band; it is too optimistic for the small model at high concurrency (49%) and moot for
|
|
||||||
MoE (where llama is the only option). The headline is parity at low concurrency and a hardware
|
|
||||||
(not engine) ceiling on dense decode.
|
|
||||||
@@ -2,30 +2,12 @@
|
|||||||
|
|
||||||
## Patches
|
## Patches
|
||||||
|
|
||||||
## Apply patches: the base `patches/` series, then the gated `patches/paged/`
|
## Apply patches from the `patches` directory
|
||||||
## series (default on; LLAMA_PAGED=off skips it). Only *.patch files are applied
|
|
||||||
## (docs/dirs like patches/paged/ and *.md are skipped). The Makefile `llama.cpp`
|
|
||||||
## target already `git apply`s these at checkout, so each apply is guarded by a
|
|
||||||
## sentinel and skipped when already present - re-applying git-format patches with
|
|
||||||
## `patch` fuzzily duplicates hunks (redefinition errors). This block only does
|
|
||||||
## real work if prepare.sh is run against an unpatched checkout.
|
|
||||||
if [ -d "patches" ]; then
|
if [ -d "patches" ]; then
|
||||||
for patch in patches/*.patch; do
|
for patch in $(ls patches); do
|
||||||
[ -e "$patch" ] || continue
|
|
||||||
echo "Applying patch $patch"
|
echo "Applying patch $patch"
|
||||||
patch -d llama.cpp/ -p1 -N -r - < "$patch" || true
|
patch -d llama.cpp/ -p1 < patches/$patch
|
||||||
done
|
done
|
||||||
if [ "${LLAMA_PAGED:-on}" != "off" ] && [ -d "patches/paged" ]; then
|
|
||||||
if [ -f llama.cpp/src/paged-kv-manager.cpp ]; then
|
|
||||||
echo "paged-attention patch series already applied (sentinel present) - skipping re-apply"
|
|
||||||
else
|
|
||||||
for patch in patches/paged/*.patch; do
|
|
||||||
[ -e "$patch" ] || continue
|
|
||||||
echo "Applying paged patch $patch"
|
|
||||||
patch -d llama.cpp/ -p1 -N -r - < "$patch" || true
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
# Local development: point at a working checkout instead of cloning, e.g.
|
# Local development: point at a working checkout instead of cloning, e.g.
|
||||||
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
|
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
|
||||||
|
|
||||||
PRIVACY_FILTER_VERSION?=646342f7a59c6b7d195185eac60bad762e572f1d
|
PRIVACY_FILTER_VERSION?=98f52c5ef2250f207cc6b9a6aef05393a120cb7c
|
||||||
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
|
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
|
||||||
PRIVACY_FILTER_SRC?=
|
PRIVACY_FILTER_SRC?=
|
||||||
|
|
||||||
|
|||||||
11
backend/go/ced/.gitignore
vendored
Normal file
11
backend/go/ced/.gitignore
vendored
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
.cache/
|
||||||
|
sources/
|
||||||
|
build/
|
||||||
|
package/
|
||||||
|
ced-grpc
|
||||||
|
# build artifacts staged in-tree by the Makefile (cp from sources/) or
|
||||||
|
# symlinked for local dev; the real sources live in ced.cpp upstream.
|
||||||
|
*.so
|
||||||
|
*.so.*
|
||||||
|
ced_capi.h
|
||||||
|
compile_commands.json
|
||||||
77
backend/go/ced/Makefile
Normal file
77
backend/go/ced/Makefile
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# ced sound-classification backend Makefile.
|
||||||
|
#
|
||||||
|
# Upstream pin lives below as CED_VERSION?=<sha> so .github/bump_deps.sh can find
|
||||||
|
# and update it (matches the parakeet-cpp / whisper.cpp convention).
|
||||||
|
#
|
||||||
|
# Local dev shortcut: symlink an out-of-tree ced.cpp shared build + header and
|
||||||
|
# skip the clone/cmake steps entirely:
|
||||||
|
# ln -sf /path/to/ced.cpp/build-shared/libced.so .
|
||||||
|
# ln -sf /path/to/ced.cpp/include/ced_capi.h .
|
||||||
|
# go build -o ced-grpc .
|
||||||
|
|
||||||
|
CED_VERSION?=c04ac14b7992d00584d9e812c9bb6268598a6ce7
|
||||||
|
CED_REPO?=https://github.com/mudler/ced.cpp
|
||||||
|
|
||||||
|
GOCMD?=go
|
||||||
|
GO_TAGS?=
|
||||||
|
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
|
||||||
|
|
||||||
|
BUILD_TYPE?=
|
||||||
|
NATIVE?=false
|
||||||
|
|
||||||
|
# Static-link ggml into libced.so (PIC) so the shared lib is self-contained:
|
||||||
|
# dlopen needs no libggml*.so alongside it, only system libs the runtime image
|
||||||
|
# already provides.
|
||||||
|
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DCED_SHARED=ON -DCED_BUILD_CLI=OFF -DCED_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
||||||
|
|
||||||
|
ifeq ($(NATIVE),false)
|
||||||
|
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||||
|
endif
|
||||||
|
|
||||||
|
# ced.cpp gates its ggml backends behind CED_GGML_* options (set(... CACHE BOOL
|
||||||
|
# "" FORCE)), so forward those instead of a bare -DGGML_CUDA=ON.
|
||||||
|
ifeq ($(BUILD_TYPE),cublas)
|
||||||
|
CMAKE_ARGS+=-DCED_GGML_CUDA=ON -DGGML_CUDA_GRAPHS=ON
|
||||||
|
else ifeq ($(BUILD_TYPE),openblas)
|
||||||
|
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||||
|
else ifeq ($(BUILD_TYPE),hipblas)
|
||||||
|
CMAKE_ARGS+=-DCED_GGML_HIP=ON
|
||||||
|
else ifeq ($(BUILD_TYPE),vulkan)
|
||||||
|
CMAKE_ARGS+=-DCED_GGML_VULKAN=ON
|
||||||
|
endif
|
||||||
|
|
||||||
|
.PHONY: ced-grpc package build clean purge test all
|
||||||
|
|
||||||
|
all: ced-grpc
|
||||||
|
|
||||||
|
sources/ced.cpp:
|
||||||
|
mkdir -p sources/ced.cpp
|
||||||
|
cd sources/ced.cpp && \
|
||||||
|
git init -q && \
|
||||||
|
git remote add origin $(CED_REPO) && \
|
||||||
|
git fetch --depth 1 origin $(CED_VERSION) && \
|
||||||
|
git checkout FETCH_HEAD && \
|
||||||
|
git submodule update --init --recursive --depth 1 --single-branch
|
||||||
|
|
||||||
|
libced.so: sources/ced.cpp
|
||||||
|
cmake -B sources/ced.cpp/build-shared -S sources/ced.cpp $(CMAKE_ARGS)
|
||||||
|
cmake --build sources/ced.cpp/build-shared --config Release -j$(JOBS)
|
||||||
|
cp -fv sources/ced.cpp/build-shared/libced.so* ./ 2>/dev/null || true
|
||||||
|
cp -fv sources/ced.cpp/include/ced_capi.h ./
|
||||||
|
|
||||||
|
ced-grpc: libced.so main.go goced.go
|
||||||
|
CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o ced-grpc .
|
||||||
|
|
||||||
|
package: ced-grpc
|
||||||
|
bash package.sh
|
||||||
|
|
||||||
|
build: package
|
||||||
|
|
||||||
|
test:
|
||||||
|
LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
|
||||||
|
|
||||||
|
clean: purge
|
||||||
|
rm -rf libced.so* ced_capi.h package ced-grpc
|
||||||
|
|
||||||
|
purge:
|
||||||
|
rm -rf sources/ced.cpp
|
||||||
130
backend/go/ced/goced.go
Normal file
130
backend/go/ced/goced.go
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
// Go side of the ced backend: purego bindings over ced_capi.h plus the gRPC
|
||||||
|
// SoundDetection implementation.
|
||||||
|
//
|
||||||
|
// SKETCH: the pb.SoundDetection* types come from backend.proto (regenerate with
|
||||||
|
// `make protogen-go`). The C side is single-threaded per ctx, so we guard the
|
||||||
|
// engine with engineMu; LocalAI also serializes via base.SingleThread.
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
"sync"
|
||||||
|
"unsafe"
|
||||||
|
|
||||||
|
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||||
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||||
|
)
|
||||||
|
|
||||||
|
// purego-bound entry points from libced.so. Names match ced_capi.h exactly.
|
||||||
|
var (
|
||||||
|
CppAbiVersion func() int32
|
||||||
|
CppLoad func(ggufPath string) uintptr
|
||||||
|
CppFree func(ctx uintptr)
|
||||||
|
CppLastError func(ctx uintptr) string
|
||||||
|
CppNumClasses func(ctx uintptr) int32
|
||||||
|
CppSampleRate func(ctx uintptr) int32
|
||||||
|
CppClassifyPathJSON func(ctx uintptr, wavPath string, topK int32) uintptr
|
||||||
|
CppClassifyPcmJSON func(ctx uintptr, pcm []float32, nSamples int32, sampleRate int32, topK int32) uintptr
|
||||||
|
CppFreeString func(s uintptr)
|
||||||
|
)
|
||||||
|
|
||||||
|
// cstr copies a malloc'd C string (returned as uintptr) into a Go string and
|
||||||
|
// frees the original via ced_capi_free_string. Empty/0 -> "".
|
||||||
|
func cstr(p uintptr) string {
|
||||||
|
if p == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
defer CppFreeString(p)
|
||||||
|
var b []byte
|
||||||
|
for i := 0; ; i++ {
|
||||||
|
ch := *(*byte)(unsafe.Pointer(p + uintptr(i))) //nolint:govet // #nosec G103 -- C-owned NUL-terminated string from libced (not Go-GC memory)
|
||||||
|
if ch == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
b = append(b, ch)
|
||||||
|
}
|
||||||
|
return string(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ced is the gRPC backend. One loaded CED model per instance.
|
||||||
|
type Ced struct {
|
||||||
|
base.Base
|
||||||
|
ctxPtr uintptr
|
||||||
|
engineMu sync.Mutex
|
||||||
|
}
|
||||||
|
|
||||||
|
// Load resolves the GGUF and opens the C-API context.
|
||||||
|
func (c *Ced) Load(opts *pb.ModelOptions) error {
|
||||||
|
if opts.ModelFile == "" {
|
||||||
|
return errors.New("ced: ModelFile is required")
|
||||||
|
}
|
||||||
|
ctx := CppLoad(opts.ModelFile)
|
||||||
|
if ctx == 0 {
|
||||||
|
return fmt.Errorf("ced: ced_capi_load failed for %q: %s", opts.ModelFile, CppLastError(0))
|
||||||
|
}
|
||||||
|
c.ctxPtr = ctx
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// jsonTag mirrors the ced_capi JSON tag objects.
|
||||||
|
type jsonTag struct {
|
||||||
|
Index int `json:"index"`
|
||||||
|
Score float32 `json:"score"`
|
||||||
|
Label string `json:"label"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// SoundDetection classifies the clip at req.Src and returns scored AudioSet tags.
|
||||||
|
func (c *Ced) SoundDetection(ctx context.Context, req *pb.SoundDetectionRequest) (*pb.SoundDetectionResponse, error) {
|
||||||
|
if c.ctxPtr == 0 {
|
||||||
|
return nil, errors.New("ced: model not loaded")
|
||||||
|
}
|
||||||
|
if req.GetSrc() == "" {
|
||||||
|
return nil, errors.New("ced: SoundDetectionRequest.src (audio path) is required")
|
||||||
|
}
|
||||||
|
topK := req.GetTopK()
|
||||||
|
if topK <= 0 {
|
||||||
|
topK = 10 // sensible default for a tagging response
|
||||||
|
}
|
||||||
|
|
||||||
|
c.engineMu.Lock()
|
||||||
|
out := cstr(CppClassifyPathJSON(c.ctxPtr, req.GetSrc(), topK))
|
||||||
|
lastErr := CppLastError(c.ctxPtr)
|
||||||
|
c.engineMu.Unlock()
|
||||||
|
|
||||||
|
if out == "" {
|
||||||
|
return nil, fmt.Errorf("ced: classification failed: %s", lastErr)
|
||||||
|
}
|
||||||
|
var tags []jsonTag
|
||||||
|
if err := json.Unmarshal([]byte(out), &tags); err != nil {
|
||||||
|
return nil, fmt.Errorf("ced: bad classifier JSON: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
thr := req.GetThreshold()
|
||||||
|
resp := &pb.SoundDetectionResponse{}
|
||||||
|
for _, t := range tags {
|
||||||
|
if t.Score < thr {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
resp.Detections = append(resp.Detections, &pb.SoundClass{
|
||||||
|
Label: t.Label, Score: t.Score, Index: int32(t.Index),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(resp.Detections, func(i, j int) bool {
|
||||||
|
return resp.Detections[i].Score > resp.Detections[j].Score
|
||||||
|
})
|
||||||
|
return resp, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *Ced) Free() error {
|
||||||
|
c.engineMu.Lock()
|
||||||
|
defer c.engineMu.Unlock()
|
||||||
|
if c.ctxPtr != 0 {
|
||||||
|
CppFree(c.ctxPtr)
|
||||||
|
c.ctxPtr = 0
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
59
backend/go/ced/main.go
Normal file
59
backend/go/ced/main.go
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
// ced sound-classification backend. Started internally by LocalAI: one gRPC
|
||||||
|
// server per loaded model. Loads libced.so via purego and registers the flat
|
||||||
|
// C-API declared in ced_capi.h. The library name can be overridden with
|
||||||
|
// CED_LIBRARY (mirrors PARAKEET_LIBRARY / WHISPER_LIBRARY); the default looks
|
||||||
|
// for the .so next to this binary.
|
||||||
|
//
|
||||||
|
// SKETCH: requires `make protogen-go` after the backend.proto SoundDetection
|
||||||
|
// addition, and a built libced.so (see Makefile). See DESIGN.md.
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/ebitengine/purego"
|
||||||
|
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||||
|
)
|
||||||
|
|
||||||
|
var addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||||
|
|
||||||
|
type libFunc struct {
|
||||||
|
ptr any
|
||||||
|
name string
|
||||||
|
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
libName := os.Getenv("CED_LIBRARY")
|
||||||
|
if libName == "" {
|
||||||
|
libName = "libced.so"
|
||||||
|
}
|
||||||
|
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Errorf("ced: dlopen %q: %w", libName, err))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bound 1:1 to ced_capi.h. char*-returning functions are declared uintptr
|
||||||
|
// so we can free the same pointer with ced_capi_free_string after copying
|
||||||
|
// (purego's string return would copy and leak the original).
|
||||||
|
for _, lf := range []libFunc{
|
||||||
|
{&CppAbiVersion, "ced_capi_abi_version"},
|
||||||
|
{&CppLoad, "ced_capi_load"},
|
||||||
|
{&CppFree, "ced_capi_free"},
|
||||||
|
{&CppLastError, "ced_capi_last_error"},
|
||||||
|
{&CppNumClasses, "ced_capi_num_classes"},
|
||||||
|
{&CppSampleRate, "ced_capi_sample_rate"},
|
||||||
|
{&CppClassifyPathJSON, "ced_capi_classify_path_json"},
|
||||||
|
{&CppClassifyPcmJSON, "ced_capi_classify_pcm_json"},
|
||||||
|
{&CppFreeString, "ced_capi_free_string"},
|
||||||
|
} {
|
||||||
|
purego.RegisterLibFunc(lf.ptr, lib, lf.name)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(os.Stderr, "[ced] ABI=%d\n", CppAbiVersion())
|
||||||
|
flag.Parse()
|
||||||
|
if err := grpc.StartServer(*addr, &Ced{}); err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
60
backend/go/ced/package.sh
Executable file
60
backend/go/ced/package.sh
Executable file
@@ -0,0 +1,60 @@
|
|||||||
|
#!/bin/bash
#
# Bundle the ced-grpc binary, libced.so, the core runtime libs (libc/libstdc++/
# libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE so the package
# is self-contained. Mirrors backend/go/parakeet-cpp/package.sh; run.sh routes
# the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc is used.
#
# Output layout (relative to this script's directory):
#   package/ced-grpc, package/run.sh
#   package/lib/libced.so*, package/lib/ld.so, package/lib/<core libs>
#
# Environment: BUILD_TYPE is only echoed here; GPU library selection is done by
# scripts/build/package-gpu-libs.sh when that script exists.

set -e

CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."
pkg_lib_dir="$CURDIR/package/lib"

# Core runtime libraries copied next to the packaged dynamic loader, in the
# order they are copied.
core_libs=(
    libc.so.6
    libgcc_s.so.1
    libstdc++.so.6
    libm.so.6
    libgomp.so.1
    libdl.so.2
    librt.so.1
    libpthread.so.0
)

# copy_core_libs LOADER LIBDIR
# Copy the dynamic loader LOADER to package/lib/ld.so and every entry of
# core_libs from LIBDIR into package/lib/. Symlinks are dereferenced (-L) so
# the real files are packaged. Any failing cp aborts the script (set -e).
copy_core_libs() {
    local loader=$1 libdir=$2 lib
    cp -arfLv "$loader" "$pkg_lib_dir/ld.so"
    for lib in "${core_libs[@]}"; do
        cp -arfLv "$libdir/$lib" "$pkg_lib_dir/$lib"
    done
}

mkdir -p "$pkg_lib_dir"

cp -avf "$CURDIR/ced-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

# cp's own diagnostics are silenced on purpose; the message below is the
# actionable one.
cp -avf "$CURDIR"/libced.so* "$pkg_lib_dir/" 2>/dev/null || {
    echo "ERROR: libced.so not found in $CURDIR, run 'make' first" >&2
    exit 1
}

# The presence of the platform's dynamic loader identifies the architecture.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    copy_core_libs /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    copy_core_libs /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
elif [ "$(uname -s)" = "Darwin" ]; then
    # No loader or core libs are bundled on macOS.
    echo "Detected Darwin"
else
    # Diagnostics go to stderr, like the libced.so error above.
    echo "Error: Could not detect architecture" >&2
    exit 1
fi

GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    source "$GPU_LIB_SCRIPT" "$pkg_lib_dir"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$pkg_lib_dir/"
|
||||||
15
backend/go/ced/run.sh
Executable file
15
backend/go/ced/run.sh
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
# Launcher for the packaged ced-grpc backend. All arguments are forwarded to
# the binary unchanged.
set -e

CURDIR=$(dirname "$(realpath "$0")")

# Packaged libraries take precedence over anything already on the path.
export LD_LIBRARY_PATH="$CURDIR/lib:$CURDIR:${LD_LIBRARY_PATH:-}"

# Start from the plain invocation of the backend binary.
set -- "$CURDIR/ced-grpc" "$@"

# If a self-contained ld.so was packaged, route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the sibling backends).
if [ -f "$CURDIR/lib/ld.so" ]; then
    echo "Using lib/ld.so"
    set -- "$CURDIR/lib/ld.so" "$@"
fi

exec "$@"
|
||||||
@@ -67,7 +67,7 @@ sources/CrispASR:
|
|||||||
# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
|
# it, so ${CMAKE_SOURCE_DIR} is THIS backend dir and the talk-llama sources
|
||||||
# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
|
# aren't found. Rewrite to ${PROJECT_SOURCE_DIR} (the crispasr project root),
|
||||||
# which is correct both standalone and as a subproject. Idempotent.
|
# which is correct both standalone and as a subproject. Idempotent.
|
||||||
sed -i 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt
|
sed -i.bak 's#\$${CMAKE_SOURCE_DIR}/examples/talk-llama#\$${PROJECT_SOURCE_DIR}/examples/talk-llama#' sources/CrispASR/src/CMakeLists.txt && rm -f sources/CrispASR/src/CMakeLists.txt.bak
|
||||||
|
|
||||||
# Detect OS
|
# Detect OS
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
|
|||||||
@@ -47,6 +47,74 @@ extern "C" void set_abort(int v) {
|
|||||||
g_abort.store(v, std::memory_order_relaxed);
|
g_abort.store(v, std::memory_order_relaxed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- word-level timestamp accessors ---
|
||||||
|
extern "C" {
|
||||||
|
int crispasr_session_result_n_words(crispasr_session_result *r, int seg_i);
|
||||||
|
const char *crispasr_session_result_word_text(crispasr_session_result *r,
|
||||||
|
int seg_i, int word_i);
|
||||||
|
int64_t crispasr_session_result_word_t0(crispasr_session_result *r, int seg_i,
|
||||||
|
int word_i);
|
||||||
|
int64_t crispasr_session_result_word_t1(crispasr_session_result *r, int seg_i,
|
||||||
|
int word_i);
|
||||||
|
|
||||||
|
// Parakeet-specific word accessors
|
||||||
|
int crispasr_parakeet_result_n_words(void *r);
|
||||||
|
const char *crispasr_parakeet_result_word_text(void *r, int word_i);
|
||||||
|
int64_t crispasr_parakeet_result_word_t0(void *r, int word_i);
|
||||||
|
int64_t crispasr_parakeet_result_word_t1(void *r, int word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns g_result as an opaque pointer (null when no result is set).
void *get_result(void) {
  return g_result;
}
|
||||||
|
|
||||||
|
int get_word_count(int seg_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_session_result_n_words(g_result, seg_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *get_word_text(int seg_i, int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return "";
|
||||||
|
return crispasr_session_result_word_text(g_result, seg_i, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_word_t0(int seg_i, int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_session_result_word_t0(g_result, seg_i, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_word_t1(int seg_i, int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_session_result_word_t1(g_result, seg_i, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parakeet-specific word accessors
|
||||||
|
int get_parakeet_word_count(void) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_parakeet_result_n_words(g_result);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *get_parakeet_word_text(int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return "";
|
||||||
|
return crispasr_parakeet_result_word_text(g_result, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_parakeet_word_t0(int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_parakeet_result_word_t0(g_result, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
|
int64_t get_parakeet_word_t1(int word_i) {
|
||||||
|
if (!g_result)
|
||||||
|
return 0;
|
||||||
|
return crispasr_parakeet_result_word_t1(g_result, word_i);
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_log_cb(enum ggml_log_level level, const char *log,
|
static void ggml_log_cb(enum ggml_log_level level, const char *log,
|
||||||
void *data) {
|
void *data) {
|
||||||
const char *level_str;
|
const char *level_str;
|
||||||
|
|||||||
@@ -20,4 +20,18 @@ float *tts_synthesize(const char *text, int *out_n_samples); // 24kHz mono float
|
|||||||
void tts_free(float *pcm);
|
void tts_free(float *pcm);
|
||||||
int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
|
int tts_set_voice(const char *name); // best-effort speaker selection; 0 ok
|
||||||
int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
|
int tts_set_voice_file(const char *path, const char *ref_text); // load voice pack (.gguf) or zero-shot clone (.wav + ref_text)
|
||||||
|
|
||||||
|
// --- word-level timestamp accessors ---
|
||||||
|
// Session-based (works for whisper-like backends)
|
||||||
|
void *get_result(void);
|
||||||
|
int get_word_count(int seg_i);
|
||||||
|
const char *get_word_text(int seg_i, int word_i);
|
||||||
|
int64_t get_word_t0(int seg_i, int word_i);
|
||||||
|
int64_t get_word_t1(int seg_i, int word_i);
|
||||||
|
|
||||||
|
// Parakeet-specific (global word list, no segment index)
|
||||||
|
int get_parakeet_word_count(void);
|
||||||
|
const char *get_parakeet_word_text(int word_i);
|
||||||
|
int64_t get_parakeet_word_t0(int word_i);
|
||||||
|
int64_t get_parakeet_word_t1(int word_i);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,6 +34,18 @@ var (
|
|||||||
CppTTSFree func(ptr uintptr)
|
CppTTSFree func(ptr uintptr)
|
||||||
CppTTSSetVoice func(name string) int
|
CppTTSSetVoice func(name string) int
|
||||||
CppTTSSetVoiceFile func(path string, refText string) int
|
CppTTSSetVoiceFile func(path string, refText string) int
|
||||||
|
|
||||||
|
// Word-level timestamp accessors (session-based, per-segment)
|
||||||
|
CppGetWordCount func(segI int) int
|
||||||
|
CppGetWordText func(segI int, wordI int) string
|
||||||
|
CppGetWordT0 func(segI int, wordI int) int64
|
||||||
|
CppGetWordT1 func(segI int, wordI int) int64
|
||||||
|
|
||||||
|
// Parakeet-specific word accessors (global, no segment index)
|
||||||
|
CppGetParakeetWordCount func() int
|
||||||
|
CppGetParakeetWordText func(wordI int) string
|
||||||
|
CppGetParakeetWordT0 func(wordI int) int64
|
||||||
|
CppGetParakeetWordT1 func(wordI int) int64
|
||||||
)
|
)
|
||||||
|
|
||||||
type CrispASR struct {
|
type CrispASR struct {
|
||||||
@@ -212,6 +224,28 @@ func (w *CrispASR) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
|
|||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isValidWord reports whether a TranscriptWord contains recognisable speech
|
||||||
|
// content. The parakeet-specific word accessors can return stale initialisation
|
||||||
|
// data (model name, binary blobs) when a segment has no real speech. A word is
|
||||||
|
// considered valid only when:
|
||||||
|
// - the text is non-empty after trimming,
|
||||||
|
// - it contains no U+FFFD replacement characters (from binary data scrubbing),
|
||||||
|
// - both timestamps are non-negative,
|
||||||
|
// - the word has positive duration (end > start).
|
||||||
|
func isValidWord(w *pb.TranscriptWord) bool {
|
||||||
|
txt := strings.TrimSpace(w.Text)
|
||||||
|
if txt == "" {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if strings.ContainsRune(txt, '\uFFFD') {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if w.Start < 0 || w.End < 0 || w.End <= w.Start {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
|
func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
|
||||||
if err := ctx.Err(); err != nil {
|
if err := ctx.Err(); err != nil {
|
||||||
return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
|
return pb.TranscriptResult{}, status.Error(codes.Canceled, "transcription cancelled")
|
||||||
@@ -290,15 +324,54 @@ func (w *CrispASR) AudioTranscription(ctx context.Context, opts *pb.TranscriptRe
|
|||||||
// IDs, so Tokens is left empty.
|
// IDs, so Tokens is left empty.
|
||||||
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
||||||
|
|
||||||
|
// Populate word-level timestamps. Try session-based functions first
|
||||||
|
// (per-segment); fall back to parakeet-specific functions (global word
|
||||||
|
// list with no segment index — only populated on the first segment to
|
||||||
|
// avoid duplication).
|
||||||
|
words := []*pb.TranscriptWord{}
|
||||||
|
wordCount := CppGetWordCount(i)
|
||||||
|
if wordCount == 0 && i == 0 {
|
||||||
|
wordCount = CppGetParakeetWordCount()
|
||||||
|
for j := 0; j < wordCount; j++ {
|
||||||
|
w := &pb.TranscriptWord{
|
||||||
|
Start: CppGetParakeetWordT0(j) * (10000000),
|
||||||
|
End: CppGetParakeetWordT1(j) * (10000000),
|
||||||
|
Text: strings.ToValidUTF8(strings.Clone(CppGetParakeetWordText(j)), "<22>"),
|
||||||
|
}
|
||||||
|
if isValidWord(w) {
|
||||||
|
words = append(words, w)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for j := 0; j < wordCount; j++ {
|
||||||
|
w := &pb.TranscriptWord{
|
||||||
|
Start: CppGetWordT0(i, j) * (10000000),
|
||||||
|
End: CppGetWordT1(i, j) * (10000000),
|
||||||
|
Text: strings.ToValidUTF8(strings.Clone(CppGetWordText(i, j)), "<22>"),
|
||||||
|
}
|
||||||
|
if isValidWord(w) {
|
||||||
|
words = append(words, w)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip empty segments with no recognisable content (e.g. trailing
|
||||||
|
// silence segments that parakeet emits with stale init data).
|
||||||
|
trimmed := strings.TrimSpace(txt)
|
||||||
|
if trimmed == "" && len(words) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
segment := &pb.TranscriptSegment{
|
segment := &pb.TranscriptSegment{
|
||||||
Id: int32(i),
|
Id: int32(i),
|
||||||
Text: txt,
|
Text: txt,
|
||||||
Start: s, End: t,
|
Start: s, End: t,
|
||||||
|
Words: words,
|
||||||
}
|
}
|
||||||
|
|
||||||
segments = append(segments, segment)
|
segments = append(segments, segment)
|
||||||
|
|
||||||
text += " " + strings.TrimSpace(txt)
|
text += " " + trimmed
|
||||||
}
|
}
|
||||||
|
|
||||||
return pb.TranscriptResult{
|
return pb.TranscriptResult{
|
||||||
@@ -390,13 +463,20 @@ func (w *CrispASR) AudioTranscriptionStream(ctx context.Context, opts *pb.Transc
|
|||||||
s := CppGetSegmentStart(i) * 10000000
|
s := CppGetSegmentStart(i) * 10000000
|
||||||
t := CppGetSegmentEnd(i) * 10000000
|
t := CppGetSegmentEnd(i) * 10000000
|
||||||
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
txt := strings.ToValidUTF8(strings.Clone(CppGetSegmentText(i)), "<22>")
|
||||||
|
|
||||||
|
// Skip empty segments (e.g. trailing silence that parakeet emits
|
||||||
|
// with stale init data).
|
||||||
|
trimmed := strings.TrimSpace(txt)
|
||||||
|
if trimmed == "" && s == t {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
segments = append(segments, &pb.TranscriptSegment{
|
segments = append(segments, &pb.TranscriptSegment{
|
||||||
Id: int32(i),
|
Id: int32(i),
|
||||||
Text: txt,
|
Text: txt,
|
||||||
Start: s, End: t,
|
Start: s, End: t,
|
||||||
})
|
})
|
||||||
|
|
||||||
trimmed := strings.TrimSpace(txt)
|
|
||||||
if trimmed == "" {
|
if trimmed == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -44,6 +44,14 @@ func main() {
|
|||||||
{&CppTTSFree, "tts_free"},
|
{&CppTTSFree, "tts_free"},
|
||||||
{&CppTTSSetVoice, "tts_set_voice"},
|
{&CppTTSSetVoice, "tts_set_voice"},
|
||||||
{&CppTTSSetVoiceFile, "tts_set_voice_file"},
|
{&CppTTSSetVoiceFile, "tts_set_voice_file"},
|
||||||
|
{&CppGetWordCount, "get_word_count"},
|
||||||
|
{&CppGetWordText, "get_word_text"},
|
||||||
|
{&CppGetWordT0, "get_word_t0"},
|
||||||
|
{&CppGetWordT1, "get_word_t1"},
|
||||||
|
{&CppGetParakeetWordCount, "get_parakeet_word_count"},
|
||||||
|
{&CppGetParakeetWordText, "get_parakeet_word_text"},
|
||||||
|
{&CppGetParakeetWordT0, "get_parakeet_word_t0"},
|
||||||
|
{&CppGetParakeetWordT1, "get_parakeet_word_t1"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, lf := range libFuncs {
|
for _, lf := range libFuncs {
|
||||||
|
|||||||
@@ -8,11 +8,13 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# depth-anything.cpp. Pin to a specific commit for a stable build; a squash
|
# depth-anything.cpp. Pin to a specific commit for a stable build; a squash
|
||||||
# merge upstream can orphan a branch, so the native version is pinned by SHA.
|
# merge upstream can orphan a branch, so the native version is pinned by SHA.
|
||||||
# This SHA adds the nested two-file metric C-API (abi_version 4,
|
# This SHA adds the Depth Anything V2 engine + C-API routing (depth-only,
|
||||||
# da_capi_load_nested) required by the depth-anything-3-nested gallery model;
|
# relative + metric) on top of the nested two-file metric C-API (abi_version 4,
|
||||||
# tag it (e.g. v0.1.3) upstream to keep the SHA alive.
|
# da_capi_load_nested) required by the depth-anything-3-nested gallery model.
|
||||||
|
# It is kept alive by the upstream tag da2-support (survives a squash-merge);
|
||||||
|
# repoint to the master merge commit once mudler/depth-anything.cpp PR #1 lands.
|
||||||
DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
|
DEPTHANYTHING_REPO?=https://github.com/mudler/depth-anything.cpp.git
|
||||||
DEPTHANYTHING_VERSION?=cce5edc395fd1843806093d7ccc0c8b0d0b97b72
|
DEPTHANYTHING_VERSION?=f4e17dea695dd12ae76bea98ba58030996b98118
|
||||||
|
|
||||||
ifeq ($(NATIVE),false)
|
ifeq ($(NATIVE),false)
|
||||||
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||||
|
|||||||
18
backend/go/face-detect/.gitignore
vendored
Normal file
18
backend/go/face-detect/.gitignore
vendored
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# Fetched upstream sources
|
||||||
|
sources/
|
||||||
|
|
||||||
|
# CMake build directories
|
||||||
|
build*/
|
||||||
|
|
||||||
|
# build artifacts staged in-tree by the Makefile (cp from sources/) or
|
||||||
|
# symlinked for local dev; the real sources live in face-detect.cpp upstream.
|
||||||
|
*.so
|
||||||
|
*.so.*
|
||||||
|
facedetect_capi.h
|
||||||
|
compile_commands.json
|
||||||
|
|
||||||
|
# Compiled backend binary
|
||||||
|
face-detect-grpc
|
||||||
|
|
||||||
|
# Packaging output
|
||||||
|
package/
|
||||||
97
backend/go/face-detect/Makefile
Normal file
97
backend/go/face-detect/Makefile
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
# face-detect backend Makefile.
|
||||||
|
#
|
||||||
|
# Upstream pin lives below as FACEDETECT_VERSION?=be22d67... (.github/bump_deps.sh
|
||||||
|
# can find and update it - matches the voice-detect / parakeet.cpp / whisper.cpp
|
||||||
|
# convention).
|
||||||
|
#
|
||||||
|
# Local dev shortcut: if you already have an out-of-tree face-detect.cpp build,
|
||||||
|
# symlink the .so + header into this directory and skip the clone/cmake steps:
|
||||||
|
#
|
||||||
|
# ln -sf /path/to/face-detect.cpp/build-shared/libfacedetect.so .
|
||||||
|
# ln -sf /path/to/face-detect.cpp/include/facedetect_capi.h .
|
||||||
|
# go build -o face-detect-grpc .
|
||||||
|
#
|
||||||
|
# The default target below does the proper clone-at-pin + cmake build so CI does
|
||||||
|
# not need a side-checkout.
|
||||||
|
|
||||||
|
FACEDETECT_VERSION?=be22d67145a8bcd879f45ad33fbea03131c5922b
|
||||||
|
FACEDETECT_REPO?=https://github.com/mudler/face-detect.cpp
|
||||||
|
|
||||||
|
GOCMD?=go
|
||||||
|
GO_TAGS?=
|
||||||
|
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4)
|
||||||
|
|
||||||
|
BUILD_TYPE?=
|
||||||
|
NATIVE?=false
|
||||||
|
|
||||||
|
# Build ggml + the vendored libjpeg-turbo statically into libfacedetect.so (PIC)
|
||||||
|
# so the shared lib is self-contained: dlopen needs no libggml*.so alongside it,
|
||||||
|
# only system libs (libstdc++/libgomp/libc) the runtime image already provides.
|
||||||
|
# The vendored jpeg symbols are hidden via -Wl,--exclude-libs,ALL on the C++
|
||||||
|
# side, so only the facedetect_capi_* surface is exported.
|
||||||
|
CMAKE_ARGS?=-DCMAKE_BUILD_TYPE=Release -DFACEDETECT_SHARED=ON -DFACEDETECT_BUILD_CLI=OFF -DFACEDETECT_BUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
|
||||||
|
|
||||||
|
ifeq ($(NATIVE),false)
|
||||||
|
CMAKE_ARGS+=-DGGML_NATIVE=OFF
|
||||||
|
endif
|
||||||
|
|
||||||
|
# face-detect.cpp gates its GGML backends behind FACEDETECT_GGML_* options and
|
||||||
|
# does set(GGML_CUDA ${FACEDETECT_GGML_CUDA} CACHE BOOL "" FORCE), so a bare
|
||||||
|
# -DGGML_CUDA=ON is overwritten back to OFF. Forward the FACEDETECT_GGML_*
|
||||||
|
# options instead. (openblas is not gated, so -DGGML_BLAS passes through.)
|
||||||
|
ifeq ($(BUILD_TYPE),cublas)
|
||||||
|
CMAKE_ARGS+=-DFACEDETECT_GGML_CUDA=ON
|
||||||
|
else ifeq ($(BUILD_TYPE),openblas)
|
||||||
|
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||||
|
else ifeq ($(BUILD_TYPE),hipblas)
|
||||||
|
CMAKE_ARGS+=-DFACEDETECT_GGML_HIP=ON
|
||||||
|
else ifeq ($(BUILD_TYPE),vulkan)
|
||||||
|
CMAKE_ARGS+=-DFACEDETECT_GGML_VULKAN=ON
|
||||||
|
else ifeq ($(BUILD_TYPE),metal)
|
||||||
|
CMAKE_ARGS+=-DFACEDETECT_GGML_METAL=ON
|
||||||
|
endif
|
||||||
|
|
||||||
|
.PHONY: face-detect-grpc package build clean purge test all
|
||||||
|
|
||||||
|
all: face-detect-grpc
|
||||||
|
|
||||||
|
# Clone the upstream face-detect.cpp source at the pinned commit. Directory acts
# as the target so make only re-clones when missing. After a FACEDETECT_VERSION
# bump, run 'make purge && make' to refetch.
#
# Because the directory itself is the target, a half-finished clone would look
# up to date on the next run. The fetch therefore runs in a subshell and the
# directory is removed again if any step fails, so a later 'make' retries.
sources/face-detect.cpp:
	mkdir -p sources/face-detect.cpp
	( cd sources/face-detect.cpp && \
	  git init -q && \
	  git remote add origin $(FACEDETECT_REPO) && \
	  git fetch --depth 1 origin $(FACEDETECT_VERSION) && \
	  git checkout FETCH_HEAD && \
	  git submodule update --init --recursive --depth 1 --single-branch ) || \
	  { rm -rf sources/face-detect.cpp; exit 1; }
|
||||||
|
|
||||||
|
# Build the shared lib + header out-of-tree, then stage them next to the Go
|
||||||
|
# sources so purego.Dlopen("libfacedetect.so") and the cgo-less build both pick
|
||||||
|
# them up.
|
||||||
|
libfacedetect.so: sources/face-detect.cpp
|
||||||
|
cmake -B sources/face-detect.cpp/build-shared -S sources/face-detect.cpp $(CMAKE_ARGS)
|
||||||
|
cmake --build sources/face-detect.cpp/build-shared --config Release -j$(JOBS) --target facedetect
|
||||||
|
cp -fv sources/face-detect.cpp/build-shared/libfacedetect.so* ./ 2>/dev/null || true
|
||||||
|
cp -fv sources/face-detect.cpp/include/facedetect_capi.h ./
|
||||||
|
|
||||||
|
face-detect-grpc: libfacedetect.so main.go gofacedetect.go options.go
|
||||||
|
CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o face-detect-grpc .
|
||||||
|
|
||||||
|
package: face-detect-grpc
|
||||||
|
bash package.sh
|
||||||
|
|
||||||
|
build: package
|
||||||
|
|
||||||
|
# Test target. The embed/detect/verify/analyze smoke specs are gated on
|
||||||
|
# FACEDETECT_BACKEND_TEST_MODEL + FACEDETECT_BACKEND_TEST_IMAGE; without them the
|
||||||
|
# heavy specs auto-skip and only the pure-Go parsing specs run.
|
||||||
|
test:
|
||||||
|
LD_LIBRARY_PATH=$(CURDIR):$$LD_LIBRARY_PATH $(GOCMD) test ./... -count=1
|
||||||
|
|
||||||
|
clean: purge
|
||||||
|
rm -rf libfacedetect.so* facedetect_capi.h package face-detect-grpc
|
||||||
|
|
||||||
|
purge:
|
||||||
|
rm -rf sources/face-detect.cpp
|
||||||
431
backend/go/face-detect/gofacedetect.go
Normal file
431
backend/go/face-detect/gofacedetect.go
Normal file
@@ -0,0 +1,431 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/base64"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
"unsafe"
|
||||||
|
|
||||||
|
"github.com/mudler/LocalAI/pkg/grpc/base"
|
||||||
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||||
|
"github.com/mudler/xlog"
|
||||||
|
)
|
||||||
|
|
||||||
|
// purego-bound entry points from libfacedetect.so. Names match
|
||||||
|
// facedetect_capi.h exactly so a `nm libfacedetect.so | grep facedetect_capi`
|
||||||
|
// is enough to spot drift.
|
||||||
|
//
|
||||||
|
// The opaque ctx and the malloc'd char*/float* return values are declared as
|
||||||
|
// uintptr so we get the raw pointer back and can release it via the matching
|
||||||
|
// capi free function. purego's native string/[]float32 returns would copy and
|
||||||
|
// forget the original pointer, leaking the C-owned buffer on every call.
|
||||||
|
var (
|
||||||
|
CppAbiVersion func() int32
|
||||||
|
CppLoad func(ggufPath string) uintptr
|
||||||
|
CppFree func(ctx uintptr)
|
||||||
|
CppLastError func(ctx uintptr) string
|
||||||
|
CppFreeString func(s uintptr)
|
||||||
|
CppFreeVec func(v uintptr)
|
||||||
|
CppEmbedPath func(ctx uintptr, imagePath string, outVec, outDim unsafe.Pointer) int32
|
||||||
|
CppEmbedRGB func(ctx uintptr, rgb []byte, width, height int32, outVec, outDim unsafe.Pointer) int32
|
||||||
|
CppDetectJSON func(ctx uintptr, imagePath string) uintptr
|
||||||
|
CppVerifyPaths func(ctx uintptr, a, b string, threshold float32, antiSpoof int32, outDistance, outVerified unsafe.Pointer) int32
|
||||||
|
CppAnalyzeJSON func(ctx uintptr, imagePath string) uintptr
|
||||||
|
)
|
||||||
|
|
||||||
|
// FaceDetect implements the face-recognition (biometric) subset of the Backend
|
||||||
|
// gRPC service over libfacedetect.so. The C side keeps a single loaded model
|
||||||
|
// pack plus a per-ctx last-error buffer and is not reentrant, so
|
||||||
|
// base.SingleThread serializes every call.
|
||||||
|
type FaceDetect struct {
|
||||||
|
base.SingleThread
|
||||||
|
opts loadOptions
|
||||||
|
ctxPtr uintptr
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *FaceDetect) Load(opts *pb.ModelOptions) error {
|
||||||
|
model := opts.ModelFile
|
||||||
|
if model == "" {
|
||||||
|
model = opts.ModelPath
|
||||||
|
}
|
||||||
|
if !filepath.IsAbs(model) && opts.ModelPath != "" {
|
||||||
|
model = filepath.Join(opts.ModelPath, model)
|
||||||
|
}
|
||||||
|
if model == "" {
|
||||||
|
return errors.New("face-detect: ModelFile is required")
|
||||||
|
}
|
||||||
|
|
||||||
|
f.opts = parseOptions(opts.Options)
|
||||||
|
if f.opts.modelName == "" {
|
||||||
|
f.opts.modelName = filepath.Base(model)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Propagate LocalAI's per-model thread budget to the engine. LocalAI spawns
|
||||||
|
// one backend process per model and serves requests concurrently, so the
|
||||||
|
// engine's own min(hardware_concurrency, 8) default can oversubscribe cores.
|
||||||
|
// FACEDETECT_THREADS is read by the engine at backend construction, so it
|
||||||
|
// must be set before the capi load. A non-positive Threads means "unset":
|
||||||
|
// leave the env alone so the engine keeps its sane default.
|
||||||
|
threads := opts.Threads
|
||||||
|
if threads > 0 {
|
||||||
|
if err := os.Setenv("FACEDETECT_THREADS", strconv.Itoa(int(threads))); err != nil {
|
||||||
|
return fmt.Errorf("face-detect: set FACEDETECT_THREADS: %w", err)
|
||||||
|
}
|
||||||
|
xlog.Info("face-detect: applying LocalAI thread budget", "threads", threads)
|
||||||
|
}
|
||||||
|
|
||||||
|
xlog.Info("face-detect: loading model", "model", model,
|
||||||
|
"verify_threshold", f.opts.verifyThreshold, "abi", CppAbiVersion())
|
||||||
|
|
||||||
|
ctx := CppLoad(model)
|
||||||
|
if ctx == 0 {
|
||||||
|
// The last-error buffer lives on the ctx that was never returned, so
|
||||||
|
// surface the path the operator tried to load instead.
|
||||||
|
return fmt.Errorf("face-detect: facedetect_capi_load failed for %q", model)
|
||||||
|
}
|
||||||
|
f.ctxPtr = ctx
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Embeddings returns the L2-normalized ArcFace embedding of the primary face in
|
||||||
|
// the supplied image. Mirroring the Python face backend, the image is read from
|
||||||
|
// Images[0] as a base64 payload; materializeImage decodes it to a temp file so
|
||||||
|
// the path-based C-API can run its own decode (cv2.imread parity). The gRPC
|
||||||
|
// server wraps the returned slice in an EmbeddingResult.
|
||||||
|
func (f *FaceDetect) Embeddings(req *pb.PredictOptions) ([]float32, error) {
|
||||||
|
if f.ctxPtr == 0 {
|
||||||
|
return nil, errors.New("face-detect: model not loaded")
|
||||||
|
}
|
||||||
|
if len(req.Images) == 0 || req.Images[0] == "" {
|
||||||
|
return nil, errors.New("face-detect: Embedding requires Images[0] to be a base64 image")
|
||||||
|
}
|
||||||
|
|
||||||
|
path, cleanup, err := materializeImage(req.Images[0])
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
return f.embedPath(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *FaceDetect) embedPath(path string) ([]float32, error) {
|
||||||
|
var vec uintptr
|
||||||
|
var dim int32
|
||||||
|
rc := CppEmbedPath(f.ctxPtr, path, unsafe.Pointer(&vec), unsafe.Pointer(&dim))
|
||||||
|
if rc != 0 || vec == 0 || dim <= 0 {
|
||||||
|
return nil, f.lastErr("embed", path)
|
||||||
|
}
|
||||||
|
defer CppFreeVec(vec)
|
||||||
|
// Copy out of the C-owned malloc'd buffer before freeing it. The
|
||||||
|
// uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
|
||||||
|
// a C heap pointer from Go-managed memory; safe here, the GC neither tracks
|
||||||
|
// nor moves this buffer and we copy immediately.
|
||||||
|
src := unsafe.Slice((*float32)(unsafe.Pointer(vec)), int(dim)) //nolint:govet // C-owned malloc'd vector, copied out before free
|
||||||
|
out := make([]float32, int(dim))
|
||||||
|
copy(out, src)
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detect runs SCRFD over the image and returns one Detection per face. The
|
||||||
|
// C-API emits a box as [x1,y1,x2,y2] in pixels; the proto carries x/y plus
|
||||||
|
// width/height, so the corners are converted. The 5 facial landmarks the engine
|
||||||
|
// also returns are dropped: the Detection message has no field for them.
|
||||||
|
func (f *FaceDetect) Detect(req *pb.DetectOptions) (pb.DetectResponse, error) {
|
||||||
|
if f.ctxPtr == 0 {
|
||||||
|
return pb.DetectResponse{}, errors.New("face-detect: model not loaded")
|
||||||
|
}
|
||||||
|
if req.Src == "" {
|
||||||
|
return pb.DetectResponse{}, errors.New("face-detect: src image is required")
|
||||||
|
}
|
||||||
|
|
||||||
|
path, cleanup, err := materializeImage(req.Src)
|
||||||
|
if err != nil {
|
||||||
|
return pb.DetectResponse{}, err
|
||||||
|
}
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
faces, err := f.detectFaces(path)
|
||||||
|
if err != nil {
|
||||||
|
return pb.DetectResponse{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
dets := make([]*pb.Detection, 0, len(faces))
|
||||||
|
for _, fc := range faces {
|
||||||
|
if req.Threshold > 0 && fc.Score < req.Threshold {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
x, y, w, h := fc.xywh()
|
||||||
|
dets = append(dets, &pb.Detection{
|
||||||
|
X: x,
|
||||||
|
Y: y,
|
||||||
|
Width: w,
|
||||||
|
Height: h,
|
||||||
|
Confidence: fc.Score,
|
||||||
|
ClassName: "face",
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return pb.DetectResponse{Detections: dets}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// FaceVerify embeds the primary face in each image and reports whether they are
|
||||||
|
// the same identity by cosine distance against a threshold. A request threshold
|
||||||
|
// <= 0 falls back to the model-configured default (verify_threshold option,
|
||||||
|
// 0.35 if unset). When anti_spoofing is set, the C-API applies a MiniFASNet
|
||||||
|
// veto internally (verified forced false on a spoof); the per-image liveness
|
||||||
|
// scores are not exposed by the verify entry point, so img*_is_real /
|
||||||
|
// img*_antispoof_score stay at their zero values.
|
||||||
|
func (f *FaceDetect) FaceVerify(req *pb.FaceVerifyRequest) (pb.FaceVerifyResponse, error) {
|
||||||
|
if f.ctxPtr == 0 {
|
||||||
|
return pb.FaceVerifyResponse{}, errors.New("face-detect: model not loaded")
|
||||||
|
}
|
||||||
|
if req.Img1 == "" || req.Img2 == "" {
|
||||||
|
return pb.FaceVerifyResponse{}, errors.New("face-detect: img1 and img2 are required")
|
||||||
|
}
|
||||||
|
|
||||||
|
path1, cleanup1, err := materializeImage(req.Img1)
|
||||||
|
if err != nil {
|
||||||
|
return pb.FaceVerifyResponse{}, err
|
||||||
|
}
|
||||||
|
defer cleanup1()
|
||||||
|
path2, cleanup2, err := materializeImage(req.Img2)
|
||||||
|
if err != nil {
|
||||||
|
return pb.FaceVerifyResponse{}, err
|
||||||
|
}
|
||||||
|
defer cleanup2()
|
||||||
|
|
||||||
|
threshold := req.Threshold
|
||||||
|
if threshold <= 0 {
|
||||||
|
threshold = f.opts.verifyThreshold
|
||||||
|
}
|
||||||
|
|
||||||
|
antiSpoof := int32(0)
|
||||||
|
if req.AntiSpoofing {
|
||||||
|
antiSpoof = 1
|
||||||
|
}
|
||||||
|
|
||||||
|
started := time.Now()
|
||||||
|
var distance float32
|
||||||
|
var verified int32
|
||||||
|
rc := CppVerifyPaths(f.ctxPtr, path1, path2, threshold, antiSpoof,
|
||||||
|
unsafe.Pointer(&distance), unsafe.Pointer(&verified))
|
||||||
|
if rc != 0 {
|
||||||
|
return pb.FaceVerifyResponse{}, f.lastErr("verify", req.Img1[:min(8, len(req.Img1))]+"...")
|
||||||
|
}
|
||||||
|
elapsedMs := float32(time.Since(started).Seconds() * 1000.0)
|
||||||
|
|
||||||
|
// Confidence decays linearly from 100 at distance 0 to 0 at the threshold,
|
||||||
|
// matching the Python face backend's reporting.
|
||||||
|
confidence := float32(0)
|
||||||
|
if threshold > 0 {
|
||||||
|
confidence = float32(math.Max(0, math.Min(100, (1.0-float64(distance)/float64(threshold))*100.0)))
|
||||||
|
}
|
||||||
|
|
||||||
|
return pb.FaceVerifyResponse{
|
||||||
|
Verified: verified != 0,
|
||||||
|
Distance: distance,
|
||||||
|
Threshold: threshold,
|
||||||
|
Confidence: confidence,
|
||||||
|
Model: f.opts.modelName,
|
||||||
|
Img1Area: f.bestArea(path1),
|
||||||
|
Img2Area: f.bestArea(path2),
|
||||||
|
ProcessingTimeMs: elapsedMs,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// FaceAnalyze runs the genderage head on every detected face. The C-API returns
|
||||||
|
// "M"/"F" gender labels and a rounded age; the labels are normalized to the
|
||||||
|
// "Man"/"Woman" values the proto documents.
|
||||||
|
func (f *FaceDetect) FaceAnalyze(req *pb.FaceAnalyzeRequest) (pb.FaceAnalyzeResponse, error) {
|
||||||
|
if f.ctxPtr == 0 {
|
||||||
|
return pb.FaceAnalyzeResponse{}, errors.New("face-detect: model not loaded")
|
||||||
|
}
|
||||||
|
if req.Img == "" {
|
||||||
|
return pb.FaceAnalyzeResponse{}, errors.New("face-detect: img is required")
|
||||||
|
}
|
||||||
|
|
||||||
|
path, cleanup, err := materializeImage(req.Img)
|
||||||
|
if err != nil {
|
||||||
|
return pb.FaceAnalyzeResponse{}, err
|
||||||
|
}
|
||||||
|
defer cleanup()
|
||||||
|
|
||||||
|
ptr := CppAnalyzeJSON(f.ctxPtr, path)
|
||||||
|
if ptr == 0 {
|
||||||
|
return pb.FaceAnalyzeResponse{}, f.lastErr("analyze", path)
|
||||||
|
}
|
||||||
|
defer CppFreeString(ptr)
|
||||||
|
|
||||||
|
faces, err := parseAnalyzeJSON(goStringFromCPtr(ptr))
|
||||||
|
if err != nil {
|
||||||
|
return pb.FaceAnalyzeResponse{}, fmt.Errorf("face-detect: analyze JSON: %w", err)
|
||||||
|
}
|
||||||
|
return pb.FaceAnalyzeResponse{Faces: faces}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// faceBox mirrors a single face entry in the JSON documents produced by the
// engine's detect and analyze calls. Fields a document omits keep their zero
// values.
type faceBox struct {
	Score  float32   `json:"score"`
	Box    []float32 `json:"box"`
	Age    float32   `json:"age"`
	Gender string    `json:"gender"`
}

// xywh turns the engine's corner-form box ([x1,y1,x2,y2]) into the
// origin-plus-size form (x, y, width, height) used by the proto messages.
// When the box has fewer than four values, all four results are zero.
func (b faceBox) xywh() (x, y, w, h float32) {
	if len(b.Box) >= 4 {
		left, top, right, bottom := b.Box[0], b.Box[1], b.Box[2], b.Box[3]
		x, y, w, h = left, top, right-left, bottom-top
	}
	return x, y, w, h
}
|
||||||
|
|
||||||
|
type facesJSON struct {
|
||||||
|
Faces []faceBox `json:"faces"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f *FaceDetect) detectFaces(path string) ([]faceBox, error) {
|
||||||
|
ptr := CppDetectJSON(f.ctxPtr, path)
|
||||||
|
if ptr == 0 {
|
||||||
|
return nil, f.lastErr("detect", path)
|
||||||
|
}
|
||||||
|
defer CppFreeString(ptr)
|
||||||
|
|
||||||
|
var doc facesJSON
|
||||||
|
if err := json.Unmarshal([]byte(goStringFromCPtr(ptr)), &doc); err != nil {
|
||||||
|
return nil, fmt.Errorf("face-detect: detect JSON: %w", err)
|
||||||
|
}
|
||||||
|
return doc.Faces, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// bestArea returns the FacialArea of the highest-scoring face in an image, or an
|
||||||
|
// empty area when detection fails or finds nothing. Best-effort: verify already
|
||||||
|
// succeeded, so a missing region must not turn a valid match into an error.
|
||||||
|
func (f *FaceDetect) bestArea(path string) *pb.FacialArea {
|
||||||
|
faces, err := f.detectFaces(path)
|
||||||
|
if err != nil || len(faces) == 0 {
|
||||||
|
return &pb.FacialArea{}
|
||||||
|
}
|
||||||
|
best := faces[0]
|
||||||
|
for _, fc := range faces[1:] {
|
||||||
|
if fc.Score > best.Score {
|
||||||
|
best = fc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
x, y, w, h := best.xywh()
|
||||||
|
return &pb.FacialArea{X: x, Y: y, W: w, H: h}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseAnalyzeJSON maps the engine's analyze document onto FaceAnalysis entries.
|
||||||
|
// The engine reports gender as "M"/"F"; both the dominant label and the score
|
||||||
|
// map are filled with the "Man"/"Woman" form the proto documents.
|
||||||
|
func parseAnalyzeJSON(doc string) ([]*pb.FaceAnalysis, error) {
|
||||||
|
var parsed facesJSON
|
||||||
|
if err := json.Unmarshal([]byte(doc), &parsed); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([]*pb.FaceAnalysis, 0, len(parsed.Faces))
|
||||||
|
for _, fc := range parsed.Faces {
|
||||||
|
x, y, w, h := fc.xywh()
|
||||||
|
fa := &pb.FaceAnalysis{
|
||||||
|
Region: &pb.FacialArea{X: x, Y: y, W: w, H: h},
|
||||||
|
FaceConfidence: fc.Score,
|
||||||
|
Age: fc.Age,
|
||||||
|
}
|
||||||
|
if label := normalizeGender(fc.Gender); label != "" {
|
||||||
|
fa.DominantGender = label
|
||||||
|
fa.Gender = map[string]float32{label: 1.0}
|
||||||
|
}
|
||||||
|
out = append(out, fa)
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalizeGender translates the engine's gender code into the label the proto
// documents: "M" becomes "Man" and "F" becomes "Woman". Matching ignores case
// and surrounding whitespace. An empty or whitespace-only code yields "". Any
// other code is returned exactly as given (untrimmed).
func normalizeGender(g string) string {
	code := strings.ToUpper(strings.TrimSpace(g))
	if code == "" {
		return ""
	}
	if code == "M" {
		return "Man"
	}
	if code == "F" {
		return "Woman"
	}
	return g
}
|
||||||
|
|
||||||
|
// materializeImage decodes a base64 image payload into a temp file and returns
|
||||||
|
// its path plus a cleanup func. As a convenience for callers that already pass a
|
||||||
|
// filesystem path (e.g. a test fixture), an existing path is used as-is with a
|
||||||
|
// no-op cleanup. data: URI prefixes are stripped before decoding.
|
||||||
|
func materializeImage(src string) (path string, cleanup func(), err error) {
|
||||||
|
noop := func() {}
|
||||||
|
if src == "" {
|
||||||
|
return "", noop, errors.New("face-detect: empty image input")
|
||||||
|
}
|
||||||
|
if _, statErr := os.Stat(src); statErr == nil {
|
||||||
|
return src, noop, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
payload := src
|
||||||
|
if i := strings.Index(payload, ","); strings.HasPrefix(payload, "data:") && i >= 0 {
|
||||||
|
payload = payload[i+1:]
|
||||||
|
}
|
||||||
|
data, decErr := base64.StdEncoding.DecodeString(strings.TrimSpace(payload))
|
||||||
|
if decErr != nil || len(data) == 0 {
|
||||||
|
return "", noop, errors.New("face-detect: image is neither an existing path nor valid base64")
|
||||||
|
}
|
||||||
|
|
||||||
|
tmp, createErr := os.CreateTemp("", "face-detect-*.img")
|
||||||
|
if createErr != nil {
|
||||||
|
return "", noop, fmt.Errorf("face-detect: create temp image: %w", createErr)
|
||||||
|
}
|
||||||
|
cleanup = func() { _ = os.Remove(tmp.Name()) }
|
||||||
|
if _, wErr := tmp.Write(data); wErr != nil {
|
||||||
|
_ = tmp.Close()
|
||||||
|
cleanup()
|
||||||
|
return "", noop, fmt.Errorf("face-detect: write temp image: %w", wErr)
|
||||||
|
}
|
||||||
|
if cErr := tmp.Close(); cErr != nil {
|
||||||
|
cleanup()
|
||||||
|
return "", noop, fmt.Errorf("face-detect: close temp image: %w", cErr)
|
||||||
|
}
|
||||||
|
return tmp.Name(), cleanup, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// lastErr wraps the C-API's per-ctx last-error buffer into a Go error.
|
||||||
|
func (f *FaceDetect) lastErr(op, subject string) error {
|
||||||
|
msg := strings.TrimSpace(CppLastError(f.ctxPtr))
|
||||||
|
if msg == "" {
|
||||||
|
msg = "no error detail"
|
||||||
|
}
|
||||||
|
return fmt.Errorf("face-detect: %s failed for %q: %s", op, subject, msg)
|
||||||
|
}
|
||||||
|
|
||||||
|
// goStringFromCPtr copies a NUL-terminated C string into Go memory. cptr is a
|
||||||
|
// malloc'd buffer the caller owns; release it via CppFreeString after the copy.
|
||||||
|
//
|
||||||
|
// The uintptr->Pointer conversion trips vet's unsafeptr check, which can't tell
|
||||||
|
// a C heap pointer from Go-managed memory. Safe here: the GC neither tracks nor
|
||||||
|
// moves the buffer and we dereference it immediately to copy the bytes out.
|
||||||
|
func goStringFromCPtr(cptr uintptr) string {
|
||||||
|
if cptr == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
p := unsafe.Pointer(cptr) //nolint:govet // C-owned malloc'd buffer, not Go-GC memory (see doc above)
|
||||||
|
n := 0
|
||||||
|
for *(*byte)(unsafe.Add(p, n)) != 0 {
|
||||||
|
n++
|
||||||
|
}
|
||||||
|
return string(unsafe.Slice((*byte)(p), n))
|
||||||
|
}
|
||||||
230
backend/go/face-detect/gofacedetect_test.go
Normal file
230
backend/go/face-detect/gofacedetect_test.go
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/base64"
|
||||||
|
"os"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/ebitengine/purego"
|
||||||
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||||
|
. "github.com/onsi/ginkgo/v2"
|
||||||
|
. "github.com/onsi/gomega"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFaceDetect(t *testing.T) {
|
||||||
|
RegisterFailHandler(Fail)
|
||||||
|
RunSpecs(t, "face-detect Backend Suite")
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
libLoadOnce sync.Once
|
||||||
|
libLoadErr error
|
||||||
|
)
|
||||||
|
|
||||||
|
// ensureLibLoaded mirrors main.go's bootstrap so a Go test can drive the C-API
|
||||||
|
// bridge without spinning up the gRPC server. Records the error (the smoke
|
||||||
|
// specs skip themselves) when libfacedetect.so is not loadable from cwd
|
||||||
|
// (LD_LIBRARY_PATH or a symlink in ./).
|
||||||
|
func ensureLibLoaded() error {
|
||||||
|
libLoadOnce.Do(func() {
|
||||||
|
libName := os.Getenv("FACEDETECT_LIBRARY")
|
||||||
|
if libName == "" {
|
||||||
|
libName = "libfacedetect.so"
|
||||||
|
}
|
||||||
|
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
||||||
|
if err != nil {
|
||||||
|
libLoadErr = err
|
||||||
|
return
|
||||||
|
}
|
||||||
|
purego.RegisterLibFunc(&CppAbiVersion, lib, "facedetect_capi_abi_version")
|
||||||
|
purego.RegisterLibFunc(&CppLoad, lib, "facedetect_capi_load")
|
||||||
|
purego.RegisterLibFunc(&CppFree, lib, "facedetect_capi_free")
|
||||||
|
purego.RegisterLibFunc(&CppLastError, lib, "facedetect_capi_last_error")
|
||||||
|
purego.RegisterLibFunc(&CppFreeString, lib, "facedetect_capi_free_string")
|
||||||
|
purego.RegisterLibFunc(&CppFreeVec, lib, "facedetect_capi_free_vec")
|
||||||
|
purego.RegisterLibFunc(&CppEmbedPath, lib, "facedetect_capi_embed_path")
|
||||||
|
purego.RegisterLibFunc(&CppEmbedRGB, lib, "facedetect_capi_embed_rgb")
|
||||||
|
purego.RegisterLibFunc(&CppDetectJSON, lib, "facedetect_capi_detect_path_json")
|
||||||
|
purego.RegisterLibFunc(&CppVerifyPaths, lib, "facedetect_capi_verify_paths")
|
||||||
|
purego.RegisterLibFunc(&CppAnalyzeJSON, lib, "facedetect_capi_analyze_path_json")
|
||||||
|
})
|
||||||
|
return libLoadErr
|
||||||
|
}
|
||||||
|
|
||||||
|
var _ = Describe("parseOptions", func() {
|
||||||
|
It("defaults verify_threshold to 0.35", func() {
|
||||||
|
o := parseOptions(nil)
|
||||||
|
Expect(o.verifyThreshold).To(Equal(float32(0.35)))
|
||||||
|
Expect(o.modelName).To(Equal(""))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("parses verify_threshold, threshold alias and model_name", func() {
|
||||||
|
o := parseOptions([]string{"verify_threshold:0.4", "model_name:buffalo_l", "unknown:x"})
|
||||||
|
Expect(o.verifyThreshold).To(Equal(float32(0.4)))
|
||||||
|
Expect(o.modelName).To(Equal("buffalo_l"))
|
||||||
|
|
||||||
|
o2 := parseOptions([]string{"threshold:0.3"})
|
||||||
|
Expect(o2.verifyThreshold).To(Equal(float32(0.3)))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("ignores non-positive thresholds and keeps the default", func() {
|
||||||
|
o := parseOptions([]string{"verify_threshold:0", "threshold:-1"})
|
||||||
|
Expect(o.verifyThreshold).To(Equal(float32(0.35)))
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
var _ = Describe("normalizeGender", func() {
|
||||||
|
It("maps M/F codes to Man/Woman", func() {
|
||||||
|
Expect(normalizeGender("M")).To(Equal("Man"))
|
||||||
|
Expect(normalizeGender("f")).To(Equal("Woman"))
|
||||||
|
Expect(normalizeGender(" m ")).To(Equal("Man"))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("passes empty and unknown codes through", func() {
|
||||||
|
Expect(normalizeGender("")).To(Equal(""))
|
||||||
|
Expect(normalizeGender("nonbinary")).To(Equal("nonbinary"))
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
var _ = Describe("faceBox.xywh", func() {
|
||||||
|
It("converts an [x1,y1,x2,y2] box to x/y/width/height", func() {
|
||||||
|
b := faceBox{Box: []float32{10, 20, 50, 80}}
|
||||||
|
x, y, w, h := b.xywh()
|
||||||
|
Expect(x).To(Equal(float32(10)))
|
||||||
|
Expect(y).To(Equal(float32(20)))
|
||||||
|
Expect(w).To(Equal(float32(40)))
|
||||||
|
Expect(h).To(Equal(float32(60)))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("returns zeros for a short box", func() {
|
||||||
|
x, y, w, h := faceBox{Box: []float32{1, 2}}.xywh()
|
||||||
|
Expect([]float32{x, y, w, h}).To(Equal([]float32{0, 0, 0, 0}))
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
var _ = Describe("parseAnalyzeJSON", func() {
|
||||||
|
It("maps region, age and gender for each face", func() {
|
||||||
|
doc := `{"faces":[
|
||||||
|
{"score":0.997,"box":[10,20,50,80],"age":31,"gender":"M"},
|
||||||
|
{"score":0.81,"box":[0,0,40,40],"age":24,"gender":"F"}]}`
|
||||||
|
faces, err := parseAnalyzeJSON(doc)
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(faces).To(HaveLen(2))
|
||||||
|
|
||||||
|
Expect(faces[0].FaceConfidence).To(BeNumerically("~", 0.997, 1e-4))
|
||||||
|
Expect(faces[0].Age).To(BeNumerically("~", 31, 1e-4))
|
||||||
|
Expect(faces[0].DominantGender).To(Equal("Man"))
|
||||||
|
Expect(faces[0].Gender).To(HaveKeyWithValue("Man", float32(1.0)))
|
||||||
|
Expect(faces[0].Region.W).To(Equal(float32(40)))
|
||||||
|
Expect(faces[0].Region.H).To(Equal(float32(60)))
|
||||||
|
|
||||||
|
Expect(faces[1].DominantGender).To(Equal("Woman"))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("tolerates a missing gender field", func() {
|
||||||
|
faces, err := parseAnalyzeJSON(`{"faces":[{"score":0.5,"box":[0,0,10,10],"age":40}]}`)
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(faces).To(HaveLen(1))
|
||||||
|
Expect(faces[0].DominantGender).To(Equal(""))
|
||||||
|
Expect(faces[0].Gender).To(BeEmpty())
|
||||||
|
})
|
||||||
|
|
||||||
|
It("returns no faces for an empty document", func() {
|
||||||
|
faces, err := parseAnalyzeJSON(`{"faces":[]}`)
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(faces).To(BeEmpty())
|
||||||
|
})
|
||||||
|
|
||||||
|
It("returns an error on malformed JSON", func() {
|
||||||
|
_, err := parseAnalyzeJSON(`{not-json`)
|
||||||
|
Expect(err).To(HaveOccurred())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
var _ = Describe("materializeImage", func() {
|
||||||
|
It("decodes a base64 payload to a temp file", func() {
|
||||||
|
payload := base64.StdEncoding.EncodeToString([]byte("\xff\xd8\xff\xe0fake-jpeg"))
|
||||||
|
path, cleanup, err := materializeImage(payload)
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
defer cleanup()
|
||||||
|
data, rerr := os.ReadFile(path)
|
||||||
|
Expect(rerr).ToNot(HaveOccurred())
|
||||||
|
Expect(data).To(Equal([]byte("\xff\xd8\xff\xe0fake-jpeg")))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("strips a data: URI prefix before decoding", func() {
|
||||||
|
payload := "data:image/png;base64," + base64.StdEncoding.EncodeToString([]byte("hello"))
|
||||||
|
path, cleanup, err := materializeImage(payload)
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
defer cleanup()
|
||||||
|
data, rerr := os.ReadFile(path)
|
||||||
|
Expect(rerr).ToNot(HaveOccurred())
|
||||||
|
Expect(data).To(Equal([]byte("hello")))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("uses an existing path as-is", func() {
|
||||||
|
tmp, err := os.CreateTemp("", "face-detect-fixture-*.bin")
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
defer func() { _ = os.Remove(tmp.Name()) }()
|
||||||
|
Expect(tmp.Close()).To(Succeed())
|
||||||
|
|
||||||
|
path, cleanup, err := materializeImage(tmp.Name())
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
defer cleanup()
|
||||||
|
Expect(path).To(Equal(tmp.Name()))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("errors on input that is neither a path nor base64", func() {
|
||||||
|
_, _, err := materializeImage("not base64!!!")
|
||||||
|
Expect(err).To(HaveOccurred())
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
// The specs below exercise the real C-API end to end. They run only when both a
|
||||||
|
// model GGUF and a test image are provided, and skip cleanly otherwise so the
|
||||||
|
// suite stays green without large assets.
|
||||||
|
var _ = Describe("FaceDetect end-to-end", Ordered, func() {
|
||||||
|
var (
|
||||||
|
f *FaceDetect
|
||||||
|
modelPath = os.Getenv("FACEDETECT_BACKEND_TEST_MODEL")
|
||||||
|
imagePath = os.Getenv("FACEDETECT_BACKEND_TEST_IMAGE")
|
||||||
|
)
|
||||||
|
|
||||||
|
BeforeAll(func() {
|
||||||
|
if modelPath == "" || imagePath == "" {
|
||||||
|
Skip("set FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE to run the e2e specs")
|
||||||
|
}
|
||||||
|
if err := ensureLibLoaded(); err != nil {
|
||||||
|
Skip("libfacedetect.so not loadable: " + err.Error())
|
||||||
|
}
|
||||||
|
f = &FaceDetect{}
|
||||||
|
Expect(f.Load(&pb.ModelOptions{ModelFile: modelPath})).To(Succeed())
|
||||||
|
})
|
||||||
|
|
||||||
|
It("embeds the primary face in an image", func() {
|
||||||
|
emb, err := f.Embeddings(&pb.PredictOptions{Images: []string{imagePath}})
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(emb).ToNot(BeEmpty())
|
||||||
|
})
|
||||||
|
|
||||||
|
It("detects at least one face", func() {
|
||||||
|
resp, err := f.Detect(&pb.DetectOptions{Src: imagePath})
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(resp.Detections).ToNot(BeEmpty())
|
||||||
|
Expect(resp.Detections[0].ClassName).To(Equal("face"))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("verifies an image against itself as the same identity", func() {
|
||||||
|
resp, err := f.FaceVerify(&pb.FaceVerifyRequest{Img1: imagePath, Img2: imagePath})
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(resp.Verified).To(BeTrue())
|
||||||
|
Expect(resp.Distance).To(BeNumerically("<=", resp.Threshold))
|
||||||
|
})
|
||||||
|
|
||||||
|
It("analyzes age/gender for each face", func() {
|
||||||
|
resp, err := f.FaceAnalyze(&pb.FaceAnalyzeRequest{Img: imagePath})
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(resp.Faces).ToNot(BeEmpty())
|
||||||
|
})
|
||||||
|
})
|
||||||
65
backend/go/face-detect/main.go
Normal file
65
backend/go/face-detect/main.go
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
// Started internally by LocalAI - one gRPC server per loaded model.
|
||||||
|
//
|
||||||
|
// Loads libfacedetect.so via purego and registers the flat C-API entry points
|
||||||
|
// declared in facedetect_capi.h. The library name can be overridden with
|
||||||
|
// FACEDETECT_LIBRARY (mirrors the VOICEDETECT_LIBRARY / PARAKEET_LIBRARY
|
||||||
|
// convention in the sibling backends); the default looks for the .so next to
|
||||||
|
// this binary (resolved via LD_LIBRARY_PATH by run.sh).
|
||||||
|
import (
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/ebitengine/purego"
|
||||||
|
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
addr = flag.String("addr", "localhost:50051", "the address to connect to")
|
||||||
|
)
|
||||||
|
|
||||||
|
type LibFuncs struct {
|
||||||
|
FuncPtr any
|
||||||
|
Name string
|
||||||
|
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
libName := os.Getenv("FACEDETECT_LIBRARY")
|
||||||
|
if libName == "" {
|
||||||
|
libName = "libfacedetect.so"
|
||||||
|
}
|
||||||
|
|
||||||
|
lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Errorf("face-detect: dlopen %q: %w", libName, err))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bound 1:1 to facedetect_capi.h. char*/float* returns are registered as
|
||||||
|
// uintptr so the raw pointer can be freed via the matching capi free fn.
|
||||||
|
libFuncs := []LibFuncs{
|
||||||
|
{&CppAbiVersion, "facedetect_capi_abi_version"},
|
||||||
|
{&CppLoad, "facedetect_capi_load"},
|
||||||
|
{&CppFree, "facedetect_capi_free"},
|
||||||
|
{&CppLastError, "facedetect_capi_last_error"},
|
||||||
|
{&CppFreeString, "facedetect_capi_free_string"},
|
||||||
|
{&CppFreeVec, "facedetect_capi_free_vec"},
|
||||||
|
{&CppEmbedPath, "facedetect_capi_embed_path"},
|
||||||
|
{&CppEmbedRGB, "facedetect_capi_embed_rgb"},
|
||||||
|
{&CppDetectJSON, "facedetect_capi_detect_path_json"},
|
||||||
|
{&CppVerifyPaths, "facedetect_capi_verify_paths"},
|
||||||
|
{&CppAnalyzeJSON, "facedetect_capi_analyze_path_json"},
|
||||||
|
}
|
||||||
|
for _, lf := range libFuncs {
|
||||||
|
purego.RegisterLibFunc(lf.FuncPtr, lib, lf.Name)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(os.Stderr, "[face-detect] ABI=%d\n", CppAbiVersion())
|
||||||
|
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
if err := grpc.StartServer(*addr, &FaceDetect{}); err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
47
backend/go/face-detect/options.go
Normal file
47
backend/go/face-detect/options.go
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// defaultVerifyThreshold is the cosine-distance cutoff applied when neither the
// request nor the model options provide one. It matches the insightface
// buffalo_l ArcFace R50 default that the Python face backend ships with, so the
// two implementations agree on verdicts out of the box.
const defaultVerifyThreshold float32 = 0.35

// loadOptions holds the model-level options understood by face-detect.
type loadOptions struct {
	verifyThreshold float32 // cosine-distance cutoff used by verify
	modelName       string  // value of the model_name option; "" when not given
}

// splitOption breaks a "key:value" option at its first colon and trims
// whitespace around both halves. ok is false when the string has no colon.
func splitOption(o string) (key, value string, ok bool) {
	rawKey, rawValue, found := strings.Cut(o, ":")
	if !found {
		return "", "", false
	}
	return strings.TrimSpace(rawKey), strings.TrimSpace(rawValue), true
}

// parseOptions reads the backend "key:value" option slice.
//
// Recognised keys:
//   - verify_threshold (alias: threshold): accepted only when it parses as a
//     number greater than zero; otherwise the current value is kept.
//   - model_name: stored verbatim (after trimming).
//
// Entries without a colon and unknown keys are ignored. The threshold starts at
// defaultVerifyThreshold; model_name is left empty here when not given.
func parseOptions(opts []string) loadOptions {
	parsed := loadOptions{verifyThreshold: defaultVerifyThreshold}
	for _, entry := range opts {
		key, value, ok := splitOption(entry)
		if !ok {
			continue
		}
		if key == "model_name" {
			parsed.modelName = value
			continue
		}
		if key != "verify_threshold" && key != "threshold" {
			continue
		}
		num, err := strconv.ParseFloat(value, 32)
		if err == nil && num > 0 {
			parsed.verifyThreshold = float32(num)
		}
	}
	return parsed
}
|
||||||
68
backend/go/face-detect/package.sh
Normal file
68
backend/go/face-detect/package.sh
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
#!/bin/bash
#
# Bundle the face-detect-grpc binary, libfacedetect.so, the core runtime libs
# (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active BUILD_TYPE
# so the package is self-contained. Mirrors backend/go/voice-detect/package.sh;
# run.sh routes the (CGO_ENABLED=0) binary through lib/ld.so so the packaged libc
# is used instead of the host's.
#
# Environment:
#   BUILD_TYPE  only echoed here (defaulting to "cpu" in the message); the
#               sourced GPU packaging script is what acts on it.

set -e

CURDIR=$(dirname "$(realpath "$0")")
REPO_ROOT="${CURDIR}/../../.."

#######################################
# Copy the dynamic loader and the core runtime libraries into package/lib.
# Globals:   CURDIR (read)
# Arguments: $1 - path of the dynamic loader, stored as lib/ld.so
#            $2 - directory holding the runtime libraries
# Returns:   non-zero (aborting the script under set -e) if any copy fails
#######################################
copy_core_runtime() {
    local loader=$1
    local libdir=$2
    local name

    cp -arfLv "$loader" "$CURDIR/package/lib/ld.so"
    # One list for every architecture, so the two cannot drift apart.
    for name in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 \
        libgomp.so.1 libdl.so.2 librt.so.1 libpthread.so.0; do
        cp -arfLv "$libdir/$name" "$CURDIR/package/lib/$name"
    done
}

mkdir -p "$CURDIR/package/lib"

cp -avf "$CURDIR/face-detect-grpc" "$CURDIR/package/"
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"

# libfacedetect.so + any soname symlinks. purego.Dlopen resolves it via
# LD_LIBRARY_PATH, which run.sh points at lib/.
# cp's own stderr is silenced on purpose: the message below replaces it.
cp -avf "$CURDIR"/libfacedetect.so* "$CURDIR/package/lib/" 2>/dev/null || {
    echo "ERROR: libfacedetect.so not found in $CURDIR, run 'make' first" >&2
    exit 1
}

# Detect architecture (by which dynamic loader exists) and copy the core
# runtime libs libfacedetect.so links against, plus the matching loader as
# lib/ld.so.
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
    echo "Detected x86_64 architecture, copying x86_64 libraries..."
    copy_core_runtime /lib64/ld-linux-x86-64.so.2 /lib/x86_64-linux-gnu
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
    echo "Detected ARM64 architecture, copying ARM64 libraries..."
    copy_core_runtime /lib/ld-linux-aarch64.so.1 /lib/aarch64-linux-gnu
elif [ "$(uname -s)" = "Darwin" ]; then
    # No runtime libraries are bundled on Darwin.
    echo "Detected Darwin"
else
    # Fatal diagnostics go to stderr, like the missing-library error above.
    echo "Error: Could not detect architecture" >&2
    exit 1
fi

# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers) based on
# BUILD_TYPE so the backend can reach the GPU without the runtime base image
# shipping those drivers.
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
if [ -f "$GPU_LIB_SCRIPT" ]; then
    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
    # package_gpu_libs is defined by the sourced script.
    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
    package_gpu_libs
fi

echo "Packaging completed successfully"
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
||||||
16
backend/go/face-detect/run.sh
Normal file
16
backend/go/face-detect/run.sh
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
#
# Launch face-detect-grpc from the directory this script lives in, forwarding
# all arguments. lib/ and the script directory are put first on
# LD_LIBRARY_PATH.
set -e

here=$(dirname "$(realpath "$0")")

export LD_LIBRARY_PATH="$here/lib:$here:${LD_LIBRARY_PATH:-}"

# Without a packaged loader, run the binary directly.
if [ ! -f "$here/lib/ld.so" ]; then
    exec "$here/face-detect-grpc" "$@"
fi

# A self-contained ld.so was packaged: route through it so the packaged
# libc / libstdc++ are used instead of the host's (matches the voice-detect /
# whisper / parakeet backends' runtime layout).
echo "Using lib/ld.so"
exec "$here/lib/ld.so" "$here/face-detect-grpc" "$@"
|
||||||
15
backend/go/face-detect/test.sh
Normal file
15
backend/go/face-detect/test.sh
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
#
# Run the face-detect Go test suite from this script's directory, with that
# directory first on LD_LIBRARY_PATH for the test process.
set -e

here=$(dirname "$(realpath "$0")")
cd "$here"

echo "Running face-detect backend tests..."

# The pure-Go parsing specs always run. The embed/detect/verify/analyze smoke
# specs run only when a model + image are provided via
# FACEDETECT_BACKEND_TEST_MODEL and FACEDETECT_BACKEND_TEST_IMAGE; otherwise they
# auto-skip.
env LD_LIBRARY_PATH="$here:${LD_LIBRARY_PATH:-}" \
    go test -v -timeout 1200s .

echo "face-detect tests completed."
|
||||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
|||||||
|
|
||||||
# omnivoice.cpp version
|
# omnivoice.cpp version
|
||||||
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
|
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
|
||||||
OMNIVOICE_VERSION?=2603355a5dfacae5cfc33531d5d0933221843509
|
OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
|
||||||
SO_TARGET?=libgomnivoicecpp.so
|
SO_TARGET?=libgomnivoicecpp.so
|
||||||
|
|
||||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# parakeet-cpp backend Makefile.
|
# parakeet-cpp backend Makefile.
|
||||||
#
|
#
|
||||||
# Upstream pin lives below as PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
|
# Upstream pin lives below as PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
|
||||||
# (.github/bump_deps.sh) can find and update it - matches the
|
# (.github/bump_deps.sh) can find and update it - matches the
|
||||||
# whisper.cpp / ds4 / vibevoice-cpp convention.
|
# whisper.cpp / ds4 / vibevoice-cpp convention.
|
||||||
#
|
#
|
||||||
@@ -15,7 +15,7 @@
|
|||||||
# That's what the L0 smoke test uses. The default target below does the
|
# That's what the L0 smoke test uses. The default target below does the
|
||||||
# proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
|
# proper clone-at-pin + cmake build so CI doesn't need a side-checkout.
|
||||||
|
|
||||||
PARAKEET_VERSION?=92a5f0306be354c109150fe58ae4cc4f8a21ca45
|
PARAKEET_VERSION?=db755a78d39f789bb7d4e3935158a9e8105dbe36
|
||||||
PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
|
PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp
|
||||||
|
|
||||||
GOCMD?=go
|
GOCMD?=go
|
||||||
|
|||||||
@@ -1,23 +1,68 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
# L0 packaging stub: copy the binary, run.sh and libparakeet.so* into
|
# Bundle the parakeet-cpp-grpc binary, libparakeet.so, the core runtime
|
||||||
# package/. The full ldd walk (libc, libstdc++, libgomp, GPU runtimes,
|
# libs (libc/libstdc++/libgomp + ld.so) and the GPU runtime for the active
|
||||||
# arch detection) lands in L3, mirroring backend/go/whisper/package.sh.
|
# BUILD_TYPE so the package is self-contained. Mirrors
|
||||||
|
# backend/go/whisper/package.sh; run.sh routes the (CGO_ENABLED=0) binary
|
||||||
|
# through lib/ld.so so the packaged libc is used instead of the host's.
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
CURDIR=$(dirname "$(realpath "$0")")
|
CURDIR=$(dirname "$(realpath "$0")")
|
||||||
|
REPO_ROOT="${CURDIR}/../../.."
|
||||||
|
|
||||||
mkdir -p "$CURDIR/package/lib"
|
mkdir -p "$CURDIR/package/lib"
|
||||||
|
|
||||||
cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
|
cp -avf "$CURDIR/parakeet-cpp-grpc" "$CURDIR/package/"
|
||||||
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
|
cp -avf "$CURDIR/run.sh" "$CURDIR/package/"
|
||||||
|
|
||||||
# libparakeet.so + any soname symlinks (libparakeet.so.X, libparakeet.so.X.Y).
|
# libparakeet.so + any soname symlinks (libparakeet.so.X[.Y]). purego.Dlopen
|
||||||
|
# resolves it via LD_LIBRARY_PATH, which run.sh points at lib/.
|
||||||
cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
|
cp -avf "$CURDIR"/libparakeet.so* "$CURDIR/package/lib/" 2>/dev/null || {
|
||||||
echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
|
echo "ERROR: libparakeet.so not found in $CURDIR, run 'make' first" >&2
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "L0 package layout (full ldd walk lands in L3):"
|
# Detect architecture and copy the core runtime libs libparakeet.so links
|
||||||
|
# against, plus the matching dynamic loader as lib/ld.so.
|
||||||
|
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||||
|
echo "Detected x86_64 architecture, copying x86_64 libraries..."
|
||||||
|
cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
|
||||||
|
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
|
||||||
|
echo "Detected ARM64 architecture, copying ARM64 libraries..."
|
||||||
|
cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
|
||||||
|
elif [ "$(uname -s)" = "Darwin" ]; then
|
||||||
|
echo "Detected Darwin"
|
||||||
|
else
|
||||||
|
# Fatal diagnostic: send it to stderr, like the libparakeet.so check earlier
# in this script, so it is not mixed into the packaging progress output.
echo "Error: Could not detect architecture" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Package GPU libraries (CUDA/ROCm/Intel/Vulkan loader + ICDs + drivers)
|
||||||
|
# based on BUILD_TYPE so the backend can reach the GPU without the runtime
|
||||||
|
# base image shipping those drivers.
|
||||||
|
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
|
||||||
|
if [ -f "$GPU_LIB_SCRIPT" ]; then
|
||||||
|
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
|
||||||
|
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
|
||||||
|
package_gpu_libs
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Packaging completed successfully"
|
||||||
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
ls -liah "$CURDIR/package/" "$CURDIR/package/lib/"
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user