mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-24 16:51:44 -04:00
Compare commits
8 Commits
v4.2.1
...
feat/buun-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9787bee48b | ||
|
|
42754d33b9 | ||
|
|
7f2b7e4ace | ||
|
|
6233feb190 | ||
|
|
d6bf3a4969 | ||
|
|
b27d38a53d | ||
|
|
45756b19dc | ||
|
|
cd6079b2f3 |
@@ -35,19 +35,33 @@ All contributions must comply with LocalAI's licensing requirements:
|
|||||||
|
|
||||||
## Signed-off-by and Developer Certificate of Origin
|
## Signed-off-by and Developer Certificate of Origin
|
||||||
|
|
||||||
**AI agents MUST NOT add `Signed-off-by` tags.** Only humans can legally
|
Only humans can certify the Developer Certificate of Origin (DCO). AI
|
||||||
certify the Developer Certificate of Origin (DCO). The human submitter
|
agents MUST NOT invent or guess a human identity for `Signed-off-by` —
|
||||||
is responsible for:
|
doing so forges the DCO certification.
|
||||||
|
|
||||||
- Reviewing all AI-generated code
|
However, when a human operator explicitly directs the AI to commit on
|
||||||
|
their behalf, the AI is acting as a typing tool — no different from an
|
||||||
|
editor macro or `git commit -s`. In that case the AI SHOULD add
|
||||||
|
`Signed-off-by:` using the **configured `user.name` / `user.email`** of
|
||||||
|
the current git repository (i.e. the operator's own identity). The
|
||||||
|
resulting trailer is the operator's signature; they take responsibility
|
||||||
|
for it by reviewing and pushing the commit. The AI MUST NOT use any
|
||||||
|
other identity and MUST NOT add its own name to the sign-off.
|
||||||
|
|
||||||
|
When running `git commit`, prefer `git commit --signoff` (or `-s`) so
|
||||||
|
the trailer is emitted by git itself from the configured identity,
|
||||||
|
rather than hand-writing it in a heredoc — this guarantees the sign-off
|
||||||
|
matches whatever identity the operator is currently using.
|
||||||
|
|
||||||
|
The human submitter remains responsible for:
|
||||||
|
|
||||||
|
- Reviewing all AI-generated code before it's pushed or merged
|
||||||
- Ensuring compliance with licensing requirements
|
- Ensuring compliance with licensing requirements
|
||||||
- Adding their own `Signed-off-by` tag (when the project requires DCO)
|
|
||||||
to certify the contribution
|
|
||||||
- Taking full responsibility for the contribution
|
- Taking full responsibility for the contribution
|
||||||
|
|
||||||
AI agents MUST NOT add `Co-Authored-By` trailers for themselves either.
|
AI agents MUST NOT add `Co-Authored-By` trailers for themselves. A human
|
||||||
A human reviewer owns the contribution; the AI's involvement is recorded
|
reviewer owns the contribution; the AI's involvement is recorded via
|
||||||
via `Assisted-by` (see below).
|
`Assisted-by` (see below).
|
||||||
|
|
||||||
## Attribution
|
## Attribution
|
||||||
|
|
||||||
@@ -84,6 +98,12 @@ Assisted-by: Claude:claude-opus-4-7 golangci-lint
|
|||||||
Signed-off-by: Jane Developer <jane@example.com>
|
Signed-off-by: Jane Developer <jane@example.com>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The `Signed-off-by` line uses Jane's own identity because Jane is the
|
||||||
|
submitter operating the AI. If Jane asks Claude to create the commit via
|
||||||
|
`git commit -s`, git emits that exact trailer from Jane's configured
|
||||||
|
identity — no separate human step is needed beyond Jane reviewing the
|
||||||
|
diff before pushing.
|
||||||
|
|
||||||
## Scope and Responsibility
|
## Scope and Responsibility
|
||||||
|
|
||||||
Using an AI assistant does not reduce the contributor's responsibility.
|
Using an AI assistant does not reduce the contributor's responsibility.
|
||||||
|
|||||||
117
.github/workflows/backend.yml
vendored
117
.github/workflows/backend.yml
vendored
@@ -399,6 +399,19 @@ jobs:
|
|||||||
dockerfile: "./backend/Dockerfile.turboquant"
|
dockerfile: "./backend/Dockerfile.turboquant"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2404'
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "8"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-12-buun-llama-cpp'
|
||||||
|
runs-on: 'bigger-runner'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "buun-llama-cpp"
|
||||||
|
dockerfile: "./backend/Dockerfile.buun-llama-cpp"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
- build-type: 'cublas'
|
- build-type: 'cublas'
|
||||||
cuda-major-version: "12"
|
cuda-major-version: "12"
|
||||||
cuda-minor-version: "8"
|
cuda-minor-version: "8"
|
||||||
@@ -894,6 +907,19 @@ jobs:
|
|||||||
dockerfile: "./backend/Dockerfile.turboquant"
|
dockerfile: "./backend/Dockerfile.turboquant"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2404'
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-nvidia-cuda-13-buun-llama-cpp'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "buun-llama-cpp"
|
||||||
|
dockerfile: "./backend/Dockerfile.buun-llama-cpp"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
- build-type: 'cublas'
|
- build-type: 'cublas'
|
||||||
cuda-major-version: "13"
|
cuda-major-version: "13"
|
||||||
cuda-minor-version: "0"
|
cuda-minor-version: "0"
|
||||||
@@ -920,6 +946,19 @@ jobs:
|
|||||||
backend: "turboquant"
|
backend: "turboquant"
|
||||||
dockerfile: "./backend/Dockerfile.turboquant"
|
dockerfile: "./backend/Dockerfile.turboquant"
|
||||||
context: "./"
|
context: "./"
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "13"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-cuda-13-arm64-buun-llama-cpp'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
ubuntu-version: '2404'
|
||||||
|
backend: "buun-llama-cpp"
|
||||||
|
dockerfile: "./backend/Dockerfile.buun-llama-cpp"
|
||||||
|
context: "./"
|
||||||
- build-type: 'cublas'
|
- build-type: 'cublas'
|
||||||
cuda-major-version: "13"
|
cuda-major-version: "13"
|
||||||
cuda-minor-version: "0"
|
cuda-minor-version: "0"
|
||||||
@@ -1454,6 +1493,19 @@ jobs:
|
|||||||
dockerfile: "./backend/Dockerfile.turboquant"
|
dockerfile: "./backend/Dockerfile.turboquant"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2404'
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'hipblas'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-rocm-hipblas-buun-llama-cpp'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "rocm/dev-ubuntu-24.04:7.2.1"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "buun-llama-cpp"
|
||||||
|
dockerfile: "./backend/Dockerfile.buun-llama-cpp"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
- build-type: 'hipblas'
|
- build-type: 'hipblas'
|
||||||
cuda-major-version: ""
|
cuda-major-version: ""
|
||||||
cuda-minor-version: ""
|
cuda-minor-version: ""
|
||||||
@@ -1703,6 +1755,19 @@ jobs:
|
|||||||
dockerfile: "./backend/Dockerfile.turboquant"
|
dockerfile: "./backend/Dockerfile.turboquant"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2404'
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f32'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f32-buun-llama-cpp'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "buun-llama-cpp"
|
||||||
|
dockerfile: "./backend/Dockerfile.buun-llama-cpp"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
- build-type: 'sycl_f16'
|
- build-type: 'sycl_f16'
|
||||||
cuda-major-version: ""
|
cuda-major-version: ""
|
||||||
cuda-minor-version: ""
|
cuda-minor-version: ""
|
||||||
@@ -1729,6 +1794,19 @@ jobs:
|
|||||||
dockerfile: "./backend/Dockerfile.turboquant"
|
dockerfile: "./backend/Dockerfile.turboquant"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2404'
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'sycl_f16'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-intel-sycl-f16-buun-llama-cpp'
|
||||||
|
runs-on: 'ubuntu-latest'
|
||||||
|
base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "buun-llama-cpp"
|
||||||
|
dockerfile: "./backend/Dockerfile.buun-llama-cpp"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
- build-type: 'intel'
|
- build-type: 'intel'
|
||||||
cuda-major-version: ""
|
cuda-major-version: ""
|
||||||
cuda-minor-version: ""
|
cuda-minor-version: ""
|
||||||
@@ -2134,6 +2212,19 @@ jobs:
|
|||||||
dockerfile: "./backend/Dockerfile.turboquant"
|
dockerfile: "./backend/Dockerfile.turboquant"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2404'
|
ubuntu-version: '2404'
|
||||||
|
- build-type: ''
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64,linux/arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-cpu-buun-llama-cpp'
|
||||||
|
runs-on: 'bigger-runner'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "buun-llama-cpp"
|
||||||
|
dockerfile: "./backend/Dockerfile.buun-llama-cpp"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
- build-type: ''
|
- build-type: ''
|
||||||
cuda-major-version: ""
|
cuda-major-version: ""
|
||||||
cuda-minor-version: ""
|
cuda-minor-version: ""
|
||||||
@@ -2173,6 +2264,19 @@ jobs:
|
|||||||
dockerfile: "./backend/Dockerfile.turboquant"
|
dockerfile: "./backend/Dockerfile.turboquant"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2204'
|
ubuntu-version: '2204'
|
||||||
|
- build-type: 'cublas'
|
||||||
|
cuda-major-version: "12"
|
||||||
|
cuda-minor-version: "0"
|
||||||
|
platforms: 'linux/arm64'
|
||||||
|
skip-drivers: 'false'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-nvidia-l4t-arm64-buun-llama-cpp'
|
||||||
|
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||||
|
runs-on: 'ubuntu-24.04-arm'
|
||||||
|
backend: "buun-llama-cpp"
|
||||||
|
dockerfile: "./backend/Dockerfile.buun-llama-cpp"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2204'
|
||||||
- build-type: 'vulkan'
|
- build-type: 'vulkan'
|
||||||
cuda-major-version: ""
|
cuda-major-version: ""
|
||||||
cuda-minor-version: ""
|
cuda-minor-version: ""
|
||||||
@@ -2199,6 +2303,19 @@ jobs:
|
|||||||
dockerfile: "./backend/Dockerfile.turboquant"
|
dockerfile: "./backend/Dockerfile.turboquant"
|
||||||
context: "./"
|
context: "./"
|
||||||
ubuntu-version: '2404'
|
ubuntu-version: '2404'
|
||||||
|
- build-type: 'vulkan'
|
||||||
|
cuda-major-version: ""
|
||||||
|
cuda-minor-version: ""
|
||||||
|
platforms: 'linux/amd64,linux/arm64'
|
||||||
|
tag-latest: 'auto'
|
||||||
|
tag-suffix: '-gpu-vulkan-buun-llama-cpp'
|
||||||
|
runs-on: 'bigger-runner'
|
||||||
|
base-image: "ubuntu:24.04"
|
||||||
|
skip-drivers: 'false'
|
||||||
|
backend: "buun-llama-cpp"
|
||||||
|
dockerfile: "./backend/Dockerfile.buun-llama-cpp"
|
||||||
|
context: "./"
|
||||||
|
ubuntu-version: '2404'
|
||||||
# Stablediffusion-ggml
|
# Stablediffusion-ggml
|
||||||
- build-type: ''
|
- build-type: ''
|
||||||
cuda-major-version: ""
|
cuda-major-version: ""
|
||||||
|
|||||||
25
.github/workflows/test-extra.yml
vendored
25
.github/workflows/test-extra.yml
vendored
@@ -32,6 +32,7 @@ jobs:
|
|||||||
llama-cpp: ${{ steps.detect.outputs.llama-cpp }}
|
llama-cpp: ${{ steps.detect.outputs.llama-cpp }}
|
||||||
ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
|
ik-llama-cpp: ${{ steps.detect.outputs.ik-llama-cpp }}
|
||||||
turboquant: ${{ steps.detect.outputs.turboquant }}
|
turboquant: ${{ steps.detect.outputs.turboquant }}
|
||||||
|
buun-llama-cpp: ${{ steps.detect.outputs['buun-llama-cpp'] }}
|
||||||
vllm: ${{ steps.detect.outputs.vllm }}
|
vllm: ${{ steps.detect.outputs.vllm }}
|
||||||
sglang: ${{ steps.detect.outputs.sglang }}
|
sglang: ${{ steps.detect.outputs.sglang }}
|
||||||
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
|
acestep-cpp: ${{ steps.detect.outputs.acestep-cpp }}
|
||||||
@@ -613,6 +614,30 @@ jobs:
|
|||||||
- name: Build turboquant backend image and run gRPC e2e tests
|
- name: Build turboquant backend image and run gRPC e2e tests
|
||||||
run: |
|
run: |
|
||||||
make test-extra-backend-turboquant
|
make test-extra-backend-turboquant
|
||||||
|
tests-buun-llama-cpp-grpc:
|
||||||
|
needs: detect-changes
|
||||||
|
if: needs.detect-changes.outputs['buun-llama-cpp'] == 'true' || needs.detect-changes.outputs.run-all == 'true'
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
timeout-minutes: 90
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
uses: actions/checkout@v6
|
||||||
|
with:
|
||||||
|
submodules: true
|
||||||
|
- name: Setup Go
|
||||||
|
uses: actions/setup-go@v5
|
||||||
|
with:
|
||||||
|
go-version: '1.25.4'
|
||||||
|
# Exercises the buun-llama-cpp (fork-of-a-fork) backend with the
|
||||||
|
# fork-specific TurboQuant/TCQ KV-cache types. BACKEND_TEST_CACHE_TYPE_V
|
||||||
|
# is set to turbo3 so the test round-trips through the fork's KV
|
||||||
|
# allow-list — picking a stock llama.cpp type would only re-test the
|
||||||
|
# shared code path. DFlash speculative decoding is not exercised here
|
||||||
|
# because the one known public target/drafter pair (Qwen3.5-27B) is too
|
||||||
|
# large for CI.
|
||||||
|
- name: Build buun-llama-cpp backend image and run gRPC e2e tests
|
||||||
|
run: |
|
||||||
|
make test-extra-backend-buun-llama-cpp
|
||||||
# tests-vllm-grpc is currently disabled in CI.
|
# tests-vllm-grpc is currently disabled in CI.
|
||||||
#
|
#
|
||||||
# The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16
|
# The prebuilt vllm CPU wheel is compiled with AVX-512 VNNI/BF16
|
||||||
|
|||||||
23
Makefile
23
Makefile
@@ -1,5 +1,5 @@
|
|||||||
# Disable parallel execution for backend builds
|
# Disable parallel execution for backend builds
|
||||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/tinygrad backends/sherpa-onnx
|
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/buun-llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/tinygrad backends/sherpa-onnx
|
||||||
|
|
||||||
GOCMD=go
|
GOCMD=go
|
||||||
GOTEST=$(GOCMD) test
|
GOTEST=$(GOCMD) test
|
||||||
@@ -545,6 +545,19 @@ test-extra-backend-turboquant: docker-build-turboquant
|
|||||||
BACKEND_TEST_CACHE_TYPE_V=turbo3 \
|
BACKEND_TEST_CACHE_TYPE_V=turbo3 \
|
||||||
$(MAKE) test-extra-backend
|
$(MAKE) test-extra-backend
|
||||||
|
|
||||||
|
## buun-llama-cpp: exercises the fork-of-a-fork backend (spiritbuun/buun-llama-cpp)
|
||||||
|
## with the *TurboQuant/TCQ-specific* KV-cache types (turbo3 for V). Same rationale
|
||||||
|
## as turboquant above: picking a standard llama.cpp type would only re-test the
|
||||||
|
## shared code path. buun inherits turboquant's turbo2/turbo3/turbo4 and adds
|
||||||
|
## turbo2_tcq / turbo3_tcq on top. DFlash speculative decoding is not exercised
|
||||||
|
## here because no small DFlash drafter model exists (the known public pair is
|
||||||
|
## Qwen3.5-27B, ~54 GB).
|
||||||
|
test-extra-backend-buun-llama-cpp: docker-build-buun-llama-cpp
|
||||||
|
BACKEND_IMAGE=local-ai-backend:buun-llama-cpp \
|
||||||
|
BACKEND_TEST_CACHE_TYPE_K=q8_0 \
|
||||||
|
BACKEND_TEST_CACHE_TYPE_V=turbo3 \
|
||||||
|
$(MAKE) test-extra-backend
|
||||||
|
|
||||||
## Audio transcription wrapper for the llama-cpp backend.
|
## Audio transcription wrapper for the llama-cpp backend.
|
||||||
## Drives the new AudioTranscription / AudioTranscriptionStream RPCs against
|
## Drives the new AudioTranscription / AudioTranscriptionStream RPCs against
|
||||||
## ggml-org/Qwen3-ASR-0.6B-GGUF (a small ASR model that requires its mmproj
|
## ggml-org/Qwen3-ASR-0.6B-GGUF (a small ASR model that requires its mmproj
|
||||||
@@ -949,6 +962,11 @@ BACKEND_IK_LLAMA_CPP = ik-llama-cpp|ik-llama-cpp|.|false|false
|
|||||||
# turboquant is a llama.cpp fork with TurboQuant KV-cache quantization.
|
# turboquant is a llama.cpp fork with TurboQuant KV-cache quantization.
|
||||||
# Reuses backend/cpp/llama-cpp grpc-server sources via a thin wrapper Makefile.
|
# Reuses backend/cpp/llama-cpp grpc-server sources via a thin wrapper Makefile.
|
||||||
BACKEND_TURBOQUANT = turboquant|turboquant|.|false|false
|
BACKEND_TURBOQUANT = turboquant|turboquant|.|false|false
|
||||||
|
# buun-llama-cpp is a fork-of-a-fork (spiritbuun/buun-llama-cpp forks
|
||||||
|
# TheTom/llama-cpp-turboquant) that adds DFlash block-diffusion speculative
|
||||||
|
# decoding and extra TCQ KV-cache variants on top of TurboQuant. Same thin
|
||||||
|
# wrapper pattern as turboquant — reuses backend/cpp/llama-cpp grpc-server.
|
||||||
|
BACKEND_BUUN_LLAMA_CPP = buun-llama-cpp|buun-llama-cpp|.|false|false
|
||||||
|
|
||||||
# Golang backends
|
# Golang backends
|
||||||
BACKEND_PIPER = piper|golang|.|false|true
|
BACKEND_PIPER = piper|golang|.|false|true
|
||||||
@@ -1029,6 +1047,7 @@ endef
|
|||||||
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
|
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
|
||||||
$(eval $(call generate-docker-build-target,$(BACKEND_IK_LLAMA_CPP)))
|
$(eval $(call generate-docker-build-target,$(BACKEND_IK_LLAMA_CPP)))
|
||||||
$(eval $(call generate-docker-build-target,$(BACKEND_TURBOQUANT)))
|
$(eval $(call generate-docker-build-target,$(BACKEND_TURBOQUANT)))
|
||||||
|
$(eval $(call generate-docker-build-target,$(BACKEND_BUUN_LLAMA_CPP)))
|
||||||
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
|
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
|
||||||
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
|
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
|
||||||
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
|
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
|
||||||
@@ -1080,7 +1099,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX)))
|
|||||||
docker-save-%: backend-images
|
docker-save-%: backend-images
|
||||||
docker save local-ai-backend:$* -o backend-images/$*.tar
|
docker save local-ai-backend:$* -o backend-images/$*.tar
|
||||||
|
|
||||||
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx
|
docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-buun-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx
|
||||||
|
|
||||||
########################################################
|
########################################################
|
||||||
### Mock Backend for E2E Tests
|
### Mock Backend for E2E Tests
|
||||||
|
|||||||
290
backend/Dockerfile.buun-llama-cpp
Normal file
290
backend/Dockerfile.buun-llama-cpp
Normal file
@@ -0,0 +1,290 @@
|
|||||||
|
ARG BASE_IMAGE=ubuntu:24.04
|
||||||
|
ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
|
||||||
|
|
||||||
|
|
||||||
|
# The grpc target does one thing: it builds and installs GRPC. This is in its own layer so that it can be effectively cached by CI.
|
||||||
|
# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
|
||||||
|
FROM ${GRPC_BASE_IMAGE} AS grpc
|
||||||
|
|
||||||
|
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
|
||||||
|
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
|
||||||
|
ARG GRPC_VERSION=v1.65.0
|
||||||
|
ARG CMAKE_FROM_SOURCE=false
|
||||||
|
# CUDA Toolkit 13.x compatibility: CMake 3.31.9+ fixes toolchain detection/arch table issues
|
||||||
|
ARG CMAKE_VERSION=3.31.10
|
||||||
|
|
||||||
|
ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
|
||||||
|
|
||||||
|
WORKDIR /build
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
ca-certificates \
|
||||||
|
build-essential curl libssl-dev \
|
||||||
|
git wget && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install CMake (the version in 22.04 is too old)
|
||||||
|
RUN <<EOT bash
|
||||||
|
if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
|
||||||
|
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
|
||||||
|
else
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y \
|
||||||
|
cmake && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
fi
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
|
||||||
|
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
|
||||||
|
# and running make install in the target container
|
||||||
|
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
|
||||||
|
mkdir -p /build/grpc/cmake/build && \
|
||||||
|
cd /build/grpc/cmake/build && \
|
||||||
|
sed -i "216i\ TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
|
||||||
|
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
|
||||||
|
make && \
|
||||||
|
make install && \
|
||||||
|
rm -rf /build
|
||||||
|
|
||||||
|
FROM ${BASE_IMAGE} AS builder
|
||||||
|
ARG CMAKE_FROM_SOURCE=false
|
||||||
|
ARG CMAKE_VERSION=3.31.10
|
||||||
|
# We can target specific CUDA ARCHITECTURES like --build-arg CUDA_DOCKER_ARCH='75;86;89;120'
|
||||||
|
ARG CUDA_DOCKER_ARCH
|
||||||
|
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||||
|
ARG CMAKE_ARGS
|
||||||
|
ENV CMAKE_ARGS=${CMAKE_ARGS}
|
||||||
|
ARG BACKEND=rerankers
|
||||||
|
ARG BUILD_TYPE
|
||||||
|
ENV BUILD_TYPE=${BUILD_TYPE}
|
||||||
|
ARG CUDA_MAJOR_VERSION
|
||||||
|
ARG CUDA_MINOR_VERSION
|
||||||
|
ARG SKIP_DRIVERS=false
|
||||||
|
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
|
||||||
|
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
|
||||||
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
|
ARG TARGETARCH
|
||||||
|
ARG TARGETVARIANT
|
||||||
|
ARG GO_VERSION=1.25.4
|
||||||
|
ARG UBUNTU_VERSION=2404
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
ccache git \
|
||||||
|
ca-certificates \
|
||||||
|
make \
|
||||||
|
pkg-config libcurl4-openssl-dev \
|
||||||
|
curl unzip \
|
||||||
|
libssl-dev wget && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Cuda
|
||||||
|
ENV PATH=/usr/local/cuda/bin:${PATH}
|
||||||
|
|
||||||
|
# HipBLAS requirements
|
||||||
|
ENV PATH=/opt/rocm/bin:${PATH}
|
||||||
|
|
||||||
|
|
||||||
|
# Vulkan requirements
|
||||||
|
RUN <<EOT bash
|
||||||
|
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
software-properties-common pciutils wget gpg-agent && \
|
||||||
|
apt-get install -y libglm-dev cmake libxcb-dri3-0 libxcb-present0 libpciaccess0 \
|
||||||
|
libpng-dev libxcb-keysyms1-dev libxcb-dri3-dev libx11-dev g++ gcc \
|
||||||
|
libwayland-dev libxrandr-dev libxcb-randr0-dev libxcb-ewmh-dev \
|
||||||
|
git python-is-python3 bison libx11-xcb-dev liblz4-dev libzstd-dev \
|
||||||
|
ocaml-core ninja-build pkg-config libxml2-dev wayland-protocols python3-jsonschema \
|
||||||
|
clang-format qtbase5-dev qt6-base-dev libxcb-glx0-dev sudo xz-utils
|
||||||
|
if [ "amd64" = "$TARGETARCH" ]; then
|
||||||
|
wget "https://sdk.lunarg.com/sdk/download/1.4.335.0/linux/vulkansdk-linux-x86_64-1.4.335.0.tar.xz" && \
|
||||||
|
tar -xf vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||||
|
rm vulkansdk-linux-x86_64-1.4.335.0.tar.xz && \
|
||||||
|
mkdir -p /opt/vulkan-sdk && \
|
||||||
|
mv 1.4.335.0 /opt/vulkan-sdk/ && \
|
||||||
|
cd /opt/vulkan-sdk/1.4.335.0 && \
|
||||||
|
./vulkansdk --no-deps --maxjobs \
|
||||||
|
vulkan-loader \
|
||||||
|
vulkan-validationlayers \
|
||||||
|
vulkan-extensionlayer \
|
||||||
|
vulkan-tools \
|
||||||
|
shaderc && \
|
||||||
|
cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/bin/* /usr/bin/ && \
|
||||||
|
cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/lib/* /usr/lib/x86_64-linux-gnu/ && \
|
||||||
|
cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/include/* /usr/include/ && \
|
||||||
|
cp -rfv /opt/vulkan-sdk/1.4.335.0/x86_64/share/* /usr/share/ && \
|
||||||
|
rm -rf /opt/vulkan-sdk
|
||||||
|
fi
|
||||||
|
if [ "arm64" = "$TARGETARCH" ]; then
|
||||||
|
mkdir vulkan && cd vulkan && \
|
||||||
|
curl -L -o vulkan-sdk.tar.xz https://github.com/mudler/vulkan-sdk-arm/releases/download/1.4.335.0/vulkansdk-ubuntu-24.04-arm-1.4.335.0.tar.xz && \
|
||||||
|
tar -xvf vulkan-sdk.tar.xz && \
|
||||||
|
rm vulkan-sdk.tar.xz && \
|
||||||
|
cd 1.4.335.0 && \
|
||||||
|
cp -rfv aarch64/bin/* /usr/bin/ && \
|
||||||
|
cp -rfv aarch64/lib/* /usr/lib/aarch64-linux-gnu/ && \
|
||||||
|
cp -rfv aarch64/include/* /usr/include/ && \
|
||||||
|
cp -rfv aarch64/share/* /usr/share/ && \
|
||||||
|
cd ../.. && \
|
||||||
|
rm -rf vulkan
|
||||||
|
fi
|
||||||
|
ldconfig && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
fi
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# CuBLAS requirements
|
||||||
|
RUN <<EOT bash
|
||||||
|
if ( [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "l4t" ] ) && [ "${SKIP_DRIVERS}" = "false" ]; then
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
software-properties-common pciutils
|
||||||
|
if [ "amd64" = "$TARGETARCH" ]; then
|
||||||
|
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/x86_64/cuda-keyring_1.1-1_all.deb
|
||||||
|
fi
|
||||||
|
if [ "arm64" = "$TARGETARCH" ]; then
|
||||||
|
if [ "${CUDA_MAJOR_VERSION}" = "13" ]; then
|
||||||
|
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/sbsa/cuda-keyring_1.1-1_all.deb
|
||||||
|
else
|
||||||
|
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${UBUNTU_VERSION}/arm64/cuda-keyring_1.1-1_all.deb
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
dpkg -i cuda-keyring_1.1-1_all.deb && \
|
||||||
|
rm -f cuda-keyring_1.1-1_all.deb && \
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||||
|
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||||
|
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||||
|
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||||
|
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
|
||||||
|
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
|
||||||
|
if [ "${CUDA_MAJOR_VERSION}" = "13" ] && [ "arm64" = "$TARGETARCH" ]; then
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
libcufile-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcudnn9-cuda-${CUDA_MAJOR_VERSION} cuda-cupti-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libnvjitlink-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}
|
||||||
|
fi
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
fi
|
||||||
|
EOT
|
||||||
|
|
||||||
|
|
||||||
|
# https://github.com/NVIDIA/Isaac-GR00T/issues/343
|
||||||
|
RUN <<EOT bash
|
||||||
|
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
|
||||||
|
wget https://developer.download.nvidia.com/compute/cudss/0.6.0/local_installers/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
|
||||||
|
dpkg -i cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0_0.6.0-1_arm64.deb && \
|
||||||
|
cp /var/cudss-local-tegra-repo-ubuntu${UBUNTU_VERSION}-0.6.0/cudss-*-keyring.gpg /usr/share/keyrings/ && \
|
||||||
|
apt-get update && apt-get -y install cudss cudss-cuda-${CUDA_MAJOR_VERSION} && \
|
||||||
|
wget https://developer.download.nvidia.com/compute/nvpl/25.5/local_installers/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
|
||||||
|
dpkg -i nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5_1.0-1_arm64.deb && \
|
||||||
|
cp /var/nvpl-local-repo-ubuntu${UBUNTU_VERSION}-25.5/nvpl-*-keyring.gpg /usr/share/keyrings/ && \
|
||||||
|
apt-get update && apt-get install -y nvpl
|
||||||
|
fi
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# If we are building with clblas support, we need the libraries for the builds
|
||||||
|
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
libclblast-dev && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/* \
|
||||||
|
; fi
|
||||||
|
|
||||||
|
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
hipblas-dev \
|
||||||
|
rocblas-dev && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/* && \
|
||||||
|
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
|
||||||
|
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
|
||||||
|
ldconfig && \
|
||||||
|
# Log which GPU architectures have rocBLAS kernel support
|
||||||
|
echo "rocBLAS library data architectures:" && \
|
||||||
|
# The listing is grouped in braces so the `|| echo WARNING` fallback only covers the listing itself:
# without the grouping, a failure of apt-get/ldconfig earlier in the && chain would fall through to the
# warning and the step would still exit 0. The trailing `grep .` makes the pipeline fail when nothing
# was listed (a pipeline's status is that of its last command, and `sort -u` succeeds on empty input).
{ (ls /opt/rocm*/lib/rocblas/library/Kernels* 2>/dev/null || ls /opt/rocm*/lib64/rocblas/library/Kernels* 2>/dev/null) | grep -oP 'gfx[0-9a-z+-]+' | sort -u | grep . || \
|
||||||
|
echo "WARNING: No rocBLAS kernel data found"; } \
|
||||||
|
; fi
|
||||||
|
|
||||||
|
RUN echo "TARGETARCH: $TARGETARCH"
|
||||||
|
|
||||||
|
# We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below
|
||||||
|
# but that will also bring in a newer version of absl which stablediffusion cannot compile with. This version of protoc is only
|
||||||
|
# here so that we can generate the grpc code for the stablediffusion build
|
||||||
|
RUN <<EOT bash
|
||||||
|
if [ "amd64" = "$TARGETARCH" ]; then
|
||||||
|
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
|
||||||
|
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
|
||||||
|
rm protoc.zip
|
||||||
|
fi
|
||||||
|
if [ "arm64" = "$TARGETARCH" ]; then
|
||||||
|
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
|
||||||
|
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
|
||||||
|
rm protoc.zip
|
||||||
|
fi
|
||||||
|
EOT
|
||||||
|
|
||||||
|
# Install CMake (the version in 22.04 is too old)
|
||||||
|
RUN <<EOT bash
|
||||||
|
if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
|
||||||
|
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
|
||||||
|
else
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install -y \
|
||||||
|
cmake && \
|
||||||
|
apt-get clean && \
|
||||||
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
fi
|
||||||
|
EOT
|
||||||
|
|
||||||
|
COPY --from=grpc /opt/grpc /usr/local
|
||||||
|
|
||||||
|
|
||||||
|
COPY . /LocalAI
|
||||||
|
|
||||||
|
RUN <<'EOT' bash
|
||||||
|
set -euxo pipefail
|
||||||
|
|
||||||
|
if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
|
||||||
|
CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
|
||||||
|
export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
|
||||||
|
echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
|
||||||
|
rm -rf /LocalAI/backend/cpp/buun-llama-cpp-*-build
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd /LocalAI/backend/cpp/buun-llama-cpp
|
||||||
|
|
||||||
|
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
|
||||||
|
make buun-llama-cpp-fallback
|
||||||
|
make buun-llama-cpp-grpc
|
||||||
|
make buun-llama-cpp-rpc-server
|
||||||
|
else
|
||||||
|
make buun-llama-cpp-avx
|
||||||
|
make buun-llama-cpp-avx2
|
||||||
|
make buun-llama-cpp-avx512
|
||||||
|
make buun-llama-cpp-fallback
|
||||||
|
make buun-llama-cpp-grpc
|
||||||
|
make buun-llama-cpp-rpc-server
|
||||||
|
fi
|
||||||
|
EOT
|
||||||
|
|
||||||
|
|
||||||
|
# Copy libraries using a script to handle architecture differences
|
||||||
|
RUN make -BC /LocalAI/backend/cpp/buun-llama-cpp package
|
||||||
|
|
||||||
|
|
||||||
|
FROM scratch
|
||||||
|
|
||||||
|
|
||||||
|
# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
|
||||||
|
COPY --from=builder /LocalAI/backend/cpp/buun-llama-cpp/package/. ./
|
||||||
85
backend/cpp/buun-llama-cpp/Makefile
Normal file
85
backend/cpp/buun-llama-cpp/Makefile
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
|
||||||
|
# Pinned to the HEAD of master on https://github.com/spiritbuun/buun-llama-cpp.
|
||||||
|
# Auto-bumped nightly by .github/workflows/bump_deps.yaml.
|
||||||
|
BUUN_LLAMA_VERSION?=22464d0848b87c5d56b52fdf6af2e5da46bf803e
|
||||||
|
LLAMA_REPO?=https://github.com/spiritbuun/buun-llama-cpp
|
||||||
|
|
||||||
|
CMAKE_ARGS?=
|
||||||
|
BUILD_TYPE?=
|
||||||
|
NATIVE?=false
|
||||||
|
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
|
||||||
|
TARGET?=--target grpc-server
|
||||||
|
JOBS?=$(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 1)
|
||||||
|
ARCH?=$(shell uname -m)
|
||||||
|
|
||||||
|
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
|
||||||
|
LLAMA_CPP_DIR := $(CURRENT_MAKEFILE_DIR)/../llama-cpp
|
||||||
|
|
||||||
|
GREEN := \033[0;32m
|
||||||
|
RESET := \033[0m
|
||||||
|
|
||||||
|
# buun-llama-cpp is a llama.cpp fork-of-a-fork (spiritbuun/buun-llama-cpp forked
|
||||||
|
# TheTom/llama-cpp-turboquant, which itself forked ggml-org/llama.cpp). Rather
|
||||||
|
# than duplicating grpc-server.cpp / CMakeLists.txt / prepare.sh we reuse the
|
||||||
|
# ones in backend/cpp/llama-cpp, and only swap which repo+sha the fetch step
|
||||||
|
# pulls. Each flavor target copies ../llama-cpp into a sibling
|
||||||
|
# ../buun-llama-cpp-<flavor>-build directory, then invokes llama-cpp's own
|
||||||
|
# `llama.cpp` and `grpc-server` targets with LLAMA_REPO/LLAMA_VERSION overridden to point
|
||||||
|
# at the fork.
|
||||||
|
PATCHES_DIR := $(CURRENT_MAKEFILE_DIR)/patches
|
||||||
|
|
||||||
|
# Each flavor target:
|
||||||
|
# 1. copies backend/cpp/llama-cpp/ (grpc-server.cpp + prepare.sh + CMakeLists.txt + Makefile)
|
||||||
|
# into a sibling buun-llama-cpp-<flavor>-build directory;
|
||||||
|
# 2. clones the buun fork into buun-llama-cpp-<flavor>-build/llama.cpp via the
|
||||||
|
# copy's own `llama.cpp` target, overriding LLAMA_REPO/LLAMA_VERSION;
|
||||||
|
# 3. applies patches from backend/cpp/buun-llama-cpp/patches/ to the cloned
|
||||||
|
# fork sources (for backporting upstream commits the fork hasn't pulled);
|
||||||
|
# 4. runs the copy's `grpc-server` target, which produces the binary we copy
|
||||||
|
# up as buun-llama-cpp-<flavor>.
|
||||||
|
define buun-llama-cpp-build
|
||||||
|
rm -rf $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-$(1)-build
|
||||||
|
cp -rf $(LLAMA_CPP_DIR) $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-$(1)-build
|
||||||
|
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-$(1)-build purge
|
||||||
|
# Augment the copied grpc-server.cpp's KV-cache allow-list with the
|
||||||
|
# fork's turbo2/turbo3/turbo4/turbo2_tcq/turbo3_tcq types and wire up the
|
||||||
|
# DFlash-specific option handlers (tree_budget / draft_topk). We patch the
|
||||||
|
# *copy*, never the original under backend/cpp/llama-cpp/, so the stock
|
||||||
|
# llama-cpp build stays compiling against vanilla upstream.
|
||||||
|
bash $(CURRENT_MAKEFILE_DIR)/patch-grpc-server.sh $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-$(1)-build/grpc-server.cpp
|
||||||
|
$(info $(GREEN)I buun-llama-cpp build info:$(1)$(RESET))
|
||||||
|
LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(BUUN_LLAMA_VERSION) \
|
||||||
|
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-$(1)-build llama.cpp
|
||||||
|
bash $(CURRENT_MAKEFILE_DIR)/apply-patches.sh $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-$(1)-build/llama.cpp $(PATCHES_DIR)
|
||||||
|
CMAKE_ARGS="$(CMAKE_ARGS) $(2)" TARGET="$(3)" \
|
||||||
|
LLAMA_REPO=$(LLAMA_REPO) LLAMA_VERSION=$(BUUN_LLAMA_VERSION) \
|
||||||
|
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-$(1)-build grpc-server
|
||||||
|
cp -rfv $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-$(1)-build/grpc-server buun-llama-cpp-$(1)
|
||||||
|
endef
|
||||||
|
|
||||||
|
buun-llama-cpp-avx2:
|
||||||
|
$(call buun-llama-cpp-build,avx2,-DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on,--target grpc-server)
|
||||||
|
|
||||||
|
buun-llama-cpp-avx512:
|
||||||
|
$(call buun-llama-cpp-build,avx512,-DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on,--target grpc-server)
|
||||||
|
|
||||||
|
buun-llama-cpp-avx:
|
||||||
|
$(call buun-llama-cpp-build,avx,-DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
|
||||||
|
|
||||||
|
buun-llama-cpp-fallback:
|
||||||
|
$(call buun-llama-cpp-build,fallback,-DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server)
|
||||||
|
|
||||||
|
buun-llama-cpp-grpc:
|
||||||
|
$(call buun-llama-cpp-build,grpc,-DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off,--target grpc-server --target rpc-server)
|
||||||
|
|
||||||
|
buun-llama-cpp-rpc-server: buun-llama-cpp-grpc
|
||||||
|
cp -rf $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server buun-llama-cpp-rpc-server
|
||||||
|
|
||||||
|
package:
|
||||||
|
bash package.sh
|
||||||
|
|
||||||
|
purge:
|
||||||
|
rm -rf $(CURRENT_MAKEFILE_DIR)/../buun-llama-cpp-*-build
|
||||||
|
rm -rf buun-llama-cpp-* package
|
||||||
|
|
||||||
|
clean: purge
|
||||||
50
backend/cpp/buun-llama-cpp/apply-patches.sh
Executable file
50
backend/cpp/buun-llama-cpp/apply-patches.sh
Executable file
@@ -0,0 +1,50 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Apply the buun-llama-cpp patch series to a cloned buun-llama-cpp checkout.
|
||||||
|
#
|
||||||
|
# buun-llama-cpp is a fork-of-a-fork that branched off upstream llama.cpp
|
||||||
|
# before some API changes the shared backend/cpp/llama-cpp/grpc-server.cpp
|
||||||
|
# depends on. We carry those upstream commits as patch files under
|
||||||
|
# backend/cpp/buun-llama-cpp/patches/ and apply them here so the reused
|
||||||
|
# grpc-server source compiles against the fork unmodified.
|
||||||
|
#
|
||||||
|
# Drop the corresponding patch from patches/ whenever the fork catches up with
|
||||||
|
# upstream — the build will fail fast if a patch stops applying, which is the
|
||||||
|
# signal to retire it.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
if [[ $# -ne 2 ]]; then
|
||||||
|
echo "usage: $0 <llama.cpp-src-dir> <patches-dir>" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
SRC_DIR=$1
|
||||||
|
PATCHES_DIR=$2
|
||||||
|
|
||||||
|
if [[ ! -d "$SRC_DIR" ]]; then
|
||||||
|
echo "source dir does not exist: $SRC_DIR" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ! -d "$PATCHES_DIR" ]]; then
|
||||||
|
echo "no patches dir at $PATCHES_DIR, nothing to apply"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
shopt -s nullglob
|
||||||
|
# Collect the patch files using an absolute directory path. The script changes
# into the source dir before running `git apply`, so paths gathered from a
# relative <patches-dir> argument would no longer resolve after the cd.
patches=("$(cd "$PATCHES_DIR" && pwd)"/*.patch)
|
||||||
|
shopt -u nullglob
|
||||||
|
|
||||||
|
if [[ ${#patches[@]} -eq 0 ]]; then
|
||||||
|
echo "no .patch files in $PATCHES_DIR, nothing to apply"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
cd "$SRC_DIR"
|
||||||
|
|
||||||
|
for patch in "${patches[@]}"; do
|
||||||
|
echo "==> applying $patch"
|
||||||
|
git apply --verbose "$patch"
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "all buun-llama-cpp patches applied successfully"
|
||||||
57
backend/cpp/buun-llama-cpp/package.sh
Executable file
57
backend/cpp/buun-llama-cpp/package.sh
Executable file
@@ -0,0 +1,57 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Script to copy the appropriate libraries based on architecture
|
||||||
|
# This script is run by `make package` in the builder stage of the Dockerfile; the final stage only copies the resulting package/ directory
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
CURDIR=$(dirname "$(realpath $0)")
|
||||||
|
REPO_ROOT="${CURDIR}/../../.."
|
||||||
|
|
||||||
|
# Create lib directory
|
||||||
|
mkdir -p $CURDIR/package/lib
|
||||||
|
|
||||||
|
cp -avrf $CURDIR/buun-llama-cpp-* $CURDIR/package/
|
||||||
|
cp -rfv $CURDIR/run.sh $CURDIR/package/
|
||||||
|
|
||||||
|
# Detect architecture and copy appropriate libraries
|
||||||
|
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
|
||||||
|
# x86_64 architecture
|
||||||
|
echo "Detected x86_64 architecture, copying x86_64 libraries..."
|
||||||
|
cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
|
||||||
|
cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
|
||||||
|
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
|
||||||
|
# ARM64 architecture
|
||||||
|
echo "Detected ARM64 architecture, copying ARM64 libraries..."
|
||||||
|
cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
|
||||||
|
cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
|
||||||
|
else
|
||||||
|
echo "Error: Could not detect architecture"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Package GPU libraries based on BUILD_TYPE
|
||||||
|
GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
|
||||||
|
if [ -f "$GPU_LIB_SCRIPT" ]; then
|
||||||
|
echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
|
||||||
|
source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
|
||||||
|
package_gpu_libs
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Packaging completed successfully"
|
||||||
|
ls -liah $CURDIR/package/
|
||||||
|
ls -liah $CURDIR/package/lib/
|
||||||
162
backend/cpp/buun-llama-cpp/patch-grpc-server.sh
Executable file
162
backend/cpp/buun-llama-cpp/patch-grpc-server.sh
Executable file
@@ -0,0 +1,162 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Patch the shared backend/cpp/llama-cpp/grpc-server.cpp *copy* used by the
|
||||||
|
# buun-llama-cpp build to account for gaps between upstream and the fork (the three listed here, plus the logit_bias_eog argument drop documented inline further down):
|
||||||
|
#
|
||||||
|
# 1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the
|
||||||
|
# fork-specific `turbo2` / `turbo3` / `turbo4` cache types plus the buun
|
||||||
|
# additions `turbo2_tcq` / `turbo3_tcq`.
|
||||||
|
#
|
||||||
|
# 2. Wire up buun-exclusive speculative-decoding option handlers
|
||||||
|
# (tree_budget / draft_topk) alongside the existing spec_* handlers.
|
||||||
|
# These reference struct fields (common_params.speculative.tree_budget
|
||||||
|
# and .draft_topk) that only exist in buun's common/common.h — adding
|
||||||
|
# them to the shared backend/cpp/llama-cpp/grpc-server.cpp would break
|
||||||
|
# the stock llama-cpp build, so we inject them only into the buun copy.
|
||||||
|
#
|
||||||
|
# 3. Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962,
|
||||||
|
# server-side random per-instance marker) with the legacy "<__media__>"
|
||||||
|
# literal. The fork branched before that PR, so server-common.cpp has no
|
||||||
|
# get_media_marker symbol. The fork's mtmd_default_marker() still returns
|
||||||
|
# "<__media__>", and Go-side tooling falls back to that sentinel when the
|
||||||
|
# backend does not expose media_marker, so substituting the literal keeps
|
||||||
|
# behavior identical on the buun path.
|
||||||
|
#
|
||||||
|
# We patch the *copy* sitting in buun-llama-cpp-<flavor>-build/, never the
|
||||||
|
# original under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps
|
||||||
|
# compiling against vanilla upstream.
|
||||||
|
#
|
||||||
|
# Idempotent: skips each insertion if its marker is already present (so re-runs
|
||||||
|
# of the same build dir don't double-insert).
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
if [[ $# -ne 1 ]]; then
|
||||||
|
echo "usage: $0 <grpc-server.cpp>" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
SRC=$1
|
||||||
|
|
||||||
|
if [[ ! -f "$SRC" ]]; then
|
||||||
|
echo "grpc-server.cpp not found at $SRC" >&2
|
||||||
|
exit 2
|
||||||
|
fi
|
||||||
|
|
||||||
|
if grep -q 'GGML_TYPE_TURBO2_TCQ' "$SRC"; then
|
||||||
|
echo "==> $SRC already has buun cache types, skipping KV allow-list patch"
|
||||||
|
else
|
||||||
|
echo "==> patching $SRC to allow turbo2/turbo3/turbo4/turbo2_tcq/turbo3_tcq KV-cache types"
|
||||||
|
|
||||||
|
# Insert the five TURBO entries right after the first ` GGML_TYPE_Q5_1,`
|
||||||
|
# line (the kv_cache_types[] allow-list). Using awk because the builder
|
||||||
|
# image does not ship python3, and GNU sed's multi-line `a\` quoting is
|
||||||
|
# awkward.
|
||||||
|
awk '
|
||||||
|
/^ GGML_TYPE_Q5_1,$/ && !done {
|
||||||
|
print
|
||||||
|
print " // buun-llama-cpp fork extras — added by patch-grpc-server.sh"
|
||||||
|
print " GGML_TYPE_TURBO2_0,"
|
||||||
|
print " GGML_TYPE_TURBO3_0,"
|
||||||
|
print " GGML_TYPE_TURBO4_0,"
|
||||||
|
print " GGML_TYPE_TURBO2_TCQ,"
|
||||||
|
print " GGML_TYPE_TURBO3_TCQ,"
|
||||||
|
done = 1
|
||||||
|
next
|
||||||
|
}
|
||||||
|
{ print }
|
||||||
|
END {
|
||||||
|
if (!done) {
|
||||||
|
print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
' "$SRC" > "$SRC.tmp"
|
||||||
|
mv "$SRC.tmp" "$SRC"
|
||||||
|
|
||||||
|
echo "==> KV allow-list patch OK"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if grep -q 'optname, "tree_budget"' "$SRC"; then
|
||||||
|
echo "==> $SRC already has DFlash option handlers, skipping"
|
||||||
|
else
|
||||||
|
echo "==> patching $SRC to add tree_budget / draft_topk option handlers"
|
||||||
|
|
||||||
|
# Insert two new `else if` handlers between the inner close-brace of the
|
||||||
|
# `spec_p_split` block and the next `} else if (…spec_ngram_size_n…)` line.
|
||||||
|
# Upstream writes each `} else if` as a single physical line, so we don't
|
||||||
|
# emit an outer `}` ourselves — the existing next line provides both the
|
||||||
|
# close of our `draft_topk` block and the open of `spec_ngram_size_n`.
|
||||||
|
# Anchor on the exact 3-line body of spec_p_split so we can't drift.
|
||||||
|
awk '
|
||||||
|
prev2 == " } else if (!strcmp(optname, \"spec_p_split\")) {" &&
|
||||||
|
prev1 ~ /^ +if \(optval != NULL\) \{$/ &&
|
||||||
|
$0 ~ /^ +try \{ params\.speculative\.p_split = std::stof\(optval_str\); \} catch \(\.\.\.\) \{\}$/ &&
|
||||||
|
!done {
|
||||||
|
print # print the try-line itself
|
||||||
|
getline inner_close # read " }" closing the inner if
|
||||||
|
print inner_close # print it — this closes spec_p_split body
|
||||||
|
print " // buun-llama-cpp DFlash options — added by patch-grpc-server.sh"
|
||||||
|
print " } else if (!strcmp(optname, \"tree_budget\")) {"
|
||||||
|
print " if (optval != NULL) {"
|
||||||
|
print " try { params.speculative.tree_budget = std::stoi(optval_str); } catch (...) {}"
|
||||||
|
print " }"
|
||||||
|
print " } else if (!strcmp(optname, \"draft_topk\")) {"
|
||||||
|
print " if (optval != NULL) {"
|
||||||
|
print " try { params.speculative.draft_topk = std::stoi(optval_str); } catch (...) {}"
|
||||||
|
print " }"
|
||||||
|
# The next source line (`} else if (…spec_ngram_size_n…) {`) closes
|
||||||
|
# our draft_topk block and continues the chain naturally; fall back
|
||||||
|
# into the main loop to emit it and everything after.
|
||||||
|
done = 1
|
||||||
|
prev2 = prev1
|
||||||
|
prev1 = inner_close
|
||||||
|
next
|
||||||
|
}
|
||||||
|
{ print; prev2 = prev1; prev1 = $0 }
|
||||||
|
END {
|
||||||
|
if (!done) {
|
||||||
|
print "patch-grpc-server.sh: spec_p_split anchor not found" > "/dev/stderr"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
' "$SRC" > "$SRC.tmp"
|
||||||
|
mv "$SRC.tmp" "$SRC"
|
||||||
|
|
||||||
|
echo "==> DFlash option-handler patch OK"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if grep -qE 'ctx_server\.get_meta\(\)\.logit_bias_eog|params_base\.sampling\.logit_bias_eog,' "$SRC"; then
|
||||||
|
echo "==> patching $SRC to drop the logit_bias_eog arg from params_from_json_cmpl() callsites (buun still uses the pre-refactor 4-arg signature)"
|
||||||
|
# Upstream llama.cpp refactored params_from_json_cmpl to take a precomputed
|
||||||
|
# logit_bias_eog vector after buun's 2026-04-05 fork-point — simultaneously
|
||||||
|
# adding server_context_meta::logit_bias_eog as the supplier. Buun carries
|
||||||
|
# neither change: its params_from_json_cmpl is still 4-arg, and internally
|
||||||
|
# derives logit_bias_eog from the common_params it's passed. So we just
|
||||||
|
# delete the argument line entirely — the remaining 4 args match buun's
|
||||||
|
# signature and the resulting behavior matches upstream bit-for-bit
|
||||||
|
# (upstream's 5th arg is the same data buun derives internally).
|
||||||
|
#
|
||||||
|
# Guard is broad so this works whether the line has been run through this
|
||||||
|
# block before (leaving params_base.sampling.logit_bias_eog,) or not
|
||||||
|
# (leaving the original ctx_server.get_meta().logit_bias_eog,).
|
||||||
|
sed -E '/^[[:space:]]+(ctx_server\.get_meta\(\)\.logit_bias_eog|params_base\.sampling\.logit_bias_eog),$/d' "$SRC" > "$SRC.tmp"
|
||||||
|
mv "$SRC.tmp" "$SRC"
|
||||||
|
echo "==> logit_bias_eog arg drop OK"
|
||||||
|
else
|
||||||
|
echo "==> $SRC has no logit_bias_eog arg line, skipping"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if grep -q 'get_media_marker()' "$SRC"; then
|
||||||
|
echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal"
|
||||||
|
# Only one call site today (ModelMetadata), but replace all occurrences to
|
||||||
|
# stay robust if upstream adds more. Use a temp file to avoid relying on
|
||||||
|
# sed -i portability (the builder image uses GNU sed, but keeping this
|
||||||
|
# consistent with the awk block above).
|
||||||
|
sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$SRC.tmp"
|
||||||
|
mv "$SRC.tmp" "$SRC"
|
||||||
|
echo "==> get_media_marker() substitution OK"
|
||||||
|
else
|
||||||
|
echo "==> $SRC has no get_media_marker() call, skipping media-marker patch"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "==> all patches applied"
|
||||||
@@ -0,0 +1,46 @@
|
|||||||
|
Subject: [PATCH] ggml-cuda/fattn: provide atomicAdd(double*,double) shim for pre-sm_60
|
||||||
|
|
||||||
|
Buun's Q² calibration path in ggml_cuda_turbo_scale_q calls
|
||||||
|
atomicAdd(&d_q_channel_sq_fattn[threadIdx.x], (double)(val * val));
|
||||||
|
but native double atomicAdd is only available on compute capability 6.0
|
||||||
|
and newer. Compiling against a CUDA arch list that includes older
|
||||||
|
architectures (LocalAI's CUDA 12 Docker image builds for the full
|
||||||
|
published arch range) fails with:
|
||||||
|
|
||||||
|
fattn.cu(812): error: no instance of overloaded function "atomicAdd"
|
||||||
|
matches the argument list, argument types are: (double *, double)
|
||||||
|
|
||||||
|
Add the canonical CUDA-programming-guide shim at the top of fattn.cu so
|
||||||
|
pre-sm_60 codegen has a definition to call. On sm_60+ the native CUDA
|
||||||
|
intrinsic is used and the shim is elided via __CUDA_ARCH__.
|
||||||
|
|
||||||
|
--- a/ggml/src/ggml-cuda/fattn.cu
|
||||||
|
+++ b/ggml/src/ggml-cuda/fattn.cu
|
||||||
|
@@ -7,6 +7,27 @@
|
||||||
|
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
|
+// Pre-sm_60 double atomicAdd shim. Native double atomicAdd(double*,double)
|
||||||
|
+// is only available on CUDA compute capability 6.0+ (see CUDA C Programming
|
||||||
|
+// Guide, B.15 Atomic Functions). Buun's Q² calibration path below calls
|
||||||
|
+// atomicAdd with a double*; without this definition, nvcc fails to find a
|
||||||
|
+// matching overload whenever the compile target list includes pre-sm_60
|
||||||
|
+// architectures. The standard CAS loop implementation below matches the
|
||||||
|
+// semantics of the native intrinsic.
|
||||||
|
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
|
||||||
|
+static __device__ double atomicAdd(double * address, double val) {
|
||||||
|
+ unsigned long long int * address_as_ull = (unsigned long long int *)address;
|
||||||
|
+ unsigned long long int old = *address_as_ull;
|
||||||
|
+ unsigned long long int assumed;
|
||||||
|
+ do {
|
||||||
|
+ assumed = old;
|
||||||
|
+ old = atomicCAS(address_as_ull, assumed,
|
||||||
|
+ __double_as_longlong(val + __longlong_as_double(assumed)));
|
||||||
|
+ } while (assumed != old);
|
||||||
|
+ return __longlong_as_double(old);
|
||||||
|
+}
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
// InnerQ: update the fattn-side inverse scale array from host (all devices)
|
||||||
|
void turbo_innerq_update_fattn_scales(const float * scale_inv) {
|
||||||
|
int cur_device;
|
||||||
@@ -0,0 +1,32 @@
|
|||||||
|
Subject: [PATCH] ggml-cuda/argmax: pass WARP_SIZE to the top-K __shfl_xor_sync calls
|
||||||
|
|
||||||
|
Two __shfl_xor_sync calls in the top-K intra-warp merge drop the `width`
|
||||||
|
argument and rely on the CUDA default (warpSize). Every other call in
|
||||||
|
the same file already passes WARP_SIZE explicitly, and the HIP/ROCm
|
||||||
|
compatibility shim at ggml/src/ggml-cuda/vendors/hip.h:33 is a 4-arg
|
||||||
|
function-like macro — so the 3-arg form fails to preprocess when
|
||||||
|
building with hipcc against ROCm:
|
||||||
|
|
||||||
|
argmax.cu:265: error: too few arguments provided to function-like
|
||||||
|
macro invocation
|
||||||
|
note: macro '__shfl_xor_sync' defined here:
|
||||||
|
#define __shfl_xor_sync(mask, var, laneMask, width) \
|
||||||
|
__shfl_xor(var, laneMask, width)
|
||||||
|
|
||||||
|
Align the two call sites with the rest of the file by passing WARP_SIZE
|
||||||
|
explicitly. On CUDA the generated code is unchanged (warpSize is the
|
||||||
|
default); on HIP it now matches the macro's arity.
|
||||||
|
|
||||||
|
--- a/ggml/src/ggml-cuda/argmax.cu
|
||||||
|
+++ b/ggml/src/ggml-cuda/argmax.cu
|
||||||
|
@@ -262,8 +262,8 @@
|
||||||
|
// Each step: lane gets partner's min element, if it beats our min, replace and re-heapify
|
||||||
|
for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
|
||||||
|
for (int i = 0; i < K; i++) {
|
||||||
|
- float partner_val = __shfl_xor_sync(0xFFFFFFFF, heap_val[i], offset);
|
||||||
|
- int partner_idx = __shfl_xor_sync(0xFFFFFFFF, heap_idx[i], offset);
|
||||||
|
+ float partner_val = __shfl_xor_sync(0xFFFFFFFF, heap_val[i], offset, WARP_SIZE);
|
||||||
|
+ int partner_idx = __shfl_xor_sync(0xFFFFFFFF, heap_idx[i], offset, WARP_SIZE);
|
||||||
|
if (partner_val > heap_val[0]) {
|
||||||
|
heap_val[0] = partner_val;
|
||||||
|
heap_idx[0] = partner_idx;
|
||||||
@@ -0,0 +1,24 @@
|
|||||||
|
Subject: [PATCH] ggml-cuda/vendors/hip: alias cudaMemcpy{To,From}Symbol to hip counterparts
|
||||||
|
|
||||||
|
Buun's Q² calibration + TCQ codebook upload paths in fattn.cu use
|
||||||
|
cudaMemcpyToSymbol / cudaMemcpyFromSymbol. The HIP-compat header in
|
||||||
|
ggml/src/ggml-cuda/vendors/hip.h already aliases the scalar cudaMemcpy
|
||||||
|
family (cudaMemcpy, cudaMemcpyAsync, cudaMemcpy2DAsync, …) but is
|
||||||
|
missing the symbol variants. Building with hipcc therefore fails with
|
||||||
|
15+ "use of undeclared identifier 'cudaMemcpyToSymbol'" errors.
|
||||||
|
|
||||||
|
Add the two missing aliases alongside the existing memcpy block. HIP
|
||||||
|
provides hipMemcpy{To,From}Symbol with the same signature as CUDA's
|
||||||
|
equivalents, so this is a straight name substitution.
|
||||||
|
|
||||||
|
--- a/ggml/src/ggml-cuda/vendors/hip.h
|
||||||
|
+++ b/ggml/src/ggml-cuda/vendors/hip.h
|
||||||
|
@@ -85,6 +85,8 @@
|
||||||
|
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||||
|
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||||
|
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||||
|
+#define cudaMemcpyToSymbol hipMemcpyToSymbol
|
||||||
|
+#define cudaMemcpyFromSymbol hipMemcpyFromSymbol
|
||||||
|
#define cudaMemcpyKind hipMemcpyKind
|
||||||
|
#define cudaMemset hipMemset
|
||||||
|
#define cudaMemsetAsync hipMemsetAsync
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
Subject: [PATCH] ggml-cuda/fattn: pass WARP_SIZE to fwht128 __shfl_xor_sync calls
|
||||||
|
|
||||||
|
Same issue as the argmax top-K fix: two __shfl_xor_sync call sites in
|
||||||
|
the FWHT-128 butterfly kernels (ggml_cuda_fwht128 and fwht128_store_half)
|
||||||
|
use the 3-arg CUDA form and omit the `width` argument that the HIP
|
||||||
|
function-like macro in vendors/hip.h:33 requires. Hipcc fails with:
|
||||||
|
|
||||||
|
fattn.cu:512: too few arguments provided to function-like macro
|
||||||
|
invocation
|
||||||
|
note: macro '__shfl_xor_sync' defined here:
|
||||||
|
#define __shfl_xor_sync(mask, var, laneMask, width) \
|
||||||
|
__shfl_xor(var, laneMask, width)
|
||||||
|
|
||||||
|
Add WARP_SIZE to both calls. CUDA codegen is unchanged (warpSize is the
|
||||||
|
default); HIP now matches the macro arity.
|
||||||
|
|
||||||
|
--- a/ggml/src/ggml-cuda/fattn.cu
|
||||||
|
+++ b/ggml/src/ggml-cuda/fattn.cu
|
||||||
|
@@ -509,7 +509,7 @@
|
||||||
|
// Intra-warp passes: shuffle xor with stride h, no smem, no sync.
|
||||||
|
#pragma unroll
|
||||||
|
for (int h = 1; h <= 16; h *= 2) {
|
||||||
|
- const float other = __shfl_xor_sync(0xFFFFFFFF, val, h);
|
||||||
|
+ const float other = __shfl_xor_sync(0xFFFFFFFF, val, h, WARP_SIZE);
|
||||||
|
val = (tid & h) ? (other - val) : (val + other);
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -533,7 +533,7 @@
|
||||||
|
static __device__ __forceinline__ void fwht128_store_half(
|
||||||
|
float val, half * dst_base) {
|
||||||
|
const int tid = threadIdx.x;
|
||||||
|
- const float neighbor = __shfl_xor_sync(0xFFFFFFFF, val, 1);
|
||||||
|
+ const float neighbor = __shfl_xor_sync(0xFFFFFFFF, val, 1, WARP_SIZE);
|
||||||
|
if ((tid & 1) == 0) {
|
||||||
|
const half2 packed = __floats2half2_rn(val, neighbor);
|
||||||
|
*((half2 *)(dst_base + tid)) = packed;
|
||||||
65
backend/cpp/buun-llama-cpp/run.sh
Executable file
65
backend/cpp/buun-llama-cpp/run.sh
Executable file
@@ -0,0 +1,65 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Get the absolute current dir where the script is located
|
||||||
|
CURDIR=$(dirname "$(realpath $0)")
|
||||||
|
|
||||||
|
cd /
|
||||||
|
|
||||||
|
echo "CPU info:"
|
||||||
|
grep -e "model\sname" /proc/cpuinfo | head -1
|
||||||
|
grep -e "flags" /proc/cpuinfo | head -1
|
||||||
|
|
||||||
|
BINARY=buun-llama-cpp-fallback
|
||||||
|
|
||||||
|
if grep -q -e "\savx\s" /proc/cpuinfo ; then
|
||||||
|
echo "CPU: AVX found OK"
|
||||||
|
if [ -e $CURDIR/buun-llama-cpp-avx ]; then
|
||||||
|
BINARY=buun-llama-cpp-avx
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if grep -q -e "\savx2\s" /proc/cpuinfo ; then
|
||||||
|
echo "CPU: AVX2 found OK"
|
||||||
|
if [ -e $CURDIR/buun-llama-cpp-avx2 ]; then
|
||||||
|
BINARY=buun-llama-cpp-avx2
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Check avx 512
|
||||||
|
if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
|
||||||
|
echo "CPU: AVX512F found OK"
|
||||||
|
if [ -e $CURDIR/buun-llama-cpp-avx512 ]; then
|
||||||
|
BINARY=buun-llama-cpp-avx512
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
|
||||||
|
if [ -e $CURDIR/buun-llama-cpp-grpc ]; then
|
||||||
|
BINARY=buun-llama-cpp-grpc
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Extend ld library path with the dir where this script is located/lib
|
||||||
|
if [ "$(uname)" == "Darwin" ]; then
|
||||||
|
export DYLD_LIBRARY_PATH=$CURDIR/lib:$DYLD_LIBRARY_PATH
|
||||||
|
else
|
||||||
|
export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
|
||||||
|
# Tell rocBLAS where to find TensileLibrary data (GPU kernel tuning files)
|
||||||
|
if [ -d "$CURDIR/lib/rocblas/library" ]; then
|
||||||
|
export ROCBLAS_TENSILE_LIBPATH=$CURDIR/lib/rocblas/library
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# If there is a lib/ld.so, use it
|
||||||
|
if [ -f $CURDIR/lib/ld.so ]; then
|
||||||
|
echo "Using lib/ld.so"
|
||||||
|
echo "Using binary: $BINARY"
|
||||||
|
exec $CURDIR/lib/ld.so $CURDIR/$BINARY "$@"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Using binary: $BINARY"
|
||||||
|
exec $CURDIR/$BINARY "$@"
|
||||||
|
|
||||||
|
# We should never reach this point, however just in case we do, run fallback
|
||||||
|
exec $CURDIR/buun-llama-cpp-fallback "$@"
|
||||||
@@ -37,6 +37,14 @@ var CacheTypeOptions = []FieldOption{
|
|||||||
{Value: "q4_1", Label: "Q4_1"},
|
{Value: "q4_1", Label: "Q4_1"},
|
||||||
{Value: "q5_0", Label: "Q5_0"},
|
{Value: "q5_0", Label: "Q5_0"},
|
||||||
{Value: "q5_1", Label: "Q5_1"},
|
{Value: "q5_1", Label: "Q5_1"},
|
||||||
|
// TurboQuant KV-cache types — accepted by the turboquant and
|
||||||
|
// buun-llama-cpp fork backends; stock llama-cpp will reject them at load.
|
||||||
|
{Value: "turbo2", Label: "Turbo2 (TurboQuant)"},
|
||||||
|
{Value: "turbo3", Label: "Turbo3 (TurboQuant)"},
|
||||||
|
{Value: "turbo4", Label: "Turbo4 (TurboQuant)"},
|
||||||
|
// Trellis-Coded Quantization variants — buun-llama-cpp only.
|
||||||
|
{Value: "turbo2_tcq", Label: "Turbo2 TCQ (buun-llama-cpp)"},
|
||||||
|
{Value: "turbo3_tcq", Label: "Turbo3 TCQ (buun-llama-cpp)"},
|
||||||
}
|
}
|
||||||
|
|
||||||
var DiffusersPipelineOptions = []FieldOption{
|
var DiffusersPipelineOptions = []FieldOption{
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ func (i *LlamaCPPImporter) AdditionalBackends() []KnownBackendEntry {
|
|||||||
return []KnownBackendEntry{
|
return []KnownBackendEntry{
|
||||||
{Name: "ik-llama-cpp", Modality: "text", Description: "GGUF drop-in replacement for llama-cpp with ik-quants"},
|
{Name: "ik-llama-cpp", Modality: "text", Description: "GGUF drop-in replacement for llama-cpp with ik-quants"},
|
||||||
{Name: "turboquant", Modality: "text", Description: "GGUF drop-in replacement for llama-cpp with TurboQuant optimizations"},
|
{Name: "turboquant", Modality: "text", Description: "GGUF drop-in replacement for llama-cpp with TurboQuant optimizations"},
|
||||||
|
{Name: "buun-llama-cpp", Modality: "text", Description: "GGUF drop-in replacement for llama-cpp with DFlash speculative decoding and TurboQuant/TCQ KV-cache quantization"},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -127,7 +128,7 @@ func (i *LlamaCPPImporter) Import(details Details) (gallery.ModelConfig, error)
|
|||||||
backend := "llama-cpp"
|
backend := "llama-cpp"
|
||||||
if b, ok := preferencesMap["backend"].(string); ok {
|
if b, ok := preferencesMap["backend"].(string); ok {
|
||||||
switch b {
|
switch b {
|
||||||
case "ik-llama-cpp", "turboquant":
|
case "ik-llama-cpp", "turboquant", "buun-llama-cpp":
|
||||||
backend = b
|
backend = b
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -181,6 +181,23 @@ var _ = Describe("LlamaCPPImporter", func() {
|
|||||||
Expect(modelConfig.Files[0].Filename).To(Equal("my-model.gguf"))
|
Expect(modelConfig.Files[0].Filename).To(Equal("my-model.gguf"))
|
||||||
})
|
})
|
||||||
|
|
||||||
|
It("swaps the emitted backend to buun-llama-cpp when preferred", func() {
|
||||||
|
preferences := json.RawMessage(`{"backend": "buun-llama-cpp"}`)
|
||||||
|
details := Details{
|
||||||
|
URI: "https://example.com/my-model.gguf",
|
||||||
|
Preferences: preferences,
|
||||||
|
}
|
||||||
|
|
||||||
|
modelConfig, err := importer.Import(details)
|
||||||
|
|
||||||
|
Expect(err).ToNot(HaveOccurred())
|
||||||
|
Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: buun-llama-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
|
Expect(modelConfig.ConfigFile).NotTo(ContainSubstring("backend: llama-cpp\n"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
|
Expect(modelConfig.ConfigFile).To(ContainSubstring("model: my-model.gguf"), fmt.Sprintf("Model config: %+v", modelConfig))
|
||||||
|
Expect(len(modelConfig.Files)).To(Equal(1))
|
||||||
|
Expect(modelConfig.Files[0].Filename).To(Equal("my-model.gguf"))
|
||||||
|
})
|
||||||
|
|
||||||
It("keeps backend: llama-cpp for unknown backend preferences", func() {
|
It("keeps backend: llama-cpp for unknown backend preferences", func() {
|
||||||
// Unknown backend values must not leak into the emitted YAML —
|
// Unknown backend values must not leak into the emitted YAML —
|
||||||
// we only honour the two curated drop-in replacements.
|
// we only honour the curated drop-in replacements.
|
||||||
@@ -375,7 +392,7 @@ var _ = Describe("LlamaCPPImporter", func() {
|
|||||||
})
|
})
|
||||||
|
|
||||||
Context("AdditionalBackends", func() {
|
Context("AdditionalBackends", func() {
|
||||||
It("advertises ik-llama-cpp and turboquant as drop-in replacements", func() {
|
It("advertises ik-llama-cpp, turboquant, and buun-llama-cpp as drop-in replacements", func() {
|
||||||
entries := importer.AdditionalBackends()
|
entries := importer.AdditionalBackends()
|
||||||
|
|
||||||
names := make([]string, 0, len(entries))
|
names := make([]string, 0, len(entries))
|
||||||
@@ -384,7 +401,7 @@ var _ = Describe("LlamaCPPImporter", func() {
|
|||||||
names = append(names, e.Name)
|
names = append(names, e.Name)
|
||||||
byName[e.Name] = e
|
byName[e.Name] = e
|
||||||
}
|
}
|
||||||
Expect(names).To(ConsistOf("ik-llama-cpp", "turboquant"))
|
Expect(names).To(ConsistOf("ik-llama-cpp", "turboquant", "buun-llama-cpp"))
|
||||||
|
|
||||||
ik := byName["ik-llama-cpp"]
|
ik := byName["ik-llama-cpp"]
|
||||||
Expect(ik.Modality).To(Equal("text"))
|
Expect(ik.Modality).To(Equal("text"))
|
||||||
@@ -393,6 +410,10 @@ var _ = Describe("LlamaCPPImporter", func() {
|
|||||||
tq := byName["turboquant"]
|
tq := byName["turboquant"]
|
||||||
Expect(tq.Modality).To(Equal("text"))
|
Expect(tq.Modality).To(Equal("text"))
|
||||||
Expect(tq.Description).NotTo(BeEmpty())
|
Expect(tq.Description).NotTo(BeEmpty())
|
||||||
|
|
||||||
|
bn := byName["buun-llama-cpp"]
|
||||||
|
Expect(bn.Modality).To(Equal("text"))
|
||||||
|
Expect(bn.Description).NotTo(BeEmpty())
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -631,6 +631,83 @@ The `cache_type_k` / `cache_type_v` fields map to llama.cpp's `-ctk` / `-ctv` fl
|
|||||||
- [Tracked branch: `feature/turboquant-kv-cache`](https://github.com/TheTom/llama-cpp-turboquant/tree/feature/turboquant-kv-cache)
|
- [Tracked branch: `feature/turboquant-kv-cache`](https://github.com/TheTom/llama-cpp-turboquant/tree/feature/turboquant-kv-cache)
|
||||||
|
|
||||||
|
|
||||||
|
### buun-llama-cpp (DFlash speculative decoding + TurboQuant/TCQ KV-cache)
|
||||||
|
|
||||||
|
[buun-llama-cpp](https://github.com/spiritbuun/buun-llama-cpp) is a fork-of-a-fork: spiritbuun forked `TheTom/llama-cpp-turboquant` (the `turboquant` backend above) and added two independent features on top:
|
||||||
|
|
||||||
|
1. **DFlash** — a block-diffusion speculative decoding scheme that uses a dedicated drafter model (new `DFlashDraftModel` GGUF architecture). On a target/drafter pair it emits a block of tokens per speculation step and can be combined with tree-structured verification ("DDTree") for multi-branch draft expansion.
|
||||||
|
2. **TCQ (Trellis-Coded Quantization)** — two additional KV-cache types (`turbo2_tcq`, `turbo3_tcq`) on top of the TurboQuant `turbo2` / `turbo3` / `turbo4` types already shipped by the parent fork, delivering a 10–44% reduction in KL divergence compared with scalar quantization at 2–3 bits per value.
|
||||||
|
|
||||||
|
Like `turboquant`, this backend shares LocalAI's stock `llama-cpp` gRPC server sources — so any GGUF model that runs on `llama-cpp` also runs on `buun-llama-cpp`. Pick it over `turboquant` specifically when you want DFlash speculative decoding or the newer TCQ KV-cache variants.
|
||||||
|
|
||||||
|
#### Features
|
||||||
|
|
||||||
|
- Drop-in GGUF compatibility with upstream `llama.cpp`.
|
||||||
|
- DFlash block-diffusion speculative decoding (CUDA/Metal; no CPU fallback).
|
||||||
|
- TurboQuant KV-cache types (`turbo2`, `turbo3`, `turbo4`) inherited from the parent `turboquant` fork, plus buun-exclusive `turbo2_tcq` and `turbo3_tcq` variants.
|
||||||
|
- Same feature surface as `llama-cpp`: text generation, embeddings, tool calls, multimodal via mmproj.
|
||||||
|
- Available on CPU (AVX/AVX2/AVX512/fallback), NVIDIA CUDA 12/13, AMD ROCm/HIP, Intel SYCL f32/f16, Vulkan, and NVIDIA L4T — but note that DFlash and `turbo*` KV types have no CPU fallback and error at model-load on CPU-only builds.
|
||||||
|
|
||||||
|
#### Setup
|
||||||
|
|
||||||
|
`buun-llama-cpp` ships as a separate container image in the LocalAI backend gallery. Install it like any other backend:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
local-ai backends install buun-llama-cpp
|
||||||
|
```
|
||||||
|
|
||||||
|
Or pick a specific flavor for your hardware (example tags: `cpu-buun-llama-cpp`, `cuda12-buun-llama-cpp`, `cuda13-buun-llama-cpp`, `rocm-buun-llama-cpp`, `intel-sycl-f16-buun-llama-cpp`, `vulkan-buun-llama-cpp`).
|
||||||
|
|
||||||
|
#### YAML configuration — TCQ KV-cache
|
||||||
|
|
||||||
|
To run a model with TurboQuant/TCQ quantized KV-cache, set the backend and pick a `turbo*` cache type:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
name: my-model
|
||||||
|
backend: buun-llama-cpp
|
||||||
|
parameters:
|
||||||
|
model: file.gguf
|
||||||
|
# Accepted values on this backend include the stock llama.cpp
|
||||||
|
# types (f16, f32, q8_0, q4_0, q4_1, q5_0, q5_1), the TurboQuant types
|
||||||
|
# (turbo2, turbo3, turbo4), and the buun-only TCQ variants (turbo2_tcq,
|
||||||
|
# turbo3_tcq). turbo3 / turbo4 / turbo*_tcq auto-enable flash_attention.
|
||||||
|
cache_type_k: turbo3
|
||||||
|
cache_type_v: turbo3_tcq
|
||||||
|
context_size: 8192
|
||||||
|
```
|
||||||
|
|
||||||
|
#### YAML configuration — DFlash speculative decoding
|
||||||
|
|
||||||
|
DFlash requires a **dedicated drafter model** in the new `DFlashDraftModel` GGUF architecture. At the time of writing, the only known public target/drafter pair is [`z-lab/Qwen3.5-27B`](https://huggingface.co/z-lab/Qwen3.5-27B) + [`z-lab/Qwen3.5-27B-DFlash`](https://huggingface.co/z-lab/Qwen3.5-27B-DFlash).
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
name: qwen3-dflash
|
||||||
|
backend: buun-llama-cpp
|
||||||
|
parameters:
|
||||||
|
# Target model (quantized as usual)
|
||||||
|
model: Qwen3.5-27B-Q4_K_M.gguf
|
||||||
|
# Drafter model produced by buun's convert_hf_to_gguf.py from the
|
||||||
|
# DFlashDraftModel checkpoint. Resolved relative to the models path.
|
||||||
|
draft_model: Qwen3.5-27B-DFlash.gguf
|
||||||
|
options:
|
||||||
|
# Switches the speculative pipeline from the default draft-model mode to
|
||||||
|
# DFlash (block-diffusion). Required to activate the DFlash code path.
|
||||||
|
- spec_type:dflash
|
||||||
|
# Optional tuning:
|
||||||
|
# - tree_budget:0 # 0 = flat DFlash; >0 = DDTree verification budget
|
||||||
|
# - draft_topk:1 # drafter top-K per position (1 = argmax)
|
||||||
|
# - spec_n_max:16 # cap on draft tokens per speculation step
|
||||||
|
```
|
||||||
|
|
||||||
|
Under the hood LocalAI wires `draft_model` through to the grpc-server's `params.speculative.mparams_dft.path`, and `spec_type:dflash` is forwarded through the options passthrough to buun's `common_speculative_type_from_name("dflash")`. The `tree_budget` and `draft_topk` options are buun-exclusive; they reference struct fields that only exist in buun's fork, so they're surfaced on this backend only (passing them to stock `llama-cpp` is a no-op).
|
||||||
|
|
||||||
|
#### Reference
|
||||||
|
|
||||||
|
- [spiritbuun/buun-llama-cpp](https://github.com/spiritbuun/buun-llama-cpp)
|
||||||
|
- [TCQ paper / dataset](https://huggingface.co/datasets/spiritbuun/turboquant-tcq-kv-cache) — *"Closing the Gap: Trellis-Coded Quantization for KV Cache at 2-3 Bits"*
|
||||||
|
- DFlash target/drafter pair: [`z-lab/Qwen3.5-27B`](https://huggingface.co/z-lab/Qwen3.5-27B) + [`z-lab/Qwen3.5-27B-DFlash`](https://huggingface.co/z-lab/Qwen3.5-27B-DFlash)
|
||||||
|
|
||||||
|
|
||||||
### vLLM
|
### vLLM
|
||||||
|
|
||||||
[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference.
|
[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference.
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ LocalAI will attempt to automatically load models which are not explicitly confi
|
|||||||
|---------|-------------|------------|------------|-----------|-------------|
|
|---------|-------------|------------|------------|-----------|-------------|
|
||||||
| [llama.cpp](https://github.com/ggerganov/llama.cpp) | LLM inference in C/C++. Supports LLaMA, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | GPT, Functions | yes | yes | CPU, CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, Jetson L4T |
|
| [llama.cpp](https://github.com/ggerganov/llama.cpp) | LLM inference in C/C++. Supports LLaMA, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | GPT, Functions | yes | yes | CPU, CUDA 12/13, ROCm, Intel SYCL, Vulkan, Metal, Jetson L4T |
|
||||||
| [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) | Hard fork of llama.cpp optimized for CPU/hybrid CPU+GPU with IQK quants, custom quant mixes, and MLA for DeepSeek | GPT | yes | yes | CPU (AVX2+) |
|
| [ik_llama.cpp](https://github.com/ikawrakow/ik_llama.cpp) | Hard fork of llama.cpp optimized for CPU/hybrid CPU+GPU with IQK quants, custom quant mixes, and MLA for DeepSeek | GPT | yes | yes | CPU (AVX2+) |
|
||||||
|
| [buun-llama-cpp](https://github.com/spiritbuun/buun-llama-cpp) | llama.cpp fork with DFlash block-diffusion speculative decoding and TurboQuant/TCQ KV-cache quantization (2–3 bits per value). Accelerated paths are CUDA/Metal only. | GPT, Functions | yes | yes | CUDA, Metal (CPU fallback for non-turbo/non-DFlash only) |
|
||||||
| [vLLM](https://github.com/vllm-project/vllm) | Fast LLM serving with PagedAttention | GPT, Functions | no | yes | CPU, CUDA 12, ROCm, Intel |
|
| [vLLM](https://github.com/vllm-project/vllm) | Fast LLM serving with PagedAttention | GPT, Functions | no | yes | CPU, CUDA 12, ROCm, Intel |
|
||||||
| [vLLM Omni](https://github.com/vllm-project/vllm) | Unified multimodal generation (text, image, video, audio) | Multimodal GPT, Functions | no | yes | CUDA 12, ROCm |
|
| [vLLM Omni](https://github.com/vllm-project/vllm) | Unified multimodal generation (text, image, video, audio) | Multimodal GPT, Functions | no | yes | CUDA 12, ROCm |
|
||||||
| [transformers](https://github.com/huggingface/transformers) | HuggingFace Transformers framework | GPT, Embeddings, Multimodal | yes | yes* | CPU, CUDA 12/13, ROCm, Intel, Metal |
|
| [transformers](https://github.com/huggingface/transformers) | HuggingFace Transformers framework | GPT, Embeddings, Multimodal | yes | yes* | CPU, CUDA 12/13, ROCm, Intel, Metal |
|
||||||
|
|||||||
@@ -32,6 +32,12 @@ function inferBackendPath(item) {
|
|||||||
// via a thin wrapper Makefile. Changes to either dir should retrigger it.
|
// via a thin wrapper Makefile. Changes to either dir should retrigger it.
|
||||||
return `backend/cpp/turboquant/`;
|
return `backend/cpp/turboquant/`;
|
||||||
}
|
}
|
||||||
|
if (item.dockerfile.endsWith("buun-llama-cpp")) {
|
||||||
|
// buun-llama-cpp is a fork-of-a-fork (spiritbuun/buun-llama-cpp forks
|
||||||
|
// TheTom/llama-cpp-turboquant) that reuses backend/cpp/llama-cpp sources
|
||||||
|
// the same way turboquant does. Changes to either dir retrigger it.
|
||||||
|
return `backend/cpp/buun-llama-cpp/`;
|
||||||
|
}
|
||||||
if (item.dockerfile.endsWith("llama-cpp")) {
|
if (item.dockerfile.endsWith("llama-cpp")) {
|
||||||
return `backend/cpp/llama-cpp/`;
|
return `backend/cpp/llama-cpp/`;
|
||||||
}
|
}
|
||||||
@@ -138,9 +144,10 @@ async function getChangedFiles() {
|
|||||||
// Per-backend boolean outputs
|
// Per-backend boolean outputs
|
||||||
for (const [backend, pathPrefix] of allBackendPaths) {
|
for (const [backend, pathPrefix] of allBackendPaths) {
|
||||||
let changed = changedFiles.some(file => file.startsWith(pathPrefix));
|
let changed = changedFiles.some(file => file.startsWith(pathPrefix));
|
||||||
// turboquant reuses backend/cpp/llama-cpp sources via a thin wrapper;
|
// turboquant and buun-llama-cpp reuse backend/cpp/llama-cpp sources via
|
||||||
// changes to either directory should retrigger its pipeline.
|
// thin wrapper Makefiles; changes to that directory should retrigger
|
||||||
if (backend === "turboquant" && !changed) {
|
// their pipelines too.
|
||||||
|
if ((backend === "turboquant" || backend === "buun-llama-cpp") && !changed) {
|
||||||
changed = changedFiles.some(file => file.startsWith("backend/cpp/llama-cpp/"));
|
changed = changedFiles.some(file => file.startsWith("backend/cpp/llama-cpp/"));
|
||||||
}
|
}
|
||||||
fs.appendFileSync(process.env.GITHUB_OUTPUT, `${backend}=${changed ? 'true' : 'false'}\n`);
|
fs.appendFileSync(process.env.GITHUB_OUTPUT, `${backend}=${changed ? 'true' : 'false'}\n`);
|
||||||
|
|||||||
Reference in New Issue
Block a user