feat(dllm): backend packaging, gallery index, CI matrix

Registers the dllm backend across every surface: backend gallery index
(cpu amd64+arm64 with manifest merge, cuda13, l4t-cuda13 for GB10-class
hardware; no darwin per engine scope), top-level Makefile targets,
bump_deps pin tracking for DLLM_VERSION, and the curated known-backends
list for /backends/known (preference-only: auto-detecting on .gguf would
shadow llama-cpp). Note: image builds and the nightly bump_deps leg for
DLLM_VERSION will keep failing until github.com/mudler/dllm.cpp is
published (planned at merge time).

Assisted-by: Claude Code (Fable 5)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-06-11 17:05:18 +00:00
parent 99184809fa
commit 52b3b68cea
8 changed files with 136 additions and 3 deletions

View File

@@ -1608,6 +1608,19 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-dllm'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "dllm"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -1647,6 +1660,19 @@ include:
backend: "parakeet-cpp"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-dllm'
base-image: "ubuntu:24.04"
ubuntu-version: '2404'
runs-on: 'ubuntu-24.04-arm'
backend: "dllm"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
@@ -3145,6 +3171,35 @@ include:
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
# dllm
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
platform-tag: 'amd64'
tag-latest: 'auto'
tag-suffix: '-cpu-dllm'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "dllm"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/arm64'
platform-tag: 'arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-dllm'
runs-on: 'ubuntu-24.04-arm'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "dllm"
dockerfile: "./backend/Dockerfile.golang"
context: "./"
ubuntu-version: '2404'
- build-type: 'sycl_f32'
cuda-major-version: ""
cuda-minor-version: ""

View File

@@ -38,6 +38,10 @@ jobs:
variable: "PARAKEET_VERSION"
branch: "master"
file: "backend/go/parakeet-cpp/Makefile"
- repository: "mudler/dllm.cpp"
variable: "DLLM_VERSION"
branch: "main"
file: "backend/go/dllm/Makefile"
- repository: "leejet/stable-diffusion.cpp"
variable: "STABLEDIFFUSION_GGML_VERSION"
branch: "master"

View File

@@ -1,5 +1,5 @@
# Disable parallel execution for backend builds
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/dllm backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
GOCMD=go
GOTEST=$(GOCMD) test
@@ -1171,6 +1171,9 @@ BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|tr
BACKEND_WHISPER = whisper|golang|.|false|true
BACKEND_CRISPASR = crispasr|golang|.|false|true
BACKEND_PARAKEET_CPP = parakeet-cpp|golang|.|false|true
# dllm is mudler/dllm.cpp, the DiffusionGemma block-diffusion engine,
# wrapped by the purego backend at backend/go/dllm.
BACKEND_DLLM = dllm|golang|.|false|true
BACKEND_VOXTRAL = voxtral|golang|.|false|true
BACKEND_ACESTEP_CPP = acestep-cpp|golang|.|false|true
BACKEND_QWEN3_TTS_CPP = qwen3-tts-cpp|golang|.|false|true
@@ -1260,6 +1263,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML)))
$(eval $(call generate-docker-build-target,$(BACKEND_WHISPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_CRISPASR)))
$(eval $(call generate-docker-build-target,$(BACKEND_PARAKEET_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_DLLM)))
$(eval $(call generate-docker-build-target,$(BACKEND_VOXTRAL)))
$(eval $(call generate-docker-build-target,$(BACKEND_OPUS)))
$(eval $(call generate-docker-build-target,$(BACKEND_RERANKERS)))

4
backend/go/dllm/Makefile Normal file → Executable file
View File

@@ -14,6 +14,10 @@
# That's what the gated C-ABI binding smoke uses (DLLM_TEST_LIBRARY). The
# default target below does the proper clone-at-pin + cmake build so CI
# doesn't need a side-checkout.
#
# NOTE: github.com/mudler/dllm.cpp is still private (publishing is planned);
# until then the anonymous clone below fails. Either use the symlink shortcut
# above with a local checkout, or configure a git credential helper that has
# access to the repo.
DLLM_VERSION?=b22fcebebfb225131113188599a9ae542b2935d7
DLLM_REPO?=https://github.com/mudler/dllm.cpp

View File

@@ -275,8 +275,8 @@ func (d *Dllm) requestOptsJSON(opts *pb.PredictOptions) (string, error) {
// The engine rounds n_predict UP to a whole number of diffusion
// blocks (the canvas is denoised block-wise), so the completion may
// run slightly past the requested budget. Tokens==0 omits the key so
// the engine's GGUF-metadata default applies (the C-ABI documents
// per-key defaults; no hardcoded 256 like ds4's grpc-server).
// the C-ABI default of 256 applies (hardcoded in capi.cpp's
// parse_gen_opts, independent of canvas_length).
m["n_predict"] = n
}
if s := opts.GetSeed(); s > 0 {

View File

@@ -95,6 +95,29 @@
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4"
metal: "metal-ds4"
metal-darwin-arm64: "metal-ds4"
- &dllm
name: "dllm"
alias: "dllm"
license: mit
description: |
mudler/dllm.cpp - DiffusionGemma block-diffusion LLM inference engine
(C++/ggml, GGUF weights). Decodes whole token canvases per diffusion
round instead of autoregressive sampling. Runs on CPU and NVIDIA CUDA 13
(including Jetson/GB10 L4T targets).
urls:
- https://github.com/mudler/dllm.cpp
tags:
- text-to-text
- LLM
- gguf
- diffusion
- CPU
- CUDA
capabilities:
default: "cpu-dllm"
nvidia: "cuda13-dllm"
nvidia-cuda-13: "cuda13-dllm"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-dllm"
- &whispercpp
name: "whisper"
alias: "whisper"
@@ -1272,6 +1295,13 @@
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4-development"
metal: "metal-ds4-development"
metal-darwin-arm64: "metal-ds4-development"
- !!merge <<: *dllm
name: "dllm-development"
capabilities:
default: "cpu-dllm-development"
nvidia: "cuda13-dllm-development"
nvidia-cuda-13: "cuda13-dllm-development"
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-dllm-development"
- !!merge <<: *stablediffusionggml
name: "stablediffusion-ggml-development"
capabilities:
@@ -1859,6 +1889,37 @@
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ds4"
mirrors:
- localai/localai-backends:master-metal-darwin-arm64-ds4
## dllm
- !!merge <<: *dllm
name: "cpu-dllm"
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-dllm"
mirrors:
- localai/localai-backends:latest-cpu-dllm
- !!merge <<: *dllm
name: "cpu-dllm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-dllm"
mirrors:
- localai/localai-backends:master-cpu-dllm
- !!merge <<: *dllm
name: "cuda13-dllm"
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-dllm"
mirrors:
- localai/localai-backends:latest-gpu-nvidia-cuda-13-dllm
- !!merge <<: *dllm
name: "cuda13-dllm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-dllm"
mirrors:
- localai/localai-backends:master-gpu-nvidia-cuda-13-dllm
- !!merge <<: *dllm
name: "cuda13-nvidia-l4t-arm64-dllm"
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-dllm"
mirrors:
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-dllm
- !!merge <<: *dllm
name: "cuda13-nvidia-l4t-arm64-dllm-development"
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-dllm"
mirrors:
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-dllm
## whisper
- !!merge <<: *whispercpp
name: "whisper-development"

View File

@@ -25,6 +25,10 @@ var knownPrefOnlyBackends = []schema.KnownBackend{
// Text LLM
// ds4: antirez/ds4 - single-model DeepSeek V4 Flash engine; auto-detected via DS4Importer
{Name: "ds4", Modality: "text", AutoDetect: false, Description: "antirez/ds4 DeepSeek V4 Flash engine (auto-detected; pref-only fallback)"},
// dllm consumes GGUF weights like llama-cpp does, but only for the
// DiffusionGemma architecture - auto-detecting on .gguf would shadow
// llama-cpp, so it stays preference-only.
{Name: "dllm", Modality: "text", AutoDetect: false, Description: "dllm.cpp DiffusionGemma block-diffusion engine (preference-only)"},
{Name: "sglang", Modality: "text", AutoDetect: false, Description: "SGLang runtime (preference-only)"},
{Name: "tinygrad", Modality: "text", AutoDetect: false, Description: "tinygrad runtime (preference-only)"},
{Name: "trl", Modality: "text", AutoDetect: false, Description: "Transformers Reinforcement Learning (preference-only)"},

View File

@@ -135,6 +135,7 @@ var _ = Describe("Backend Endpoints", func() {
Expect(entry.Modality).To(Equal(modality))
}
expectPrefOnly("dllm", "text")
expectPrefOnly("sglang", "text")
expectPrefOnly("tinygrad", "text")
expectPrefOnly("trl", "text")