From 52b3b68ceaefc12cbfd7a2dce1fa3d191cf151b2 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Thu, 11 Jun 2026 17:05:18 +0000
Subject: [PATCH] feat(dllm): backend packaging, gallery index, CI matrix

Registers the dllm backend across every surface: CI build matrix and
backend gallery index (cpu amd64+arm64 with manifest merge, cuda13,
l4t-cuda13 for GB10-class hardware; no darwin per engine scope),
top-level Makefile targets, bump_deps pin tracking for DLLM_VERSION, and
the curated known-backends list for /backends/known (pref-only:
auto-detecting on .gguf would shadow llama-cpp).

Also corrects the n_predict comment in dllm.go: omitting the key falls
back to the C-ABI default of 256, not a GGUF-metadata default.

Note: image builds and the nightly bump leg stay red until
github.com/mudler/dllm.cpp is published (planned at merge time).

Assisted-by: Claude Code (Fable 5)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 .github/backend-matrix.yml                  | 55 +++++++++++++++++++
 .github/workflows/bump_deps.yaml            |  4 ++
 Makefile                                    |  6 +-
 backend/go/dllm/Makefile                    |  4 ++
 backend/go/dllm/dllm.go                     |  4 +-
 backend/index.yaml                          | 61 +++++++++++++++++++++
 core/http/endpoints/localai/backend.go      |  4 ++
 core/http/endpoints/localai/backend_test.go |  1 +
 8 files changed, 136 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 backend/go/dllm/Makefile

diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml
index 464ffc36c..997f5def0 100644
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -1608,6 +1608,19 @@ include:
     dockerfile: "./backend/Dockerfile.golang"
     context: "./"
     ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-13-dllm'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "dllm"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
   - build-type: 'cublas'
     cuda-major-version: "13"
     cuda-minor-version: "0"
@@ -1647,6 +1660,19 @@ include:
     backend: "parakeet-cpp"
     dockerfile: "./backend/Dockerfile.golang"
     context: "./"
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-cuda-13-arm64-dllm'
+    base-image: "ubuntu:24.04"
+    ubuntu-version: '2404'
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "dllm"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
   - build-type: 'cublas'
     cuda-major-version: "13"
     cuda-minor-version: "0"
@@ -3145,6 +3171,35 @@ include:
     dockerfile: "./backend/Dockerfile.golang"
     context: "./"
     ubuntu-version: '2404'
+  # dllm
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-dllm'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "dllm"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-dllm'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "dllm"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
   - build-type: 'sycl_f32'
     cuda-major-version: ""
     cuda-minor-version: ""
diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml
index 5f1ac0c21..5572262d1 100644
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -38,6 +38,10 @@ jobs:
             variable: "PARAKEET_VERSION"
             branch: "master"
             file: "backend/go/parakeet-cpp/Makefile"
+          - repository: "mudler/dllm.cpp"
+            variable: "DLLM_VERSION"
+            branch: "main"
+            file: "backend/go/dllm/Makefile"
           - repository: "leejet/stable-diffusion.cpp"
             variable: "STABLEDIFFUSION_GGML_VERSION"
             branch: "master"
diff --git a/Makefile b/Makefile
index cafcdd44a..89cc6bf01 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Disable parallel execution for backend builds
-.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
+.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/dllm backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
 
 GOCMD=go
 GOTEST=$(GOCMD) test
@@ -1171,6 +1171,9 @@ BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|tr
 BACKEND_WHISPER = whisper|golang|.|false|true
 BACKEND_CRISPASR = crispasr|golang|.|false|true
 BACKEND_PARAKEET_CPP = parakeet-cpp|golang|.|false|true
+# dllm is mudler/dllm.cpp, the DiffusionGemma block-diffusion engine,
+# wrapped by the purego backend at backend/go/dllm.
+BACKEND_DLLM = dllm|golang|.|false|true
 BACKEND_VOXTRAL = voxtral|golang|.|false|true
 BACKEND_ACESTEP_CPP = acestep-cpp|golang|.|false|true
 BACKEND_QWEN3_TTS_CPP = qwen3-tts-cpp|golang|.|false|true
@@ -1260,6 +1263,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML)))
 $(eval $(call generate-docker-build-target,$(BACKEND_WHISPER)))
 $(eval $(call generate-docker-build-target,$(BACKEND_CRISPASR)))
 $(eval $(call generate-docker-build-target,$(BACKEND_PARAKEET_CPP)))
+$(eval $(call generate-docker-build-target,$(BACKEND_DLLM)))
 $(eval $(call generate-docker-build-target,$(BACKEND_VOXTRAL)))
 $(eval $(call generate-docker-build-target,$(BACKEND_OPUS)))
 $(eval $(call generate-docker-build-target,$(BACKEND_RERANKERS)))
diff --git a/backend/go/dllm/Makefile b/backend/go/dllm/Makefile
old mode 100644
new mode 100755
index 3b7114c12..1e0825c73
--- a/backend/go/dllm/Makefile
+++ b/backend/go/dllm/Makefile
@@ -14,6 +14,10 @@
 # That's what the gated C-ABI binding smoke uses (DLLM_TEST_LIBRARY). The
 # default target below does the proper clone-at-pin + cmake build so CI
 # doesn't need a side-checkout.
+#
+# NOTE: github.com/mudler/dllm.cpp is still private (publishing is planned);
+# until then the anonymous clone below fails. Use the symlink shortcut above
+# with a local checkout, or a git credential helper with access to the repo.
 
 DLLM_VERSION?=b22fcebebfb225131113188599a9ae542b2935d7
 DLLM_REPO?=https://github.com/mudler/dllm.cpp
diff --git a/backend/go/dllm/dllm.go b/backend/go/dllm/dllm.go
index cd82ff0b3..17d46de2f 100755
--- a/backend/go/dllm/dllm.go
+++ b/backend/go/dllm/dllm.go
@@ -275,8 +275,8 @@ func (d *Dllm) requestOptsJSON(opts *pb.PredictOptions) (string, error) {
 		// The engine rounds n_predict UP to a whole number of diffusion
 		// blocks (the canvas is denoised block-wise), so the completion may
 		// run slightly past the requested budget. Tokens==0 omits the key so
-		// the engine's GGUF-metadata default applies (the C-ABI documents
-		// per-key defaults; no hardcoded 256 like ds4's grpc-server).
+		// the C-ABI default of 256 applies (hardcoded in capi.cpp's
+		// parse_gen_opts, independent of canvas_length).
 		m["n_predict"] = n
 	}
 	if s := opts.GetSeed(); s > 0 {
diff --git a/backend/index.yaml b/backend/index.yaml
index 37e689071..508e87b24 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -95,6 +95,29 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4"
     metal: "metal-ds4"
     metal-darwin-arm64: "metal-ds4"
+- &dllm
+  name: "dllm"
+  alias: "dllm"
+  license: mit
+  description: |
+    mudler/dllm.cpp - DiffusionGemma block-diffusion LLM inference engine
+    (C++/ggml, GGUF weights). Decodes whole token canvases per diffusion
+    round instead of autoregressive sampling. Runs on CPU and NVIDIA CUDA 13
+    (including Jetson/GB10 L4T targets).
+  urls:
+    - https://github.com/mudler/dllm.cpp
+  tags:
+    - text-to-text
+    - LLM
+    - gguf
+    - diffusion
+    - CPU
+    - CUDA
+  capabilities:
+    default: "cpu-dllm"
+    nvidia: "cuda13-dllm"
+    nvidia-cuda-13: "cuda13-dllm"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-dllm"
 - &whispercpp
   name: "whisper"
   alias: "whisper"
@@ -1272,6 +1295,13 @@
     nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4-development"
     metal: "metal-ds4-development"
     metal-darwin-arm64: "metal-ds4-development"
+- !!merge <<: *dllm
+  name: "dllm-development"
+  capabilities:
+    default: "cpu-dllm-development"
+    nvidia: "cuda13-dllm-development"
+    nvidia-cuda-13: "cuda13-dllm-development"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-dllm-development"
 - !!merge <<: *stablediffusionggml
   name: "stablediffusion-ggml-development"
   capabilities:
@@ -1859,6 +1889,37 @@
   uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ds4"
   mirrors:
     - localai/localai-backends:master-metal-darwin-arm64-ds4
+## dllm
+- !!merge <<: *dllm
+  name: "cpu-dllm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-dllm"
+  mirrors:
+    - localai/localai-backends:latest-cpu-dllm
+- !!merge <<: *dllm
+  name: "cpu-dllm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-dllm"
+  mirrors:
+    - localai/localai-backends:master-cpu-dllm
+- !!merge <<: *dllm
+  name: "cuda13-dllm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-dllm"
+  mirrors:
+    - localai/localai-backends:latest-gpu-nvidia-cuda-13-dllm
+- !!merge <<: *dllm
+  name: "cuda13-dllm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-dllm"
+  mirrors:
+    - localai/localai-backends:master-gpu-nvidia-cuda-13-dllm
+- !!merge <<: *dllm
+  name: "cuda13-nvidia-l4t-arm64-dllm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-dllm"
+  mirrors:
+    - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-dllm
+- !!merge <<: *dllm
+  name: "cuda13-nvidia-l4t-arm64-dllm-development"
+  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-dllm"
+  mirrors:
+    - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-dllm
 ## whisper
 - !!merge <<: *whispercpp
   name: "whisper-development"
diff --git a/core/http/endpoints/localai/backend.go b/core/http/endpoints/localai/backend.go
index cbda648d6..29ac18704 100644
--- a/core/http/endpoints/localai/backend.go
+++ b/core/http/endpoints/localai/backend.go
@@ -25,6 +25,10 @@ var knownPrefOnlyBackends = []schema.KnownBackend{
 	// Text LLM
 	// ds4: antirez/ds4 - single-model DeepSeek V4 Flash engine; auto-detected via DS4Importer
 	{Name: "ds4", Modality: "text", AutoDetect: false, Description: "antirez/ds4 DeepSeek V4 Flash engine (auto-detected; pref-only fallback)"},
+	// dllm consumes GGUF weights like llama-cpp does, but only for the
+	// DiffusionGemma architecture - auto-detecting on .gguf would shadow
+	// llama-cpp, so it stays preference-only.
+	{Name: "dllm", Modality: "text", AutoDetect: false, Description: "dllm.cpp DiffusionGemma block-diffusion engine (preference-only)"},
 	{Name: "sglang", Modality: "text", AutoDetect: false, Description: "SGLang runtime (preference-only)"},
 	{Name: "tinygrad", Modality: "text", AutoDetect: false, Description: "tinygrad runtime (preference-only)"},
 	{Name: "trl", Modality: "text", AutoDetect: false, Description: "Transformers Reinforcement Learning (preference-only)"},
diff --git a/core/http/endpoints/localai/backend_test.go b/core/http/endpoints/localai/backend_test.go
index 0c21bb7b4..70877c1b4 100644
--- a/core/http/endpoints/localai/backend_test.go
+++ b/core/http/endpoints/localai/backend_test.go
@@ -135,6 +135,7 @@ var _ = Describe("Backend Endpoints", func() {
 				Expect(entry.Modality).To(Equal(modality))
 			}
 
+			expectPrefOnly("dllm", "text")
 			expectPrefOnly("sglang", "text")
 			expectPrefOnly("tinygrad", "text")
 			expectPrefOnly("trl", "text")