From 52b3b68ceaefc12cbfd7a2dce1fa3d191cf151b2 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Thu, 11 Jun 2026 17:05:18 +0000 Subject: [PATCH] feat(dllm): backend packaging, gallery index, CI matrix Registers the dllm backend across every surface: backend gallery index (cpu amd64+arm64 with manifest merge, cuda13, l4t-cuda13 for GB10-class hardware; no darwin per engine scope), top-level Makefile targets, bump_deps pin tracking for DLLM_VERSION, and the curated known-backends list for /backends/known (pref-only: auto-detecting on .gguf would shadow llama-cpp). Note: image builds and the nightly bump leg stay red until github.com/mudler/dllm.cpp is published (planned at merge time). Assisted-by: Claude Code (Fable 5) Signed-off-by: Ettore Di Giacinto --- .github/backend-matrix.yml | 55 +++++++++++++++++++ .github/workflows/bump_deps.yaml | 4 ++ Makefile | 6 +- backend/go/dllm/Makefile | 4 ++ backend/go/dllm/dllm.go | 4 +- backend/index.yaml | 61 +++++++++++++++++++++ core/http/endpoints/localai/backend.go | 4 ++ core/http/endpoints/localai/backend_test.go | 1 + 8 files changed, 136 insertions(+), 3 deletions(-) mode change 100644 => 100755 backend/go/dllm/Makefile diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index 464ffc36c..997f5def0 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -1608,6 +1608,19 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-dllm' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "dllm" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -1647,6 +1660,19 @@ include: backend: "parakeet-cpp" dockerfile: "./backend/Dockerfile.golang" context: "./" + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-dllm' + base-image: "ubuntu:24.04" + ubuntu-version: '2404' + runs-on: 'ubuntu-24.04-arm' + backend: "dllm" + dockerfile: "./backend/Dockerfile.golang" + context: "./" - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -3145,6 +3171,35 @@ include: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + # dllm + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-dllm' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "dllm" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-dllm' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "dllm" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'sycl_f32' cuda-major-version: "" cuda-minor-version: "" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 5f1ac0c21..5572262d1 100644 --- a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -38,6 +38,10 @@ jobs: variable: "PARAKEET_VERSION" branch: "master" file: "backend/go/parakeet-cpp/Makefile" + - repository: "mudler/dllm.cpp" + variable: "DLLM_VERSION" + branch: "main" + file: "backend/go/dllm/Makefile" - repository: "leejet/stable-diffusion.cpp" variable: "STABLEDIFFUSION_GGML_VERSION" branch: "master" diff --git a/Makefile b/Makefile index cafcdd44a..89cc6bf01 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/dllm backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio GOCMD=go GOTEST=$(GOCMD) test @@ -1171,6 +1171,9 @@ BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|tr BACKEND_WHISPER = whisper|golang|.|false|true BACKEND_CRISPASR = crispasr|golang|.|false|true BACKEND_PARAKEET_CPP = parakeet-cpp|golang|.|false|true +# dllm is mudler/dllm.cpp, the DiffusionGemma block-diffusion engine, +# wrapped by the purego backend at backend/go/dllm. +BACKEND_DLLM = dllm|golang|.|false|true BACKEND_VOXTRAL = voxtral|golang|.|false|true BACKEND_ACESTEP_CPP = acestep-cpp|golang|.|false|true BACKEND_QWEN3_TTS_CPP = qwen3-tts-cpp|golang|.|false|true @@ -1260,6 +1263,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML))) $(eval $(call generate-docker-build-target,$(BACKEND_WHISPER))) $(eval $(call generate-docker-build-target,$(BACKEND_CRISPASR))) $(eval $(call generate-docker-build-target,$(BACKEND_PARAKEET_CPP))) +$(eval $(call generate-docker-build-target,$(BACKEND_DLLM))) $(eval $(call generate-docker-build-target,$(BACKEND_VOXTRAL))) $(eval $(call generate-docker-build-target,$(BACKEND_OPUS))) $(eval $(call generate-docker-build-target,$(BACKEND_RERANKERS))) diff --git a/backend/go/dllm/Makefile b/backend/go/dllm/Makefile old mode 100644 new mode 100755 index 3b7114c12..1e0825c73 --- a/backend/go/dllm/Makefile +++ b/backend/go/dllm/Makefile @@ -14,6 +14,10 @@ # That's what the gated C-ABI binding smoke uses (DLLM_TEST_LIBRARY). The # default target below does the proper clone-at-pin + cmake build so CI # doesn't need a side-checkout. +# +# NOTE: github.com/mudler/dllm.cpp is still private (publishing is planned); +# until then the anonymous clone below fails. Use the symlink shortcut above +# with a local checkout, or a git credential helper with access to the repo. DLLM_VERSION?=b22fcebebfb225131113188599a9ae542b2935d7 DLLM_REPO?=https://github.com/mudler/dllm.cpp diff --git a/backend/go/dllm/dllm.go b/backend/go/dllm/dllm.go index cd82ff0b3..17d46de2f 100755 --- a/backend/go/dllm/dllm.go +++ b/backend/go/dllm/dllm.go @@ -275,8 +275,8 @@ func (d *Dllm) requestOptsJSON(opts *pb.PredictOptions) (string, error) { // The engine rounds n_predict UP to a whole number of diffusion // blocks (the canvas is denoised block-wise), so the completion may // run slightly past the requested budget. Tokens==0 omits the key so - // the engine's GGUF-metadata default applies (the C-ABI documents - // per-key defaults; no hardcoded 256 like ds4's grpc-server). + // the C-ABI default of 256 applies (hardcoded in capi.cpp's + // parse_gen_opts, independent of canvas_length). m["n_predict"] = n } if s := opts.GetSeed(); s > 0 { diff --git a/backend/index.yaml b/backend/index.yaml index 37e689071..508e87b24 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -95,6 +95,29 @@ nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4" metal: "metal-ds4" metal-darwin-arm64: "metal-ds4" +- &dllm + name: "dllm" + alias: "dllm" + license: mit + description: | + mudler/dllm.cpp - DiffusionGemma block-diffusion LLM inference engine + (C++/ggml, GGUF weights). Decodes whole token canvases per diffusion + round instead of autoregressive sampling. Runs on CPU and NVIDIA CUDA 13 + (including Jetson/GB10 L4T targets). + urls: + - https://github.com/mudler/dllm.cpp + tags: + - text-to-text + - LLM + - gguf + - diffusion + - CPU + - CUDA + capabilities: + default: "cpu-dllm" + nvidia: "cuda13-dllm" + nvidia-cuda-13: "cuda13-dllm" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-dllm" - &whispercpp name: "whisper" alias: "whisper" @@ -1272,6 +1295,13 @@ nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4-development" metal: "metal-ds4-development" metal-darwin-arm64: "metal-ds4-development" +- !!merge <<: *dllm + name: "dllm-development" + capabilities: + default: "cpu-dllm-development" + nvidia: "cuda13-dllm-development" + nvidia-cuda-13: "cuda13-dllm-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-dllm-development" - !!merge <<: *stablediffusionggml name: "stablediffusion-ggml-development" capabilities: @@ -1859,6 +1889,37 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ds4" mirrors: - localai/localai-backends:master-metal-darwin-arm64-ds4 +## dllm +- !!merge <<: *dllm + name: "cpu-dllm" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-dllm" + mirrors: + - localai/localai-backends:latest-cpu-dllm +- !!merge <<: *dllm + name: "cpu-dllm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-dllm" + mirrors: + - localai/localai-backends:master-cpu-dllm +- !!merge <<: *dllm + name: "cuda13-dllm" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-dllm" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-dllm +- !!merge <<: *dllm + name: "cuda13-dllm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-dllm" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-dllm +- !!merge <<: *dllm + name: "cuda13-nvidia-l4t-arm64-dllm" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-dllm" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-dllm +- !!merge <<: *dllm + name: "cuda13-nvidia-l4t-arm64-dllm-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-dllm" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-dllm ## whisper - !!merge <<: *whispercpp name: "whisper-development" diff --git a/core/http/endpoints/localai/backend.go b/core/http/endpoints/localai/backend.go index cbda648d6..29ac18704 100644 --- a/core/http/endpoints/localai/backend.go +++ b/core/http/endpoints/localai/backend.go @@ -25,6 +25,10 @@ var knownPrefOnlyBackends = []schema.KnownBackend{ // Text LLM // ds4: antirez/ds4 - single-model DeepSeek V4 Flash engine; auto-detected via DS4Importer {Name: "ds4", Modality: "text", AutoDetect: false, Description: "antirez/ds4 DeepSeek V4 Flash engine (auto-detected; pref-only fallback)"}, + // dllm consumes GGUF weights like llama-cpp does, but only for the + // DiffusionGemma architecture - auto-detecting on .gguf would shadow + // llama-cpp, so it stays preference-only. + {Name: "dllm", Modality: "text", AutoDetect: false, Description: "dllm.cpp DiffusionGemma block-diffusion engine (preference-only)"}, {Name: "sglang", Modality: "text", AutoDetect: false, Description: "SGLang runtime (preference-only)"}, {Name: "tinygrad", Modality: "text", AutoDetect: false, Description: "tinygrad runtime (preference-only)"}, {Name: "trl", Modality: "text", AutoDetect: false, Description: "Transformers Reinforcement Learning (preference-only)"}, diff --git a/core/http/endpoints/localai/backend_test.go b/core/http/endpoints/localai/backend_test.go index 0c21bb7b4..70877c1b4 100644 --- a/core/http/endpoints/localai/backend_test.go +++ b/core/http/endpoints/localai/backend_test.go @@ -135,6 +135,7 @@ var _ = Describe("Backend Endpoints", func() { Expect(entry.Modality).To(Equal(modality)) } + expectPrefOnly("dllm", "text") expectPrefOnly("sglang", "text") expectPrefOnly("tinygrad", "text") expectPrefOnly("trl", "text")