mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-12 02:38:19 -04:00
feat(dllm): backend packaging, gallery index, CI matrix
Registers the dllm backend on every surface that needs to know about it: the backend gallery index (CPU for amd64 and arm64 with a manifest merge, cuda13, and l4t-cuda13 for GB10-class hardware; no darwin build, in line with the engine's scope), the top-level Makefile targets, bump_deps pin tracking for DLLM_VERSION, and the curated known-backends list served at /backends/known (preference-only, because auto-detecting on .gguf would shadow llama-cpp). Note: image builds and the nightly bump leg will stay red until github.com/mudler/dllm.cpp is published (planned at merge time).
Assisted-by: Claude Code (Fable 5)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
55
.github/backend-matrix.yml
vendored
55
.github/backend-matrix.yml
vendored
@@ -1608,6 +1608,19 @@ include:
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-13-dllm'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "dllm"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
@@ -1647,6 +1660,19 @@ include:
|
||||
backend: "parakeet-cpp"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
skip-drivers: 'false'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-cuda-13-arm64-dllm'
|
||||
base-image: "ubuntu:24.04"
|
||||
ubuntu-version: '2404'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
backend: "dllm"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
@@ -3145,6 +3171,35 @@ include:
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
# dllm
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platform-tag: 'amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-dllm'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "dllm"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/arm64'
|
||||
platform-tag: 'arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-dllm'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "dllm"
|
||||
dockerfile: "./backend/Dockerfile.golang"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'sycl_f32'
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
|
||||
4
.github/workflows/bump_deps.yaml
vendored
4
.github/workflows/bump_deps.yaml
vendored
@@ -38,6 +38,10 @@ jobs:
|
||||
variable: "PARAKEET_VERSION"
|
||||
branch: "master"
|
||||
file: "backend/go/parakeet-cpp/Makefile"
|
||||
- repository: "mudler/dllm.cpp"
|
||||
variable: "DLLM_VERSION"
|
||||
branch: "main"
|
||||
file: "backend/go/dllm/Makefile"
|
||||
- repository: "leejet/stable-diffusion.cpp"
|
||||
variable: "STABLEDIFFUSION_GGML_VERSION"
|
||||
branch: "master"
|
||||
|
||||
6
Makefile
6
Makefile
@@ -1,5 +1,5 @@
|
||||
# Disable parallel execution for backend builds
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/crispasr backends/parakeet-cpp backends/dllm backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/rfdetr-cpp backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin backends/liquid-audio
|
||||
|
||||
GOCMD=go
|
||||
GOTEST=$(GOCMD) test
|
||||
@@ -1171,6 +1171,9 @@ BACKEND_STABLEDIFFUSION_GGML = stablediffusion-ggml|golang|.|--progress=plain|tr
|
||||
BACKEND_WHISPER = whisper|golang|.|false|true
|
||||
BACKEND_CRISPASR = crispasr|golang|.|false|true
|
||||
BACKEND_PARAKEET_CPP = parakeet-cpp|golang|.|false|true
|
||||
# dllm is mudler/dllm.cpp, the DiffusionGemma block-diffusion engine,
|
||||
# wrapped by the purego backend at backend/go/dllm.
|
||||
BACKEND_DLLM = dllm|golang|.|false|true
|
||||
BACKEND_VOXTRAL = voxtral|golang|.|false|true
|
||||
BACKEND_ACESTEP_CPP = acestep-cpp|golang|.|false|true
|
||||
BACKEND_QWEN3_TTS_CPP = qwen3-tts-cpp|golang|.|false|true
|
||||
@@ -1260,6 +1263,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_STABLEDIFFUSION_GGML)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_WHISPER)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_CRISPASR)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_PARAKEET_CPP)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_DLLM)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_VOXTRAL)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_OPUS)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_RERANKERS)))
|
||||
|
||||
4
backend/go/dllm/Makefile
Normal file → Executable file
4
backend/go/dllm/Makefile
Normal file → Executable file
@@ -14,6 +14,10 @@
|
||||
# That's what the gated C-ABI binding smoke uses (DLLM_TEST_LIBRARY). The
|
||||
# default target below does the proper clone-at-pin + cmake build so CI
|
||||
# doesn't need a side-checkout.
|
||||
#
|
||||
# NOTE: github.com/mudler/dllm.cpp is still private (publishing is planned);
|
||||
# until then the anonymous clone below fails. Use the symlink shortcut above
|
||||
# with a local checkout, or a git credential helper with access to the repo.
|
||||
|
||||
DLLM_VERSION?=b22fcebebfb225131113188599a9ae542b2935d7
|
||||
DLLM_REPO?=https://github.com/mudler/dllm.cpp
|
||||
|
||||
@@ -275,8 +275,8 @@ func (d *Dllm) requestOptsJSON(opts *pb.PredictOptions) (string, error) {
|
||||
// The engine rounds n_predict UP to a whole number of diffusion
|
||||
// blocks (the canvas is denoised block-wise), so the completion may
|
||||
// run slightly past the requested budget. Tokens==0 omits the key so
|
||||
// the engine's GGUF-metadata default applies (the C-ABI documents
|
||||
// per-key defaults; no hardcoded 256 like ds4's grpc-server).
|
||||
// the C-ABI default of 256 applies (hardcoded in capi.cpp's
|
||||
// parse_gen_opts, independent of canvas_length).
|
||||
m["n_predict"] = n
|
||||
}
|
||||
if s := opts.GetSeed(); s > 0 {
|
||||
|
||||
@@ -95,6 +95,29 @@
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4"
|
||||
metal: "metal-ds4"
|
||||
metal-darwin-arm64: "metal-ds4"
|
||||
- &dllm
|
||||
name: "dllm"
|
||||
alias: "dllm"
|
||||
license: mit
|
||||
description: |
|
||||
mudler/dllm.cpp - DiffusionGemma block-diffusion LLM inference engine
|
||||
(C++/ggml, GGUF weights). Decodes whole token canvases per diffusion
|
||||
round instead of autoregressive sampling. Runs on CPU and NVIDIA CUDA 13
|
||||
(including Jetson/GB10 L4T targets).
|
||||
urls:
|
||||
- https://github.com/mudler/dllm.cpp
|
||||
tags:
|
||||
- text-to-text
|
||||
- LLM
|
||||
- gguf
|
||||
- diffusion
|
||||
- CPU
|
||||
- CUDA
|
||||
capabilities:
|
||||
default: "cpu-dllm"
|
||||
nvidia: "cuda13-dllm"
|
||||
nvidia-cuda-13: "cuda13-dllm"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-dllm"
|
||||
- &whispercpp
|
||||
name: "whisper"
|
||||
alias: "whisper"
|
||||
@@ -1272,6 +1295,13 @@
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4-development"
|
||||
metal: "metal-ds4-development"
|
||||
metal-darwin-arm64: "metal-ds4-development"
|
||||
- !!merge <<: *dllm
|
||||
name: "dllm-development"
|
||||
capabilities:
|
||||
default: "cpu-dllm-development"
|
||||
nvidia: "cuda13-dllm-development"
|
||||
nvidia-cuda-13: "cuda13-dllm-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-dllm-development"
|
||||
- !!merge <<: *stablediffusionggml
|
||||
name: "stablediffusion-ggml-development"
|
||||
capabilities:
|
||||
@@ -1859,6 +1889,37 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ds4"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-ds4
|
||||
## dllm
|
||||
- !!merge <<: *dllm
|
||||
name: "cpu-dllm"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-dllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-cpu-dllm
|
||||
- !!merge <<: *dllm
|
||||
name: "cpu-dllm-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-cpu-dllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-cpu-dllm
|
||||
- !!merge <<: *dllm
|
||||
name: "cuda13-dllm"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-dllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-13-dllm
|
||||
- !!merge <<: *dllm
|
||||
name: "cuda13-dllm-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-dllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-dllm
|
||||
- !!merge <<: *dllm
|
||||
name: "cuda13-nvidia-l4t-arm64-dllm"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-dllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-dllm
|
||||
- !!merge <<: *dllm
|
||||
name: "cuda13-nvidia-l4t-arm64-dllm-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-dllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-dllm
|
||||
## whisper
|
||||
- !!merge <<: *whispercpp
|
||||
name: "whisper-development"
|
||||
|
||||
@@ -25,6 +25,10 @@ var knownPrefOnlyBackends = []schema.KnownBackend{
|
||||
// Text LLM
|
||||
// ds4: antirez/ds4 - single-model DeepSeek V4 Flash engine; auto-detected via DS4Importer
|
||||
{Name: "ds4", Modality: "text", AutoDetect: false, Description: "antirez/ds4 DeepSeek V4 Flash engine (auto-detected; pref-only fallback)"},
|
||||
// dllm consumes GGUF weights like llama-cpp does, but only for the
|
||||
// DiffusionGemma architecture - auto-detecting on .gguf would shadow
|
||||
// llama-cpp, so it stays preference-only.
|
||||
{Name: "dllm", Modality: "text", AutoDetect: false, Description: "dllm.cpp DiffusionGemma block-diffusion engine (preference-only)"},
|
||||
{Name: "sglang", Modality: "text", AutoDetect: false, Description: "SGLang runtime (preference-only)"},
|
||||
{Name: "tinygrad", Modality: "text", AutoDetect: false, Description: "tinygrad runtime (preference-only)"},
|
||||
{Name: "trl", Modality: "text", AutoDetect: false, Description: "Transformers Reinforcement Learning (preference-only)"},
|
||||
|
||||
@@ -135,6 +135,7 @@ var _ = Describe("Backend Endpoints", func() {
|
||||
Expect(entry.Modality).To(Equal(modality))
|
||||
}
|
||||
|
||||
expectPrefOnly("dllm", "text")
|
||||
expectPrefOnly("sglang", "text")
|
||||
expectPrefOnly("tinygrad", "text")
|
||||
expectPrefOnly("trl", "text")
|
||||
|
||||
Reference in New Issue
Block a user