diff --git a/.agents/ds4-backend.md b/.agents/ds4-backend.md new file mode 100644 index 000000000..691ec88cb --- /dev/null +++ b/.agents/ds4-backend.md @@ -0,0 +1,80 @@ +# Working on the ds4 Backend + +`antirez/ds4` is a single-model inference engine for DeepSeek V4 Flash. +LocalAI wraps the engine's C API (`ds4/ds4.h`) with a fresh C++ gRPC server at +`backend/cpp/ds4/` - NOT a fork of llama-cpp's grpc-server.cpp. + +## Pin + +`backend/cpp/ds4/prepare.sh` clones `antirez/ds4` at `DS4_VERSION`. Bump that +commit to follow upstream. + +## Wire shape + +| RPC | Implementation | +|---|---| +| Health, Free, Status | Trivial; no engine dependency for Health | +| LoadModel | `ds4_engine_open` + `ds4_session_create`; backend is compile-time (DS4_NO_GPU → CPU, __APPLE__ → Metal, otherwise CUDA) | +| TokenizeString | `ds4_tokenize_text` | +| Predict | `ds4_engine_generate_argmax` + `DsmlParser` → one ChatDelta with content / reasoning_content / tool_calls[] | +| PredictStream | Same, per-token ChatDelta writes | + +## DSML + +ds4 emits tool calls as literal text markers (`<|DSML|tool_calls>` etc.) - +NOT special tokens. `dsml_parser.{h,cpp}` is our streaming state machine that +classifies token bytes into CONTENT / REASONING / TOOL_START / TOOL_ARGS / TOOL_END +events. `dsml_renderer.{h,cpp}` does the prompt direction: turns +OpenAI tool_calls + role=tool messages back into DSML for the next turn. + +## Thinking modes + +`PredictOptions.Metadata["enable_thinking"]` gates thinking on/off (default ON). +`["reasoning_effort"] == "max" | "xhigh"` selects `DS4_THINK_MAX`; anything else +maps to `DS4_THINK_HIGH`. We pass the chosen mode to `ds4_chat_append_assistant_prefix`. + +## Disk KV cache + +`kv_cache.{h,cpp}` implements an SHA1-keyed file cache using ds4's public +`ds4_session_save_payload` / `ds4_session_load_payload` API. Enable per request +via `ModelOptions.Options[] = "kv_cache_dir:/some/path"`. 
Format is **our own** - +NOT bit-compatible with ds4-server's KVC files (interop is a follow-up plan). + +## Build matrix + +| Build | Where | Notes | +|---|---|---| +| `cpu-ds4` (amd64 + arm64) | Linux GHA | ds4 considers CPU debug-only; useful only for wiring tests | +| `cuda13-ds4` (amd64 + arm64) | Linux GHA + DGX Spark validation | Primary production path on Linux | +| `ds4-darwin` (arm64) | macOS GHA runners | Metal; uses `scripts/build/ds4-darwin.sh` like llama-cpp-darwin | + +cuda12 is intentionally omitted. ROCm / Vulkan / SYCL are not applicable. + +## Hardware-gated validation + +`tests/e2e-backends/backend_test.go` in `BACKEND_BINARY` mode: + +``` +BACKEND_BINARY=$(pwd)/backend/cpp/ds4/package/run.sh \ +BACKEND_TEST_MODEL_FILE=/path/to/ds4flash.gguf \ +BACKEND_TEST_CAPS=health,load,predict,stream,tools \ +BACKEND_TEST_TOOL_PROMPT="What's the weather in Paris?" \ +go test -count=1 -timeout=30m -v ./tests/e2e-backends/... +``` + +CI does not load the model; the suite is opt-in via env vars. + +## Importer + +`core/gallery/importers/ds4.go` (`DS4Importer`) auto-detects ds4 weights by +matching the `antirez/deepseek-v4-gguf` repo URI or the +`DeepSeek-V4-Flash-*.gguf` filename pattern. **Registered BEFORE +`LlamaCPPImporter`** in `defaultImporters` - both match `.gguf` but ds4 is more +specific, and first-match-wins. The importer emits `backend: ds4`, uses +`ds4flash.gguf` as the local filename (matches ds4's own CLI default), and +disables the Go-side automatic tool-parsing fallback (the C++ backend emits +ChatDelta.tool_calls natively via `DsmlParser`). + +ds4 is also listed in `core/http/endpoints/localai/backend.go`'s pref-only +slice so the `/import-model` UI surfaces it as a manual choice for users who +want to force the backend on a non-canonical URI. 
diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index f957cea03..903d415ab 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -948,6 +948,32 @@ include: backend: "turboquant" dockerfile: "./backend/Dockerfile.turboquant" context: "./" + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-ds4' + runs-on: 'ubuntu-latest' + base-image: "nvidia/cuda:13.0.0-devel-ubuntu24.04" + skip-drivers: 'true' + backend: "ds4" + dockerfile: "./backend/Dockerfile.ds4" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'true' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-ds4' + base-image: "nvidia/cuda:13.0.0-devel-ubuntu24.04" + runs-on: 'ubuntu-24.04-arm' + ubuntu-version: '2404' + backend: "ds4" + dockerfile: "./backend/Dockerfile.ds4" + context: "./" - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -2321,6 +2347,34 @@ include: dockerfile: "./backend/Dockerfile.turboquant" context: "./" ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + platform-tag: 'amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-ds4' + runs-on: 'ubuntu-latest' + base-image: "nvidia/cuda:13.0.0-devel-ubuntu24.04" + skip-drivers: 'true' + backend: "ds4" + dockerfile: "./backend/Dockerfile.ds4" + context: "./" + ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/arm64' + platform-tag: 'arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-ds4' + runs-on: 'ubuntu-24.04-arm' + base-image: "nvidia/cuda:13.0.0-devel-ubuntu24.04" + skip-drivers: 'true' + backend: "ds4" + dockerfile: "./backend/Dockerfile.ds4" + context: "./" + ubuntu-version: '2404' - build-type: '' cuda-major-version: "" 
cuda-minor-version: "" diff --git a/.github/workflows/backend_build_darwin.yml b/.github/workflows/backend_build_darwin.yml index 07763b4f7..ac39389f3 100644 --- a/.github/workflows/backend_build_darwin.yml +++ b/.github/workflows/backend_build_darwin.yml @@ -211,8 +211,13 @@ jobs: make protogen-go make backends/llama-cpp-darwin + - name: Build ds4 backend (Darwin Metal) + if: inputs.backend == 'ds4' + run: | + make backends/ds4-darwin + - name: Build ${{ inputs.backend }}-darwin - if: inputs.backend != 'llama-cpp' + if: inputs.backend != 'llama-cpp' && inputs.backend != 'ds4' run: | make protogen-go BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} USE_PIP=${{ inputs.use-pip }} make build-darwin-${{ inputs.lang }}-backend diff --git a/AGENTS.md b/AGENTS.md index 3f6fe96fb..e184ab1c4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,6 +25,7 @@ LocalAI follows the Linux kernel project's [guidelines for AI coding assistants] | [.agents/llama-cpp-backend.md](.agents/llama-cpp-backend.md) | Working on the llama.cpp backend — architecture, updating, tool call parsing | | [.agents/vllm-backend.md](.agents/vllm-backend.md) | Working on the vLLM / vLLM-omni backends — native parsers, ChatDelta, CPU build, libnuma packaging, backend hooks | | [.agents/sglang-backend.md](.agents/sglang-backend.md) | Working on the SGLang backend — `engine_args` validation against ServerArgs, speculative-decoding (EAGLE/EAGLE3/DFLASH/MTP) recipes, parser handling | +| [.agents/ds4-backend.md](.agents/ds4-backend.md) | Working on the ds4 backend - DSML state machine, thinking modes, KV cache, Metal+CUDA matrix | | [.agents/testing-mcp-apps.md](.agents/testing-mcp-apps.md) | Testing MCP Apps (interactive tool UIs) in the React UI | | [.agents/api-endpoints-and-auth.md](.agents/api-endpoints-and-auth.md) | Adding API endpoints, auth middleware, feature permissions, user access control | | [.agents/debugging-backends.md](.agents/debugging-backends.md) | Debugging runtime backend 
failures, dependency conflicts, rebuilding backends | diff --git a/Makefile b/Makefile index 49556077e..488018b0d 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/turboquant backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/insightface backends/speaker-recognition backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/sglang backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts 
backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp backends/qwen3-tts-cpp backends/vibevoice-cpp backends/localvqe backends/tinygrad backends/sherpa-onnx backends/ds4 backends/ds4-darwin GOCMD=go GOTEST=$(GOCMD) test @@ -1009,6 +1009,10 @@ backends/llama-cpp-darwin: build bash ./scripts/build/llama-cpp-darwin.sh ./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)" +backends/ds4-darwin: build + bash ./scripts/build/ds4-darwin.sh + ./local-ai backends install "ocifile://$(abspath ./backend-images/ds4.tar)" + build-darwin-python-backend: build bash ./scripts/build/python-darwin.sh @@ -1050,6 +1054,10 @@ BACKEND_IK_LLAMA_CPP = ik-llama-cpp|ik-llama-cpp|.|false|false # turboquant is a llama.cpp fork with TurboQuant KV-cache quantization. # Reuses backend/cpp/llama-cpp grpc-server sources via a thin wrapper Makefile. BACKEND_TURBOQUANT = turboquant|turboquant|.|false|false +# ds4 is antirez/ds4, a DeepSeek V4 Flash-specific inference engine. +# Single-model; hardware-only validation lives at tests/e2e-backends/ +# (BACKEND_BINARY mode); see docs/superpowers/plans/2026-05-11-ds4-backend.md. 
+BACKEND_DS4 = ds4|ds4|.|false|false # Golang backends BACKEND_PIPER = piper|golang|.|false|true @@ -1135,6 +1143,7 @@ endef $(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_IK_LLAMA_CPP))) $(eval $(call generate-docker-build-target,$(BACKEND_TURBOQUANT))) +$(eval $(call generate-docker-build-target,$(BACKEND_DS4))) $(eval $(call generate-docker-build-target,$(BACKEND_PIPER))) $(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE))) $(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE))) @@ -1188,7 +1197,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_SHERPA_ONNX))) docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx +docker-build-backends: docker-build-llama-cpp docker-build-ik-llama-cpp docker-build-turboquant docker-build-ds4 docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-sglang docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro 
docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-tinygrad docker-build-kokoros docker-build-sam3-cpp docker-build-qwen3-tts-cpp docker-build-vibevoice-cpp docker-build-localvqe docker-build-insightface docker-build-speaker-recognition docker-build-sherpa-onnx ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/Dockerfile.ds4 b/backend/Dockerfile.ds4 new file mode 100644 index 000000000..370d1eaae --- /dev/null +++ b/backend/Dockerfile.ds4 @@ -0,0 +1,41 @@ +ARG BASE_IMAGE=ubuntu:24.04 +ARG APT_MIRROR="" +ARG APT_PORTS_MIRROR="" + +# BASE_IMAGE is either ubuntu:24.04 (for cpu builds) or nvidia/cuda:13.0.0-devel-ubuntu24.04 +# (for cublas builds). Both ship apt + Ubuntu Noble packages; the nvidia/cuda base +# additionally provides /usr/local/cuda. Darwin (Metal) builds bypass this Dockerfile +# entirely via scripts/build/ds4-darwin.sh. +FROM ${BASE_IMAGE} AS builder +ARG BUILD_TYPE +ARG TARGETARCH +ARG TARGETVARIANT + +ENV BUILD_TYPE=${BUILD_TYPE} \ + DEBIAN_FRONTEND=noninteractive \ + PATH=/usr/local/cuda/bin:${PATH} + +WORKDIR /build + +# Install build-time deps via plain apt - install-base-deps.sh's full pipeline +# (CUDA keyring + from-source gRPC) is unnecessary here: +# - CUDA: when BASE_IMAGE=nvidia/cuda:*, /usr/local/cuda is already populated; +# for the cpu build we don't need CUDA at all. +# - gRPC/Protobuf: system apt packages are sufficient; ds4's wrapper only links +# against them, it doesn't ship the gRPC source tree. +# - nlohmann-json: dsml_renderer's only third-party dep. 
+RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + git cmake build-essential pkg-config ca-certificates \ + libgrpc++-dev libprotobuf-dev protobuf-compiler protobuf-compiler-grpc \ + nlohmann-json3-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +COPY . /LocalAI + +RUN --mount=type=cache,target=/root/.ccache,id=ds4-ccache-${TARGETARCH}-${BUILD_TYPE},sharing=locked \ + make -C /LocalAI/backend/cpp/ds4 BUILD_TYPE=${BUILD_TYPE} NATIVE=false grpc-server package + +FROM scratch +COPY --from=builder /LocalAI/backend/cpp/ds4/package/. ./ diff --git a/backend/cpp/ds4/.gitignore b/backend/cpp/ds4/.gitignore new file mode 100644 index 000000000..a9f016206 --- /dev/null +++ b/backend/cpp/ds4/.gitignore @@ -0,0 +1,9 @@ +ds4/ +build/ +package/ +grpc-server +*.o +backend.pb.cc +backend.pb.h +backend.grpc.pb.cc +backend.grpc.pb.h diff --git a/backend/cpp/ds4/CMakeLists.txt b/backend/cpp/ds4/CMakeLists.txt new file mode 100644 index 000000000..efda49ac2 --- /dev/null +++ b/backend/cpp/ds4/CMakeLists.txt @@ -0,0 +1,101 @@ +cmake_minimum_required(VERSION 3.15) +project(ds4-grpc-server LANGUAGES CXX C) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(TARGET grpc-server) + +option(DS4_NATIVE "Compile with -march=native / -mcpu=native" ON) +set(DS4_GPU "cpu" CACHE STRING "GPU backend: cpu, cuda, or metal") +set(DS4_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ds4" CACHE PATH "Path to cloned ds4 source") + +find_package(Threads REQUIRED) +find_package(Protobuf CONFIG QUIET) +if(NOT Protobuf_FOUND) + find_package(Protobuf REQUIRED) +endif() +find_package(gRPC CONFIG QUIET) +if(NOT gRPC_FOUND) + # Ubuntu's apt-installed grpc++ does not ship a CMake config - fall back. 
+ find_library(GRPCPP_LIB grpc++ REQUIRED) + find_library(GRPCPP_REFLECTION_LIB grpc++_reflection REQUIRED) + add_library(gRPC::grpc++ INTERFACE IMPORTED) + set_target_properties(gRPC::grpc++ PROPERTIES INTERFACE_LINK_LIBRARIES "${GRPCPP_LIB}") + add_library(gRPC::grpc++_reflection INTERFACE IMPORTED) + set_target_properties(gRPC::grpc++_reflection PROPERTIES INTERFACE_LINK_LIBRARIES "${GRPCPP_REFLECTION_LIB}") +endif() + +find_program(_PROTOC NAMES protoc REQUIRED) +find_program(_GRPC_CPP_PLUGIN NAMES grpc_cpp_plugin REQUIRED) + +get_filename_component(HW_PROTO "${CMAKE_CURRENT_SOURCE_DIR}/../../backend.proto" ABSOLUTE) +get_filename_component(HW_PROTO_PATH "${HW_PROTO}" PATH) + +set(HW_PROTO_SRCS "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc") +set(HW_PROTO_HDRS "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h") +set(HW_GRPC_SRCS "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc") +set(HW_GRPC_HDRS "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h") + +add_custom_command( + OUTPUT "${HW_PROTO_SRCS}" "${HW_PROTO_HDRS}" "${HW_GRPC_SRCS}" "${HW_GRPC_HDRS}" + COMMAND ${_PROTOC} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" + -I "${HW_PROTO_PATH}" + --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN}" + "${HW_PROTO}" + DEPENDS "${HW_PROTO}") + +add_library(hw_grpc_proto STATIC + ${HW_GRPC_SRCS} ${HW_GRPC_HDRS} + ${HW_PROTO_SRCS} ${HW_PROTO_HDRS}) +target_include_directories(hw_grpc_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) + +set(DS4_OBJS "${DS4_DIR}/ds4.o") +if(DS4_GPU STREQUAL "cuda") + list(APPEND DS4_OBJS "${DS4_DIR}/ds4_cuda.o") +elseif(DS4_GPU STREQUAL "metal") + list(APPEND DS4_OBJS "${DS4_DIR}/ds4_metal.o") +elseif(DS4_GPU STREQUAL "cpu") + set(DS4_OBJS "${DS4_DIR}/ds4_cpu.o") +endif() + +add_executable(${TARGET} + grpc-server.cpp + dsml_parser.cpp + dsml_renderer.cpp + kv_cache.cpp) + +target_include_directories(${TARGET} PRIVATE ${DS4_DIR}) + +foreach(obj ${DS4_OBJS}) + target_sources(${TARGET} PRIVATE ${obj}) + 
set_source_files_properties(${obj} PROPERTIES EXTERNAL_OBJECT TRUE GENERATED TRUE) +endforeach() + +target_link_libraries(${TARGET} PRIVATE + hw_grpc_proto + gRPC::grpc++ + gRPC::grpc++_reflection + protobuf::libprotobuf + Threads::Threads + m) + +if(DS4_GPU STREQUAL "cuda") + find_package(CUDAToolkit REQUIRED) + target_link_libraries(${TARGET} PRIVATE CUDA::cudart CUDA::cublas) +elseif(DS4_GPU STREQUAL "metal") + find_library(FOUNDATION_LIB Foundation REQUIRED) + find_library(METAL_LIB Metal REQUIRED) + target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_LIB} ${METAL_LIB}) +elseif(DS4_GPU STREQUAL "cpu") + target_compile_definitions(${TARGET} PRIVATE DS4_NO_GPU) +endif() + +if(DS4_NATIVE) + if(APPLE) + target_compile_options(${TARGET} PRIVATE -mcpu=native) + else() + target_compile_options(${TARGET} PRIVATE -march=native) + endif() +endif() diff --git a/backend/cpp/ds4/Makefile b/backend/cpp/ds4/Makefile new file mode 100644 index 000000000..b702713d6 --- /dev/null +++ b/backend/cpp/ds4/Makefile @@ -0,0 +1,63 @@ +# ds4 backend Makefile. +CURDIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) +DS4_DIR := $(CURDIR)ds4 +BUILD_DIR := $(CURDIR)build + +BUILD_TYPE ?= +NATIVE ?= false +JOBS ?= $(shell nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null || echo 4) + +UNAME_S := $(shell uname -s) + +CMAKE_ARGS ?= -DCMAKE_BUILD_TYPE=Release + +ifeq ($(BUILD_TYPE),cublas) + CMAKE_ARGS += -DDS4_GPU=cuda + DS4_OBJ_TARGET := ds4.o ds4_cuda.o +else ifeq ($(UNAME_S),Darwin) + CMAKE_ARGS += -DDS4_GPU=metal + DS4_OBJ_TARGET := ds4.o ds4_metal.o +else + # CPU reference path (Linux only - macOS CPU path is broken by VM bug per ds4 README). 
+ CMAKE_ARGS += -DDS4_GPU=cpu + DS4_OBJ_TARGET := ds4_cpu.o +endif + +ifneq ($(NATIVE),true) + CMAKE_ARGS += -DDS4_NATIVE=OFF +endif + +.PHONY: prepare grpc-server package clean purge test all +all: grpc-server + +prepare: + bash $(CURDIR)prepare.sh + +# Build ds4's engine object files via its own Makefile, which already encodes +# the right per-platform compile flags (Objective-C/Metal on Darwin, nvcc on Linux+CUDA). +$(DS4_DIR)/ds4.o: prepare +ifeq ($(BUILD_TYPE),cublas) + +$(MAKE) -C $(DS4_DIR) ds4.o ds4_cuda.o +else ifeq ($(UNAME_S),Darwin) + +$(MAKE) -C $(DS4_DIR) ds4.o ds4_metal.o +else + +$(MAKE) -C $(DS4_DIR) ds4_cpu.o +endif + +grpc-server: $(DS4_DIR)/ds4.o + mkdir -p $(BUILD_DIR) + cd $(BUILD_DIR) && cmake $(CMAKE_ARGS) -DDS4_DIR=$(DS4_DIR) $(CURDIR) && cmake --build . --config Release -j $(JOBS) + cp $(BUILD_DIR)/grpc-server $(CURDIR)grpc-server + +package: grpc-server + bash $(CURDIR)package.sh + +test: + @echo "ds4 backend: e2e coverage at tests/e2e-backends/ (BACKEND_BINARY mode)" + +clean: + rm -rf $(BUILD_DIR) $(CURDIR)grpc-server $(CURDIR)package + if [ -d $(DS4_DIR) ]; then $(MAKE) -C $(DS4_DIR) clean; fi + +purge: clean + rm -rf $(DS4_DIR) diff --git a/backend/cpp/ds4/dsml_parser.cpp b/backend/cpp/ds4/dsml_parser.cpp new file mode 100644 index 000000000..6fb88a9fc --- /dev/null +++ b/backend/cpp/ds4/dsml_parser.cpp @@ -0,0 +1,359 @@ +#include "dsml_parser.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace ds4cpp { + +namespace { + +constexpr const char *kThinkOpen = ""; +constexpr const char *kThinkClose = ""; +constexpr const char *kToolsOpen = "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "tool_calls>"; // <|DSML|tool_calls> +constexpr const char *kToolsClose = ""; // +constexpr const char *kInvokeOpenPfx = "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "invoke name=\""; // <|DSML|invoke name=" +constexpr const char *kInvokeClose = ""; // +constexpr const char *kParamOpenPfx = "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "parameter 
name=\""; // <|DSML|parameter name="
+// NOTE(review): kParamClose below (and the other closing-marker literals
+// above: kThinkClose, kToolsClose, kInvokeClose) were garbled to "" in
+// transit - restore the real closing-marker byte sequences from the
+// original patch before building.
+constexpr const char *kParamClose = ""; //
+
+// All structural markers the parser might encounter - used to detect "buf
+// might be a partial marker, don't drain yet" conditions.
+const std::vector<std::string> &all_markers() {
+  static const std::vector<std::string> v = {
+      kThinkOpen, kThinkClose,
+      kToolsOpen, kToolsClose,
+      kInvokeOpenPfx, kInvokeClose,
+      kParamOpenPfx, kParamClose,
+  };
+  return v;
+}
+
+// Returns true if `buf` could be a *prefix* of any marker (i.e., we should
+// wait for more text before draining as plain content). The marker-prefix
+// loop handles fixed markers exactly. For markers with variable-length
+// internal data (kInvokeOpenPfx, kParamOpenPfx have an open quote, then the
+// tool/param name, then a closing quote and `>`), we also wait while buf
+// starts with `<` and has not yet seen a `>`: the leading `<` could be the
+// start of one of those open markers, or a literal that we can confirm only
+// once we know what follows. Once the first `>` arrives, the buffer is either
+// consumed by TryConsumeMarker or emitted as a literal `<` by the caller.
+bool looks_like_prefix(const std::string &buf) {
+  for (const auto &m : all_markers()) {
+    if (m.size() > buf.size() && m.compare(0, buf.size(), buf) == 0) return true;
+  }
+  if (!buf.empty() && buf[0] == '<' && buf.find('>') == std::string::npos) {
+    return true;
+  }
+  return false;
+}
+
+bool consume_literal(std::string &buf, const std::string &lit) {
+  if (buf.compare(0, lit.size(), lit) == 0) {
+    buf.erase(0, lit.size());
+    return true;
+  }
+  return false;
+}
+
+// Find the next '<' in buf starting at offset; returns std::string::npos if none.
+size_t next_tag(const std::string &buf, size_t off = 0) {
+  return buf.find('<', off);
+}
+
+std::string json_escape(const std::string &in) {
+  std::string out;
+  out.reserve(in.size() + 2);
+  for (char c : in) {
+    switch (c) {
+      case '"': out += "\\\""; break;
+      case '\\': out += "\\\\"; break;
+      case '\b': out += "\\b"; break;
+      case '\f': out += "\\f"; break;
+      case '\n': out += "\\n"; break;
+      case '\r': out += "\\r"; break;
+      case '\t': out += "\\t"; break;
+      default:
+        if (static_cast<unsigned char>(c) < 0x20) {
+          char tmp[8];
+          std::snprintf(tmp, sizeof(tmp), "\\u%04x", c);
+          out += tmp;
+        } else {
+          out += c;
+        }
+    }
+  }
+  return out;
+}
+
+} // namespace
+
+DsmlParser::DsmlParser() = default;
+
+bool DsmlParser::IsInDsmlStructural() const {
+  switch (state_) {
+    case State::TOOL_CALLS:
+    case State::INVOKE:
+      return true;
+    case State::PARAM_VALUE: // payload bytes; user sampling applies
+    case State::TEXT:
+    case State::THINK:
+      return false;
+  }
+  return false;
+}
+
+void DsmlParser::EmitArgsChunk(const std::string &chunk, std::vector<ParserEvent> &out) {
+  if (chunk.empty()) return;
+  ParserEvent e;
+  e.type = ParserEvent::TOOL_ARGS;
+  e.text = chunk;
+  e.index = tool_index_;
+  out.push_back(std::move(e));
+}
+
+void DsmlParser::FinishCurrentToolCall(std::vector<ParserEvent> &out) {
+  if (tool_index_ < 0) return;
+  // Close the JSON object that was opened on the first parameter.
+  if (args_emitted_open_brace_) {
+    EmitArgsChunk("}", out);
+  } else {
+    EmitArgsChunk("{}", out);
+  }
+  ParserEvent e;
+  e.type = ParserEvent::TOOL_END;
+  e.index = tool_index_;
+  out.push_back(std::move(e));
+  current_tool_name_.clear();
+  args_emitted_open_brace_ = false;
+  args_param_count_ = 0;
+}
+
+bool DsmlParser::TryConsumeMarker(std::vector<ParserEvent> &out) {
+  switch (state_) {
+    case State::TEXT: {
+      if (consume_literal(buf_, kThinkOpen)) { state_ = State::THINK; return true; }
+      if (consume_literal(buf_, kToolsOpen)) { state_ = State::TOOL_CALLS; return true; }
+      return false;
+    }
+    case State::THINK: {
+      if (consume_literal(buf_, kThinkClose)) { state_ = State::TEXT; return true; }
+      return false;
+    }
+    case State::TOOL_CALLS: {
+      if (consume_literal(buf_, kToolsClose)) { state_ = State::TEXT; return true; }
+      // <|DSML|invoke name="X">
+      if (buf_.compare(0, std::strlen(kInvokeOpenPfx), kInvokeOpenPfx) == 0) {
+        size_t close_q = buf_.find('"', std::strlen(kInvokeOpenPfx));
+        if (close_q == std::string::npos) return false; // need more bytes
+        size_t close_gt = buf_.find('>', close_q);
+        if (close_gt == std::string::npos) return false;
+        current_tool_name_ = buf_.substr(std::strlen(kInvokeOpenPfx),
+                                         close_q - std::strlen(kInvokeOpenPfx));
+        tool_index_++;
+        buf_.erase(0, close_gt + 1);
+        ParserEvent e;
+        e.type = ParserEvent::TOOL_START;
+        e.tool_name = current_tool_name_;
+        e.tool_id = RandomToolId();
+        e.index = tool_index_;
+        out.push_back(std::move(e));
+        args_emitted_open_brace_ = false;
+        args_param_count_ = 0;
+        state_ = State::INVOKE;
+        return true;
+      }
+      return false;
+    }
+    case State::INVOKE: {
+      if (consume_literal(buf_, kInvokeClose)) {
+        FinishCurrentToolCall(out);
+        state_ = State::TOOL_CALLS;
+        return true;
+      }
+      // <|DSML|parameter name="K" string="true|false">
+      if (buf_.compare(0, std::strlen(kParamOpenPfx), kParamOpenPfx) == 0) {
+        size_t close_q = buf_.find('"', std::strlen(kParamOpenPfx));
+        if (close_q == std::string::npos) return false;
+        size_t string_attr = buf_.find("string=\"", close_q);
+        if (string_attr == std::string::npos) return false;
+        size_t string_q = buf_.find('"', string_attr + 8);
+        if (string_q == std::string::npos) return false;
+        size_t close_gt = buf_.find('>', string_q);
+        if (close_gt == std::string::npos) return false;
+        param_name_ = buf_.substr(std::strlen(kParamOpenPfx),
+                                  close_q - std::strlen(kParamOpenPfx));
+        std::string string_val = buf_.substr(string_attr + 8,
+                                             string_q - (string_attr + 8));
+        param_is_string_ = (string_val == "true");
+        param_value_.clear();
+        buf_.erase(0, close_gt + 1);
+        // Emit args JSON opener / separator.
+        std::string opener;
+        if (!args_emitted_open_brace_) { opener = "{"; args_emitted_open_brace_ = true; }
+        else { opener = ","; }
+        opener += "\"" + json_escape(param_name_) + "\":";
+        if (param_is_string_) opener += "\"";
+        EmitArgsChunk(opener, out);
+        args_param_count_++;
+        state_ = State::PARAM_VALUE;
+        return true;
+      }
+      return false;
+    }
+    case State::PARAM_VALUE: {
+      if (consume_literal(buf_, kParamClose)) {
+        if (param_is_string_) EmitArgsChunk("\"", out);
+        state_ = State::INVOKE;
+        return true;
+      }
+      return false;
+    }
+  }
+  return false;
+}
+
+void DsmlParser::DrainPlain(std::vector<ParserEvent> &out) {
+  // Drain everything up to the next '<' that *might* start a marker.
+  // Anything before the next '<' is safe to emit; the '<...' tail stays buffered.
+  while (!buf_.empty()) {
+    size_t lt = next_tag(buf_, 0);
+    if (lt == std::string::npos) {
+      // No tag at all - emit (or accumulate) the whole buffer.
+      ParserEvent e;
+      if (state_ == State::PARAM_VALUE) {
+        std::string esc = param_is_string_ ?
+            json_escape(buf_) : buf_;
+        EmitArgsChunk(esc, out);
+      } else if (state_ == State::THINK) {
+        e.type = ParserEvent::REASONING;
+        e.text = buf_;
+        out.push_back(std::move(e));
+      } else if (state_ == State::TEXT) {
+        e.type = ParserEvent::CONTENT;
+        e.text = buf_;
+        out.push_back(std::move(e));
+      }
+      // Inside INVOKE / TOOL_CALLS with no marker, raw bytes are
+      // structural whitespace - discard.
+      buf_.clear();
+      return;
+    }
+    if (lt > 0) {
+      std::string chunk = buf_.substr(0, lt);
+      buf_.erase(0, lt);
+      ParserEvent e;
+      if (state_ == State::PARAM_VALUE) {
+        std::string esc = param_is_string_ ? json_escape(chunk) : chunk;
+        EmitArgsChunk(esc, out);
+      } else if (state_ == State::THINK) {
+        e.type = ParserEvent::REASONING;
+        e.text = chunk;
+        out.push_back(std::move(e));
+      } else if (state_ == State::TEXT) {
+        e.type = ParserEvent::CONTENT;
+        e.text = chunk;
+        out.push_back(std::move(e));
+      }
+    }
+    // buf_[0] == '<' - try consuming a marker. If we consumed one, loop again.
+    if (!TryConsumeMarker(out)) {
+      // Could be a partial marker - wait for more bytes.
+      if (looks_like_prefix(buf_)) return;
+      // Otherwise this '<' is a literal - emit one char and continue.
+      std::string one(1, buf_[0]);
+      buf_.erase(0, 1);
+      ParserEvent e;
+      if (state_ == State::PARAM_VALUE) {
+        std::string esc = param_is_string_ ? json_escape(one) : one;
+        EmitArgsChunk(esc, out);
+      } else if (state_ == State::THINK) {
+        e.type = ParserEvent::REASONING;
+        e.text = one;
+        out.push_back(std::move(e));
+      } else if (state_ == State::TEXT) {
+        e.type = ParserEvent::CONTENT;
+        e.text = one;
+        out.push_back(std::move(e));
+      }
+    }
+  }
+}
+
+void DsmlParser::Feed(const std::string &chunk, std::vector<ParserEvent> &out) {
+  buf_ += chunk;
+  DrainPlain(out);
+}
+
+void DsmlParser::Flush(std::vector<ParserEvent> &out) {
+  // At flush time we no longer wait for marker completion - drain everything
+  // (the trailing bytes won't grow).
+  // Mirror DrainPlain's state-aware
+  // classification: PARAM_VALUE bytes become TOOL_ARGS, THINK bytes become
+  // REASONING, TEXT bytes become CONTENT, and INVOKE/TOOL_CALLS bytes are
+  // structural whitespace (discarded).
+  auto emit_plain = [&](const std::string &chunk) {
+    if (chunk.empty()) return;
+    if (state_ == State::PARAM_VALUE) {
+      std::string esc = param_is_string_ ? json_escape(chunk) : chunk;
+      EmitArgsChunk(esc, out);
+      return;
+    }
+    if (state_ == State::THINK) {
+      ParserEvent e;
+      e.type = ParserEvent::REASONING;
+      e.text = chunk;
+      out.push_back(std::move(e));
+      return;
+    }
+    if (state_ == State::TEXT) {
+      ParserEvent e;
+      e.type = ParserEvent::CONTENT;
+      e.text = chunk;
+      out.push_back(std::move(e));
+      return;
+    }
+    // INVOKE / TOOL_CALLS: structural whitespace, discard.
+  };
+  while (!buf_.empty()) {
+    size_t lt = next_tag(buf_, 0);
+    if (lt == std::string::npos) {
+      emit_plain(buf_);
+      buf_.clear();
+      // FIX(review): was `return;`, which skipped the mid-tool-call cleanup
+      // below - exactly the common truncation case (generation ends inside a
+      // parameter value with no trailing '<'). `break` lets the cleanup run.
+      break;
+    }
+    if (lt > 0) {
+      std::string chunk = buf_.substr(0, lt);
+      buf_.erase(0, lt);
+      emit_plain(chunk);
+    }
+    if (!TryConsumeMarker(out)) {
+      // Definitely a literal '<' now (no chance of more bytes arriving).
+      std::string one(1, buf_[0]);
+      buf_.erase(0, 1);
+      emit_plain(one);
+    }
+  }
+  // If we ended mid-tool-call (model truncated), close it cleanly.
+ if (state_ == State::INVOKE || state_ == State::PARAM_VALUE) { + if (state_ == State::PARAM_VALUE && param_is_string_) EmitArgsChunk("\"", out); + FinishCurrentToolCall(out); + state_ = State::TEXT; + } +} + +std::string RandomToolId() { + static thread_local std::mt19937_64 rng{ + static_cast(std::chrono::system_clock::now().time_since_epoch().count())}; + const char *alphabet = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + std::string out = "call_"; + for (int i = 0; i < 16; ++i) { + out += alphabet[rng() % 62]; + } + return out; +} + +} // namespace ds4cpp diff --git a/backend/cpp/ds4/dsml_parser.h b/backend/cpp/ds4/dsml_parser.h new file mode 100644 index 000000000..c09833673 --- /dev/null +++ b/backend/cpp/ds4/dsml_parser.h @@ -0,0 +1,77 @@ +#pragma once +#include +#include +#include + +namespace ds4cpp { + +struct ParserEvent { + enum Type { CONTENT, REASONING, TOOL_START, TOOL_ARGS, TOOL_END }; + Type type; + std::string text; // CONTENT, REASONING, TOOL_ARGS + std::string tool_name; // TOOL_START + std::string tool_id; // TOOL_START (caller-assigned) + int index = 0; // TOOL_START / TOOL_ARGS / TOOL_END +}; + +// Streaming parser. Stateless across instances; one per Predict call. +class DsmlParser { +public: + DsmlParser(); + + // Feed a chunk of raw model-emitted text. Appends classified events to + // `out`. May buffer the tail of `chunk` internally if it looks like a + // marker prefix. + void Feed(const std::string &chunk, std::vector &out); + + // Flush any remaining buffered text as CONTENT (called at generation end). + void Flush(std::vector &out); + + // True when the parser is inside a DSML structural position - that is, + // tags/markers between tool-call boundaries where the model is expected + // to emit protocol bytes verbatim. 
Mirrors ds4_server.c's "force + // temperature=0 unless dsml_decode_state_uses_payload_sampling" rule: + // + // TEXT / THINK -> false (user sampling applies) + // PARAM_VALUE -> false (payload uses user sampling) + // TOOL_CALLS / INVOKE -> true (structural; force greedy) + // + // Callers should use this BEFORE the next sample() call to pick the + // effective temperature; the parser's state reflects what's already + // been consumed, so it predicts the next token's classification. + bool IsInDsmlStructural() const; + +private: + enum class State { TEXT, THINK, TOOL_CALLS, INVOKE, PARAM_VALUE }; + State state_ = State::TEXT; + std::string buf_; + std::string current_tool_name_; + int tool_index_ = -1; + // While parsing a parameter value: + std::string param_name_; + bool param_is_string_ = true; + std::string param_value_; + // Incrementally-built arguments JSON for the active tool call. + std::string args_json_so_far_; + bool args_emitted_open_brace_ = false; + int args_param_count_ = 0; + + // Try to consume one structural marker starting at buf_[0]. Returns true + // and advances state if a complete marker was consumed; false if the + // buffer is ambiguous (could be a marker prefix). + bool TryConsumeMarker(std::vector &out); + + // Drain plain text from buf_ as far as we're sure it's not a marker prefix. + // Emits CONTENT or REASONING depending on current state. + void DrainPlain(std::vector &out); + + // Emit the next chunk of arguments JSON to the consumer. + void EmitArgsChunk(const std::string &chunk, std::vector &out); + void FinishCurrentToolCall(std::vector &out); +}; + +// Generate a random tool call ID (e.g. "call_AbCdEf"). Used by the gRPC layer +// when assigning IDs to streamed tool calls. 
+std::string RandomToolId();
+
+} // namespace ds4cpp
diff --git a/backend/cpp/ds4/dsml_renderer.cpp b/backend/cpp/ds4/dsml_renderer.cpp
new file mode 100644
index 000000000..dfd75d607
--- /dev/null
+++ b/backend/cpp/ds4/dsml_renderer.cpp
@@ -0,0 +1,140 @@
+#include "dsml_renderer.h"
+
+// We accept either nlohmann::json (if available) or fall back to a tiny
+// hand-rolled parser. The LocalAI tree already has nlohmann/json bundled
+// in vendor paths; we use the apt-installed nlohmann-json3-dev (installed
+// in Task 11 step 1) when present, otherwise the bundled copy.
+// NOTE(review): the angle-bracket include targets were stripped in transit;
+// <nlohmann/json.hpp> is reconstructed from the #error text - confirm.
+#if __has_include(<nlohmann/json.hpp>)
+#include <nlohmann/json.hpp>
+using json = nlohmann::json;
+#else
+#error "nlohmann/json.hpp not found; install nlohmann-json3-dev"
+#endif
+
+#include <sstream>
+
+namespace ds4cpp {
+
+namespace {
+
+// Emit one DSML parameter element for `name`/`value`. String values are
+// written raw; every other JSON type is dumped compactly, matching the
+// string="true|false" attribute contract the model was trained on.
+void render_param(std::ostringstream &os, const std::string &name,
+                  const json &value) {
+  bool is_string = value.is_string();
+  os << "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "parameter name=\"" << name
+     << "\" string=\"" << (is_string ? "true" : "false") << "\">";
+  if (is_string) {
+    os << value.get<std::string>();
+  } else {
+    os << value.dump();
+  }
+  // NOTE(review): closing tag reconstructed after transit-stripping;
+  // verify the exact bytes against ds4_server.c.
+  os << "</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "parameter>\n";
+}
+
+} // namespace
+
+std::string RenderAssistantToolCalls(const std::string &tool_calls_json) {
+  if (tool_calls_json.empty()) return "";
+  json arr;
+  try {
+    arr = json::parse(tool_calls_json);
+  } catch (const std::exception &) {
+    return "";
+  }
+  if (!arr.is_array() || arr.empty()) return "";
+
+  std::ostringstream os;
+  os << "\n\n<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "tool_calls>\n";
+  for (const auto &call : arr) {
+    // OpenAI shape: { id, type, function: { name, arguments (JSON string) } }
+    // Anthropic shape comes through normalized by LocalAI.
+    std::string name;
+    std::string args_str;
+    if (call.contains("function")) {
+      const auto &fn = call["function"];
+      if (fn.contains("name") && fn["name"].is_string())
+        name = fn["name"].get<std::string>();
+      if (fn.contains("arguments") && fn["arguments"].is_string())
+        args_str = fn["arguments"].get<std::string>();
+    }
+    os << "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "invoke name=\"" << name << "\">\n";
+    if (!args_str.empty()) {
+      json args;
+      try {
+        args = json::parse(args_str);
+      } catch (...) {
+        args = json{};
+      }
+      if (args.is_object()) {
+        for (auto it = args.begin(); it != args.end(); ++it) {
+          render_param(os, it.key(), it.value());
+        }
+      }
+    }
+    // NOTE(review): closing markers below reconstructed after
+    // transit-stripping - confirm against ds4_server.c.
+    os << "</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "invoke>\n";
+  }
+  os << "</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "tool_calls>";
+  return os.str();
+}
+
+std::string RenderToolResult(const std::string &tool_call_id, const std::string &content) {
+  std::ostringstream os;
+  // ds4_server.c wraps tool results in a "tool_result" DSML tag carrying
+  // the tool_call_id. Match that shape.
+  // NOTE(review): closing tag reconstructed after transit-stripping.
+  os << "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "tool_result id=\"" << tool_call_id << "\">"
+     << content
+     << "</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "tool_result>";
+  return os.str();
+}
+
+std::string RenderToolsManifest(const std::string &tools_json) {
+  if (tools_json.empty()) return "";
+  json arr;
+  try {
+    arr = json::parse(tools_json);
+  } catch (const std::exception &) {
+    return "";
+  }
+  if (!arr.is_array() || arr.empty()) return "";
+
+  // Extract each OpenAI tool's `function` object, dump as compact JSON, one
+  // per line. Mirrors openai_function_schema_from_tool() in ds4_server.c.
+  std::ostringstream schemas;
+  for (const auto &tool : arr) {
+    if (tool.contains("function") && tool["function"].is_object()) {
+      schemas << tool["function"].dump() << "\n";
+    } else if (tool.is_object()) {
+      // Anthropic / direct-schema form: pass through.
+      schemas << tool.dump() << "\n";
+    }
+  }
+  if (schemas.tellp() == std::streampos(0)) return "";
+
+  // Verbatim text from ds4_server.c append_tools_prompt_text. Do NOT
+  // paraphrase - the model was trained on these exact bytes.
+  // NOTE(review): the closing-tag and <think> fragments inside this literal
+  // were stripped in transit and have been reconstructed; diff the final
+  // bytes against ds4_server.c before shipping.
+  std::ostringstream os;
+  os << "## Tools\n\n"
+        "You have access to a set of tools to help answer the user question. "
+        "You can invoke tools by writing a \"<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "tool_calls>\" block like the following:\n\n"
+        "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "tool_calls>\n"
+        "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "invoke name=\"$TOOL_NAME\">\n"
+        "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "parameter name=\"$PARAMETER_NAME\" string=\"true|false\">$PARAMETER_VALUE</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "parameter>\n"
+        "...\n"
+        "</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "invoke>\n"
+        "<\xef\xbd\x9c" "DSML\xef\xbd\x9c" "invoke name=\"$TOOL_NAME2\">\n"
+        "...\n"
+        "</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "invoke>\n"
+        "</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "tool_calls>\n\n"
+        "String parameters should be specified as raw text and set `string=\"true\"`. "
+        "Preserve characters such as `>`, `&`, and `&&` exactly; never replace normal string characters with XML or HTML entity escapes. "
+        "Only if a string value itself contains the exact closing parameter tag `</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "parameter>`, write that tag as `</\xef\xbd\x9c" "DSML\xef\xbd\x9c" "parameter>` inside the value. "
+        "For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string=\"false\"`.\n\n"
+        "If thinking_mode is enabled (triggered by <think>), you MUST output your complete reasoning inside <think>...</think> BEFORE any tool calls or final response.\n\n"
+        "Otherwise, output directly after </think> with tool calls or final response.\n\n"
+        "### Available Tool Schemas\n\n"
+     << schemas.str()
+     << "\nYou MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls. "
+        "Use the exact parameter names from the schemas.";
+  return os.str();
+}
+
+} // namespace ds4cpp
diff --git a/backend/cpp/ds4/dsml_renderer.h b/backend/cpp/ds4/dsml_renderer.h
new file mode 100644
index 000000000..38aa5702c
--- /dev/null
+++ b/backend/cpp/ds4/dsml_renderer.h
@@ -0,0 +1,27 @@
+#pragma once
+#include <string>
+
+namespace ds4cpp {
+
+// Render an assistant message's tool_calls JSON array into the DSML block
+// that ds4 expects in its prompt. `tool_calls_json` is the value of
+// proto.Message.tool_calls (OpenAI shape: array of {id, type, function:{name, arguments}}).
+// Returns the DSML text to append after the assistant's content.
+std::string RenderAssistantToolCalls(const std::string &tool_calls_json);
+
+// Render a role="tool" message into the DSML "tool result" block. ds4's
+// prompt template expects tool results inside a specific tag; we wrap the
+// `content` with that tag and include the `tool_call_id` so the model can
+// correlate.
+std::string RenderToolResult(const std::string &tool_call_id, const std::string &content);
+
+// Render the "## Tools" manifest that ds4 expects in the SYSTEM prompt when
+// tools are available. Without this preamble the model has no idea tools
+// exist and will not emit DSML tool calls. Mirrors append_tools_prompt_text()
+// in ds4_server.c (~line 1646): a fixed preamble + "### Available Tool
+// Schemas" section + one JSON schema per line (extracted from each OpenAI
+// tool's .function object) + a fixed closing instruction. Returns empty
+// when tools_json is empty / unparseable.
+std::string RenderToolsManifest(const std::string &tools_json);
+
+} // namespace ds4cpp
diff --git a/backend/cpp/ds4/grpc-server.cpp b/backend/cpp/ds4/grpc-server.cpp
new file mode 100644
index 000000000..c4f149b75
--- /dev/null
+++ b/backend/cpp/ds4/grpc-server.cpp
@@ -0,0 +1,696 @@
+// ds4 LocalAI gRPC backend.
+//
+// Wraps antirez/ds4's `ds4_engine_*` / `ds4_session_*` public API
+// (see ds4/ds4.h) over LocalAI's backend.proto. Tool calls, thinking
+// mode, and disk KV cache are wired in follow-up commits; this commit
+// is just the bind/listen/Health/Free skeleton.
+
+#include "backend.pb.h"
+#include "backend.grpc.pb.h"
+
+#include "dsml_parser.h"   // populated in Task 12
+#include "dsml_renderer.h" // populated in Task 16
+#include "kv_cache.h"      // populated in Task 17
+
+extern "C" {
+#include "ds4.h"
+}
+
+// NOTE(review): the original angle-bracket include targets were stripped in
+// transit; the gRPC and standard headers below are reconstructed from the
+// identifiers this file uses - confirm against the committed file.
+#include <grpcpp/ext/proto_server_reflection_plugin.h>
+#include <grpcpp/grpcpp.h>
+#include <grpcpp/health_check_service_interface.h>
+#include <grpcpp/server_builder.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <csignal>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <utility>
+#include <vector>
+
+using grpc::Server;
+using grpc::ServerBuilder;
+using grpc::ServerContext;
+using grpc::ServerWriter;
+// NOTE: do NOT alias `grpc::Status` as `Status` - the Status RPC method below
+// would shadow the type, breaking the other RPC method declarations that use
+// it as a return type. Use GStatus instead.
+using GStatus = ::grpc::Status;
+using grpc::StatusCode;
+
+namespace {
+
+// Global state - ds4 is single-engine-per-process by design.
+std::mutex g_engine_mu;
+ds4_engine *g_engine = nullptr;
+ds4_session *g_session = nullptr;
+int g_ctx_size = 32768;
+std::string g_kv_cache_dir; // empty disables disk cache
+
+std::atomic<grpc::Server *> g_server{nullptr};
+
+// Parse a "key:value" option string. Returns empty when no colon.
+static std::pair<std::string, std::string> split_option(const std::string &opt) {
+  auto colon = opt.find(':');
+  if (colon == std::string::npos) return {opt, ""};
+  return {opt.substr(0, colon), opt.substr(colon + 1)};
+}
+
+// Append the text of `token` (if any) to `out`.
+static void append_token_text(ds4_engine *engine, int token, std::string &out) {
+  size_t len = 0;
+  const char *text = ds4_token_text(engine, token, &len);
+  if (text && len > 0) out.append(text, len);
+}
+
+struct CollectCtx {
+  ds4_engine *engine;
+  std::string raw_buf; // exact raw bytes for Reply.message
+  ds4cpp::DsmlParser parser;
+  backend::Reply *reply;
+  int tokens;
+
+  // Per-tool aggregation: accumulate ChatDelta tool_calls so we emit one
+  // delta with all calls, mirroring how vllm's non-streaming path returns.
+  struct Pending {
+    std::string id;
+    std::string name;
+    std::string args;
+  };
+  std::vector<Pending> pending;
+
+  std::string content_buf;
+  std::string reasoning_buf;
+};
+
+// Fold parser events into the non-streaming collection context.
+static void apply_events(CollectCtx *c, const std::vector<ds4cpp::ParserEvent> &events) {
+  for (const auto &e : events) {
+    switch (e.type) {
+    case ds4cpp::ParserEvent::CONTENT:
+      c->content_buf += e.text;
+      break;
+    case ds4cpp::ParserEvent::REASONING:
+      c->reasoning_buf += e.text;
+      break;
+    case ds4cpp::ParserEvent::TOOL_START:
+      if ((int)c->pending.size() <= e.index)
+        c->pending.resize(e.index + 1);
+      c->pending[e.index].id = e.tool_id;
+      c->pending[e.index].name = e.tool_name;
+      break;
+    case ds4cpp::ParserEvent::TOOL_ARGS:
+      if ((int)c->pending.size() > e.index)
+        c->pending[e.index].args += e.text;
+      break;
+    case ds4cpp::ParserEvent::TOOL_END:
+      // No-op for non-streaming: the final delta is emitted at the end.
+      break;
+    }
+  }
+}
+
+static void collect_emit(void *ud, int token) {
+  auto *c = static_cast<CollectCtx *>(ud);
+  if (token == ds4_token_eos(c->engine)) return;
+  size_t len = 0;
+  const char *text = ds4_token_text(c->engine, token, &len);
+  if (!text || len == 0) return;
+  std::string chunk(text, len);
+  c->raw_buf += chunk;
+  std::vector<ds4cpp::ParserEvent> events;
+  c->parser.Feed(chunk, events);
+  apply_events(c, events);
+  c->tokens++;
+}
+static void collect_done(void *) {}
+
+struct StreamCtx {
+  ds4_engine *engine;
+  ServerWriter<backend::Reply> *writer;
+  ds4cpp::DsmlParser parser;
+  int tokens;
+  bool aborted;
+  // Track which tool indices we've seen TOOL_START for, so subsequent
+  // ARGS deltas can elide the redundant id/name fields.
+  std::vector<bool> tool_started;
+};
+
+static void stream_emit(void *ud, int token) {
+  auto *s = static_cast<StreamCtx *>(ud);
+  if (s->aborted) return;
+  if (token == ds4_token_eos(s->engine)) return;
+  size_t len = 0;
+  const char *text = ds4_token_text(s->engine, token, &len);
+  if (!text || len == 0) return;
+  std::string chunk(text, len);
+  std::vector<ds4cpp::ParserEvent> events;
+  s->parser.Feed(chunk, events);
+  if (events.empty()) { s->tokens++; return; }
+
+  backend::Reply reply;
+  auto *delta = reply.add_chat_deltas();
+  bool any_field = false;
+  for (const auto &e : events) {
+    switch (e.type) {
+    case ds4cpp::ParserEvent::CONTENT:
+      delta->set_content(delta->content() + e.text);
+      any_field = true;
+      break;
+    case ds4cpp::ParserEvent::REASONING:
+      delta->set_reasoning_content(delta->reasoning_content() + e.text);
+      any_field = true;
+      break;
+    case ds4cpp::ParserEvent::TOOL_START: {
+      if ((int)s->tool_started.size() <= e.index)
+        s->tool_started.resize(e.index + 1, false);
+      s->tool_started[e.index] = true;
+      auto *tc = delta->add_tool_calls();
+      tc->set_index(e.index);
+      tc->set_id(e.tool_id);
+      tc->set_name(e.tool_name);
+      any_field = true;
+      break;
+    }
+    case ds4cpp::ParserEvent::TOOL_ARGS: {
+      auto *tc = delta->add_tool_calls();
+      tc->set_index(e.index);
+      tc->set_arguments(e.text);
+      any_field = true;
+      break;
+    }
+    case ds4cpp::ParserEvent::TOOL_END:
+      // No marker delta needed - the Go side closes the tool call on
+      // the final aggregator pass.
+      break;
+    }
+  }
+  reply.set_message(chunk);
+  reply.set_tokens(1);
+  if (any_field) {
+    if (!s->writer->Write(reply)) s->aborted = true;
+  }
+  s->tokens++;
+}
+static void stream_done(void *) {}
+
+// Per-thread RNG seed for ds4_session_sample. Initialized lazily from
+// system_clock; ds4 owns the random walk after that.
+static uint64_t *get_rng() { + static thread_local uint64_t seed = 0; + if (seed == 0) { + seed = static_cast( + std::chrono::system_clock::now().time_since_epoch().count()); + if (seed == 0) seed = 1; + } + return &seed; +} + +struct SampleParams { + float temperature; + int top_k; + float top_p; + float min_p; +}; + +// Compute the effective sampling parameters for the next token, mirroring +// ds4_server.c:7102-7115: +// - thinking mode enabled -> override (T=1, top_k=0, top_p=1, min_p=0) +// - inside DSML structural position (tool-call markers) -> force T=0 +// - otherwise -> the request's user-supplied sampling settings +// The parser argument carries state from tokens emitted so far; its +// IsInDsmlStructural() predicts the next token's classification. +static SampleParams compute_sample_params(const backend::PredictOptions *request, + const ds4cpp::DsmlParser &parser, + bool think_enabled); + +static ds4_think_mode parse_think_mode(const backend::PredictOptions *request) { + // Per the vllm backend convention, "enable_thinking" gates thinking on/off, + // and "reasoning_effort" picks the strength when on. 
+ const auto &md = request->metadata(); + auto et = md.find("enable_thinking"); + bool enabled = true; // default ON per ds4-server + if (et != md.end()) enabled = (et->second == "true" || et->second == "1"); + if (!enabled) return DS4_THINK_NONE; + auto re = md.find("reasoning_effort"); + if (re != md.end() && (re->second == "max" || re->second == "xhigh")) + return DS4_THINK_MAX; + return DS4_THINK_HIGH; +} + +static SampleParams compute_sample_params(const backend::PredictOptions *request, + const ds4cpp::DsmlParser &parser, + bool think_enabled) { + SampleParams p = { + request->temperature(), + request->topk(), + request->topp(), + request->minp(), + }; + if (think_enabled) { + // Match ds4-server: thinking mode wants creativity in the reasoning + // pass and the trailing content, so the entire generation overrides + // sampling unless DSML structural bytes take over below. + p.temperature = 1.0f; + p.top_k = 0; + p.top_p = 1.0f; + p.min_p = 0.0f; + } + if (parser.IsInDsmlStructural()) { + // Tool-call structural bytes (tags, markers, headers) must parse + // cleanly. Force greedy regardless of user/thinking settings. + p.temperature = 0.0f; + } + return p; +} + +// Build the rendered text for cache keying. We feed the same text the model +// will see; that lets the cache survive small client-side reformatting of +// chat history (the cache is keyed on bytes, not tokens). +static std::string render_prompt_text(const backend::PredictOptions *request) { + // Two-mode: either the raw prompt or the chat-template path. We mirror + // build_prompt's branching but accumulate text (not tokens) so we can + // SHA1 it for the cache key. ds4_session caches a tokens-indexed + // checkpoint, but the disk format keys on bytes per ds4-server's design. 
+ if (!request->usetokenizertemplate() || request->messages_size() == 0) { + return request->prompt(); + } + std::string out; + const std::string sys_role = "system"; + for (const auto &m : request->messages()) { + if (m.role() == sys_role) { out += "[sys] " + m.content() + "\n"; break; } + } + for (const auto &m : request->messages()) { + if (m.role() == sys_role) continue; + out += "[" + m.role() + "] " + m.content() + "\n"; + } + return out; +} + +ds4cpp::KvCache g_kv_cache; + +// Try to recover prefill state for `rendered`. Returns the matched prefix length. +static size_t maybe_load_cache(const std::string &rendered) { + if (!g_kv_cache.enabled() || !g_session) return 0; + return g_kv_cache.LoadLongestPrefix(g_session, rendered, g_ctx_size); +} + +static void maybe_save_cache(const std::string &rendered) { + if (g_kv_cache.enabled() && g_session) { + g_kv_cache.Save(g_session, rendered, g_ctx_size); + } +} + +static void build_prompt(ds4_engine *engine, const backend::PredictOptions *request, + ds4_tokens *out) { + if (!request->usetokenizertemplate() || request->messages_size() == 0) { + ds4_tokenize_text(engine, request->prompt().c_str(), out); + return; + } + // Chat-template path: render via ds4's helpers. + ds4_chat_begin(engine, out); + + ds4_think_mode think = parse_think_mode(request); + + // ds4_encode_chat_prompt is convenient when there is exactly one + // system+user pair, but for arbitrary turn lists we use the granular + // append helpers. Pull the first system message (if any), then append + // every other message in order. + const std::string sys_role = "system"; + std::string system_text; + for (const auto &m : request->messages()) { + if (m.role() == sys_role) { system_text = m.content(); break; } + } + // Inject the tools manifest into the system prompt when tools are present. 
+ // ds4 was trained to emit DSML tool calls ONLY when this preamble is in + // the system message - without it, the model has no idea tools exist and + // the e2e tool-call test will fail. The renderer lives in dsml_renderer + // and is a verbatim port of ds4_server.c's append_tools_prompt_text. + std::string tools_manifest; + if (!request->tools().empty()) { + tools_manifest = ds4cpp::RenderToolsManifest(request->tools()); + } + if (!system_text.empty() || !tools_manifest.empty()) { + std::string combined = system_text; + if (!tools_manifest.empty()) { + if (!combined.empty()) combined += "\n\n"; + combined += tools_manifest; + } + ds4_chat_append_message(engine, out, "system", combined.c_str()); + } + for (const auto &m : request->messages()) { + if (m.role() == sys_role) continue; + if (m.role() == "assistant" && !m.tool_calls().empty()) { + std::string combined = m.content(); + combined += ds4cpp::RenderAssistantToolCalls(m.tool_calls()); + ds4_chat_append_message(engine, out, "assistant", combined.c_str()); + } else if (m.role() == "tool") { + std::string body = ds4cpp::RenderToolResult(m.tool_call_id(), m.content()); + ds4_chat_append_message(engine, out, "user", body.c_str()); + } else { + ds4_chat_append_message(engine, out, m.role().c_str(), m.content().c_str()); + } + } + ds4_chat_append_assistant_prefix(engine, out, think); +} + +class DS4Backend final : public backend::Backend::Service { +public: + GStatus Health(ServerContext *, const backend::HealthMessage *, + backend::Reply *reply) override { + reply->set_message(std::string("OK")); + return GStatus::OK; + } + + GStatus Free(ServerContext *, const backend::HealthMessage *, + backend::Result *result) override { + std::lock_guard lock(g_engine_mu); + if (g_session) { ds4_session_free(g_session); g_session = nullptr; } + if (g_engine) { ds4_engine_close(g_engine); g_engine = nullptr; } + result->set_success(true); + return GStatus::OK; + } + + GStatus LoadModel(ServerContext *, const 
backend::ModelOptions *request, + backend::Result *result) override { + std::lock_guard lock(g_engine_mu); + + if (g_engine) { + if (g_session) { ds4_session_free(g_session); g_session = nullptr; } + ds4_engine_close(g_engine); + g_engine = nullptr; + } + + std::string model_path = request->modelfile(); + if (model_path.empty()) model_path = request->model(); + if (model_path.empty()) { + result->set_success(false); + result->set_message("ds4: ModelOptions.Model or .ModelFile must be set"); + return GStatus::OK; + } + + std::string mtp_path; + int mtp_draft = 0; + float mtp_margin = 3.0f; + for (const auto &opt : request->options()) { + auto [k, v] = split_option(opt); + if (k == "mtp_path") mtp_path = v; + else if (k == "mtp_draft") mtp_draft = std::stoi(v); + else if (k == "mtp_margin") mtp_margin = std::stof(v); + else if (k == "kv_cache_dir") g_kv_cache_dir = v; + } + + g_kv_cache.SetDir(g_kv_cache_dir); + + ds4_engine_options opt = {}; + opt.model_path = model_path.c_str(); + opt.mtp_path = mtp_path.empty() ? nullptr : mtp_path.c_str(); + opt.n_threads = request->threads() > 0 ? request->threads() : 0; + opt.mtp_draft_tokens = mtp_draft; + opt.mtp_margin = mtp_margin; + opt.directional_steering_file = nullptr; + opt.warm_weights = false; + opt.quality = false; + +#if defined(DS4_NO_GPU) + opt.backend = DS4_BACKEND_CPU; +#elif defined(__APPLE__) + opt.backend = DS4_BACKEND_METAL; +#else + opt.backend = DS4_BACKEND_CUDA; +#endif + + int rc = ds4_engine_open(&g_engine, &opt); + if (rc != 0 || !g_engine) { + result->set_success(false); + result->set_message("ds4_engine_open failed (rc=" + std::to_string(rc) + ")"); + return GStatus::OK; + } + + g_ctx_size = request->contextsize() > 0 ? 
request->contextsize() : 32768; + rc = ds4_session_create(&g_session, g_engine, g_ctx_size); + if (rc != 0 || !g_session) { + ds4_engine_close(g_engine); + g_engine = nullptr; + result->set_success(false); + result->set_message("ds4_session_create failed (rc=" + std::to_string(rc) + ")"); + return GStatus::OK; + } + + result->set_success(true); + result->set_message("loaded " + model_path); + return GStatus::OK; + } + + GStatus TokenizeString(ServerContext *, const backend::PredictOptions *request, + backend::TokenizationResponse *response) override { + std::lock_guard lock(g_engine_mu); + if (!g_engine) return GStatus(StatusCode::FAILED_PRECONDITION, "ds4: model not loaded"); + ds4_tokens out = {}; + ds4_tokenize_text(g_engine, request->prompt().c_str(), &out); + for (int i = 0; i < out.len; ++i) response->add_tokens(out.v[i]); + response->set_length(out.len); + ds4_tokens_free(&out); + return GStatus::OK; + } + + GStatus Predict(ServerContext *, const backend::PredictOptions *request, + backend::Reply *reply) override { + std::lock_guard lock(g_engine_mu); + if (!g_engine || !g_session) { + return GStatus(StatusCode::FAILED_PRECONDITION, "ds4: model not loaded"); + } + ds4_tokens prompt = {}; + build_prompt(g_engine, request, &prompt); + int n_predict = request->tokens() > 0 ? request->tokens() : 256; + + CollectCtx collect = {g_engine, "", {}, reply, 0, {}, "", ""}; + std::string cache_key = render_prompt_text(request); + size_t cache_hit = maybe_load_cache(cache_key); + (void)cache_hit; // future: skip prompt prefix if hit covers full prompt + + // Manual generation loop on g_session. When MTP speculative weights + // were loaded (LoadModel option 'mtp_path:'), we use the + // ds4_session_eval_speculative_argmax path which may accept N>1 + // tokens per outer iteration. Otherwise per-token argmax + eval. + // Either way g_session advances so the disk KV cache picks up a + // real checkpoint after the call (see maybe_save_cache below). 
+ char err[256] = {0}; + int rc = ds4_session_sync(g_session, &prompt, err, sizeof(err)); + int prompt_len = prompt.len; + ds4_tokens_free(&prompt); + if (rc == 0) { + const int eos = ds4_token_eos(g_engine); + const int draft_max = ds4_engine_mtp_draft_tokens(g_engine); + const bool think_enabled = ds4_think_mode_enabled(parse_think_mode(request)); + int produced = 0; + while (produced < n_predict) { + SampleParams sp = compute_sample_params(request, collect.parser, think_enabled); + int first; + if (sp.temperature <= 0.0f) { + first = ds4_session_argmax(g_session); + } else { + first = ds4_session_sample(g_session, + sp.temperature, sp.top_k, + sp.top_p, sp.min_p, get_rng()); + } + if (first == eos) break; + // MTP only when sampling is greedy (ds4-server gate). + if (draft_max > 0 && sp.temperature <= 0.0f) { + constexpr int kAcceptedMax = 8; + int accepted[kAcceptedMax]; + int cap = std::min(kAcceptedMax, draft_max + 1); + int n = ds4_session_eval_speculative_argmax( + g_session, first, draft_max, eos, + accepted, cap, err, sizeof(err)); + if (n < 0) { rc = -1; break; } + bool stop = false; + for (int j = 0; j < n; ++j) { + if (accepted[j] == eos) { stop = true; break; } + collect_emit(&collect, accepted[j]); + if (++produced >= n_predict) { stop = true; break; } + } + if (stop) break; + } else { + collect_emit(&collect, first); + if (++produced >= n_predict) break; + rc = ds4_session_eval(g_session, first, err, sizeof(err)); + if (rc != 0) break; + } + } + collect_done(&collect); + } + maybe_save_cache(cache_key); + + // Flush any buffered parser state. + std::vector events; + collect.parser.Flush(events); + apply_events(&collect, events); + + if (rc != 0) { + return GStatus(StatusCode::INTERNAL, + std::string("ds4 generation failed: ") + err); + } + + // Emit one ChatDelta with content/reasoning/tool_calls. 
+ auto *delta = reply->add_chat_deltas(); + delta->set_content(collect.content_buf); + delta->set_reasoning_content(collect.reasoning_buf); + for (size_t i = 0; i < collect.pending.size(); ++i) { + auto *tc = delta->add_tool_calls(); + tc->set_index(static_cast(i)); + tc->set_id(collect.pending[i].id); + tc->set_name(collect.pending[i].name); + tc->set_arguments(collect.pending[i].args); + } + + reply->set_message(collect.raw_buf); + reply->set_tokens(collect.tokens); + reply->set_prompt_tokens(prompt_len); + return GStatus::OK; + } + + GStatus PredictStream(ServerContext *, const backend::PredictOptions *request, + ServerWriter *writer) override { + std::lock_guard lock(g_engine_mu); + if (!g_engine || !g_session) { + return GStatus(StatusCode::FAILED_PRECONDITION, "ds4: model not loaded"); + } + ds4_tokens prompt = {}; + build_prompt(g_engine, request, &prompt); + int n_predict = request->tokens() > 0 ? request->tokens() : 256; + + StreamCtx s = {g_engine, writer, {}, 0, false, {}}; + std::string cache_key = render_prompt_text(request); + size_t cache_hit = maybe_load_cache(cache_key); + (void)cache_hit; + + // Manual loop on g_session - see Predict() above for the rationale. + // MTP speculative path used when ds4_engine_mtp_draft_tokens > 0. 
+ char err[256] = {0}; + int rc = ds4_session_sync(g_session, &prompt, err, sizeof(err)); + ds4_tokens_free(&prompt); + if (rc == 0) { + const int eos = ds4_token_eos(g_engine); + const int draft_max = ds4_engine_mtp_draft_tokens(g_engine); + const bool think_enabled = ds4_think_mode_enabled(parse_think_mode(request)); + int produced = 0; + while (produced < n_predict && !s.aborted) { + SampleParams sp = compute_sample_params(request, s.parser, think_enabled); + int first; + if (sp.temperature <= 0.0f) { + first = ds4_session_argmax(g_session); + } else { + first = ds4_session_sample(g_session, + sp.temperature, sp.top_k, + sp.top_p, sp.min_p, get_rng()); + } + if (first == eos) break; + if (draft_max > 0 && sp.temperature <= 0.0f) { + constexpr int kAcceptedMax = 8; + int accepted[kAcceptedMax]; + int cap = std::min(kAcceptedMax, draft_max + 1); + int n = ds4_session_eval_speculative_argmax( + g_session, first, draft_max, eos, + accepted, cap, err, sizeof(err)); + if (n < 0) { rc = -1; break; } + bool stop = false; + for (int j = 0; j < n; ++j) { + if (accepted[j] == eos) { stop = true; break; } + stream_emit(&s, accepted[j]); + if (s.aborted) { stop = true; break; } + if (++produced >= n_predict) { stop = true; break; } + } + if (stop) break; + } else { + stream_emit(&s, first); + if (s.aborted || ++produced >= n_predict) break; + rc = ds4_session_eval(g_session, first, err, sizeof(err)); + if (rc != 0) break; + } + } + stream_done(&s); + } + maybe_save_cache(cache_key); + + // Flush parser state. 
+ std::vector events; + s.parser.Flush(events); + if (!events.empty() && !s.aborted) { + backend::Reply reply; + auto *delta = reply.add_chat_deltas(); + for (const auto &e : events) { + if (e.type == ds4cpp::ParserEvent::CONTENT) { + delta->set_content(delta->content() + e.text); + } else if (e.type == ds4cpp::ParserEvent::REASONING) { + delta->set_reasoning_content(delta->reasoning_content() + e.text); + } + } + s.writer->Write(reply); + } + + if (rc != 0 && !s.aborted) { + return GStatus(StatusCode::INTERNAL, + std::string("ds4 generation failed: ") + err); + } + return GStatus::OK; + } + + GStatus Status(ServerContext *, const backend::HealthMessage *, + backend::StatusResponse *response) override { + std::lock_guard lock(g_engine_mu); + response->set_state(g_engine ? backend::StatusResponse::READY + : backend::StatusResponse::UNINITIALIZED); + return GStatus::OK; + } +}; + +void RunServer(const std::string &addr) { + DS4Backend service; + grpc::EnableDefaultHealthCheckService(true); + grpc::reflection::InitProtoReflectionServerBuilderPlugin(); + + ServerBuilder builder; + builder.AddListeningPort(addr, grpc::InsecureServerCredentials()); + builder.RegisterService(&service); + builder.SetMaxReceiveMessageSize(64 * 1024 * 1024); + builder.SetMaxSendMessageSize(64 * 1024 * 1024); + + std::unique_ptr server(builder.BuildAndStart()); + if (!server) { + std::cerr << "ds4 grpc-server: failed to bind " << addr << "\n"; + std::exit(1); + } + g_server = server.get(); + std::cerr << "ds4 grpc-server listening on " << addr << "\n"; + server->Wait(); +} + +void signal_handler(int) { + if (auto *srv = g_server.load()) { + srv->Shutdown(std::chrono::system_clock::now() + + std::chrono::seconds(3)); + } +} + +} // namespace + +int main(int argc, char *argv[]) { + std::string addr = "127.0.0.1:50051"; + for (int i = 1; i < argc; ++i) { + std::string a = argv[i]; + const std::string addr_flag = "--addr="; + if (a.rfind(addr_flag, 0) == 0) addr = a.substr(addr_flag.size()); + 
else if (a == "--addr" && i + 1 < argc) addr = argv[++i]; + else if (a == "--help" || a == "-h") { + std::cout << "Usage: grpc-server --addr=HOST:PORT\n"; + return 0; + } + } + std::signal(SIGINT, signal_handler); + std::signal(SIGTERM, signal_handler); + RunServer(addr); + return 0; +} diff --git a/backend/cpp/ds4/kv_cache.cpp b/backend/cpp/ds4/kv_cache.cpp new file mode 100644 index 000000000..f4d31b23c --- /dev/null +++ b/backend/cpp/ds4/kv_cache.cpp @@ -0,0 +1,205 @@ +#include "kv_cache.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace ds4cpp { + +namespace { + +// Minimal SHA1 (public domain reference). 30 lines; used only here. +struct Sha1 { + uint32_t h[5]; + uint64_t bits; + uint8_t block[64]; + size_t used; + Sha1() { h[0]=0x67452301; h[1]=0xEFCDAB89; h[2]=0x98BADCFE; h[3]=0x10325476; h[4]=0xC3D2E1F0; bits=0; used=0; } + static uint32_t rol(uint32_t x, int n){ return (x<>(32-n)); } + void transform(const uint8_t *b) { + uint32_t w[80]; + for (int i=0;i<16;i++) w[i] = (uint32_t)b[i*4]<<24 | (uint32_t)b[i*4+1]<<16 | (uint32_t)b[i*4+2]<<8 | b[i*4+3]; + for (int i=16;i<80;i++) w[i] = rol(w[i-3]^w[i-8]^w[i-14]^w[i-16], 1); + uint32_t a=h[0],bb=h[1],c=h[2],d=h[3],e=h[4]; + for (int i=0;i<80;i++) { + uint32_t f,k; + if (i<20) { f=(bb&c)|((~bb)&d); k=0x5A827999; } + else if (i<40) { f=bb^c^d; k=0x6ED9EBA1; } + else if (i<60) { f=(bb&c)|(bb&d)|(c&d); k=0x8F1BBCDC; } + else { f=bb^c^d; k=0xCA62C1D6; } + uint32_t t = rol(a,5)+f+e+k+w[i]; + e=d; d=c; c=rol(bb,30); bb=a; a=t; + } + h[0]+=a; h[1]+=bb; h[2]+=c; h[3]+=d; h[4]+=e; + } + void update(const void *p, size_t n) { + const uint8_t *bp = (const uint8_t*)p; + bits += (uint64_t)n*8; + while (n) { + size_t take = 64-used; + if (take>n) take=n; + std::memcpy(block+used, bp, take); + used += take; bp += take; n -= take; + if (used == 64) { transform(block); used = 0; } + } + } + void final(uint8_t out[20]) { + uint8_t pad[64] = {0x80}; + size_t padlen = (used < 56) ? 
(56-used) : (120-used); + uint64_t lb = bits; + uint8_t len[8]; + for (int i=0;i<8;i++) len[7-i] = (uint8_t)(lb >> (i*8)); + update(pad, padlen); + update(len, 8); + for (int i=0;i<5;i++) { + out[i*4] = h[i]>>24; + out[i*4+1] = h[i]>>16; + out[i*4+2] = h[i]>>8; + out[i*4+3] = h[i]; + } + } +}; + +std::string mkdir_p(const std::string &d) { + if (d.empty()) return d; + struct stat st{}; + if (stat(d.c_str(), &st) == 0) return d; + mkdir(d.c_str(), 0755); + return d; +} + +bool file_exists(const std::string &p) { + struct stat st{}; + return stat(p.c_str(), &st) == 0; +} + +} // namespace + +std::string Sha1Hex(const void *data, size_t len) { + Sha1 s; + s.update(data, len); + uint8_t out[20]; + s.final(out); + char hex[41]; + for (int i = 0; i < 20; ++i) std::snprintf(hex + i*2, 3, "%02x", out[i]); + hex[40] = 0; + return std::string(hex); +} + +KvCache::KvCache() = default; + +void KvCache::SetDir(const std::string &dir) { + dir_ = dir; + if (!dir_.empty()) { + mkdir_p(dir_); + std::fprintf(stderr, "ds4 KvCache: enabled at %s\n", dir_.c_str()); + } else { + std::fprintf(stderr, "ds4 KvCache: disabled (no dir set)\n"); + } +} + +std::string KvCache::Path(const std::string &rendered_text) const { + if (dir_.empty()) return ""; + return dir_ + "/" + Sha1Hex(rendered_text.data(), rendered_text.size()) + ".kv"; +} + +size_t KvCache::LoadLongestPrefix(ds4_session *session, + const std::string &rendered_text, + int ctx_size) { + if (dir_.empty() || !session) return 0; + // Strategy: enumerate all .kv files in dir, read their stored prefix + // header, pick the longest one that is also a prefix of rendered_text. 
+ DIR *d = opendir(dir_.c_str()); + if (!d) return 0; + struct dirent *de; + size_t best_len = 0; + std::string best_path; + while ((de = readdir(d)) != nullptr) { + std::string name = de->d_name; + if (name.size() < 4 || name.substr(name.size()-3) != ".kv") continue; + std::string path = dir_ + "/" + name; + std::ifstream f(path, std::ios::binary); + if (!f) continue; + char magic[4]; f.read(magic, 4); + if (f.gcount() != 4 || std::memcmp(magic, "DS4G", 4) != 0) continue; + uint32_t version=0, file_ctx=0, prefix_len=0; + f.read((char*)&version, 4); f.read((char*)&file_ctx, 4); f.read((char*)&prefix_len, 4); + if (version != 1) continue; + if ((int)file_ctx != ctx_size) continue; + if (prefix_len > rendered_text.size()) continue; + std::vector prefix(prefix_len); + f.read(prefix.data(), prefix_len); + if (std::memcmp(prefix.data(), rendered_text.data(), prefix_len) != 0) continue; + if (prefix_len > best_len) { + best_len = prefix_len; + best_path = path; + } + } + closedir(d); + if (best_len == 0) return 0; + + // Load best_path's payload into session. + std::ifstream f(best_path, std::ios::binary); + char magic[4]; f.read(magic, 4); + uint32_t version, file_ctx, prefix_len; + f.read((char*)&version, 4); f.read((char*)&file_ctx, 4); f.read((char*)&prefix_len, 4); + f.seekg(prefix_len, std::ios::cur); + uint64_t payload_bytes = 0; + f.read((char*)&payload_bytes, 8); + // ds4_session_load_payload reads from a FILE*; reopen via fopen. + FILE *fp = std::fopen(best_path.c_str(), "rb"); + if (!fp) return 0; + // Seek past header + prefix + payload_bytes field. 
+ std::fseek(fp, 4 + 4 + 4 + 4 + prefix_len + 8, SEEK_SET); + char errbuf[256] = {0}; + int rc = ds4_session_load_payload(session, fp, payload_bytes, errbuf, sizeof(errbuf)); + std::fclose(fp); + if (rc != 0) return 0; + return best_len; +} + +void KvCache::Save(ds4_session *session, const std::string &rendered_text, int ctx_size) { + if (dir_.empty()) { + std::fprintf(stderr, "ds4 KvCache::Save: skipped (dir empty)\n"); + return; + } + if (!session) { + std::fprintf(stderr, "ds4 KvCache::Save: skipped (session null)\n"); + return; + } + std::string path = Path(rendered_text); + uint64_t payload_bytes = ds4_session_payload_bytes(session); + std::fprintf(stderr, "ds4 KvCache::Save: path=%s payload_bytes=%llu prefix_len=%zu\n", + path.c_str(), (unsigned long long)payload_bytes, rendered_text.size()); + FILE *fp = std::fopen(path.c_str(), "wb"); + if (!fp) { + std::fprintf(stderr, "ds4 KvCache::Save: fopen failed: %s\n", std::strerror(errno)); + return; + } + char magic[4] = {'D','S','4','G'}; + uint32_t version = 1; + uint32_t ctx = static_cast(ctx_size); + uint32_t prefix_len = static_cast(rendered_text.size()); + std::fwrite(magic, 4, 1, fp); + std::fwrite(&version, 4, 1, fp); + std::fwrite(&ctx, 4, 1, fp); + std::fwrite(&prefix_len, 4, 1, fp); + std::fwrite(rendered_text.data(), prefix_len, 1, fp); + std::fwrite(&payload_bytes, 8, 1, fp); + char errbuf[256] = {0}; + int rc = ds4_session_save_payload(session, fp, errbuf, sizeof(errbuf)); + std::fclose(fp); + if (rc != 0) { + std::fprintf(stderr, "ds4 KvCache::Save: ds4_session_save_payload rc=%d err=%s; removing %s\n", + rc, errbuf, path.c_str()); + std::remove(path.c_str()); + } else { + std::fprintf(stderr, "ds4 KvCache::Save: wrote %s ok\n", path.c_str()); + } +} + +} // namespace ds4cpp diff --git a/backend/cpp/ds4/kv_cache.h b/backend/cpp/ds4/kv_cache.h new file mode 100644 index 000000000..1b0478188 --- /dev/null +++ b/backend/cpp/ds4/kv_cache.h @@ -0,0 +1,44 @@ +#pragma once +#include +extern "C" { +#include 
"ds4.h" +} + +namespace ds4cpp { + +// Disk-backed KV cache for ds4 sessions. Keyed by SHA1(rendered prompt prefix). +// Format (our own, NOT bit-compatible with ds4-server's KVC files - interop +// is a follow-up plan): +// +// "DS4G" (4 bytes magic) + u32 version=1 + u32 ctx_size + +// u32 prefix_text_len + prefix_text + u64 payload_bytes + payload +class KvCache { +public: + KvCache(); // disabled (dir empty) + + // Set the cache directory. Empty disables. + void SetDir(const std::string &dir); + + // Returns the cache file path for a given rendered text prefix. + std::string Path(const std::string &rendered_text) const; + + // Look up the longest cached prefix that is also a prefix of + // `rendered_text`. Loads it into `session` if found. Returns the + // matched prefix length in bytes (0 if no hit). + size_t LoadLongestPrefix(ds4_session *session, + const std::string &rendered_text, + int ctx_size); + + // Save the current session, associated with this rendered text prefix. + void Save(ds4_session *session, const std::string &rendered_text, int ctx_size); + + bool enabled() const { return !dir_.empty(); } + +private: + std::string dir_; +}; + +// Compute SHA1 of arbitrary bytes; returns 40-char hex. +std::string Sha1Hex(const void *data, size_t len); + +} // namespace ds4cpp diff --git a/backend/cpp/ds4/package.sh b/backend/cpp/ds4/package.sh new file mode 100755 index 000000000..46d9d3c3c --- /dev/null +++ b/backend/cpp/ds4/package.sh @@ -0,0 +1,39 @@ +#!/bin/bash +set -e +CURDIR=$(dirname "$(realpath "$0")") +REPO_ROOT="${CURDIR}/../../.." + +mkdir -p "$CURDIR/package/lib" +cp -avf "$CURDIR/grpc-server" "$CURDIR/package/" +cp -rfv "$CURDIR/run.sh" "$CURDIR/package/" + +UNAME_S=$(uname -s) +if [ "$UNAME_S" = "Darwin" ]; then + # Darwin: bundle dylibs via otool -L (handled by scripts/build/ds4-darwin.sh). 
+ echo "package.sh: Darwin handled by ds4-darwin.sh" + exit 0 +fi + +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so" + LIBDIR=/lib/x86_64-linux-gnu +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so" + LIBDIR=/lib/aarch64-linux-gnu +else + echo "package.sh: unknown architecture" >&2; exit 1 +fi + +for lib in libc.so.6 libgcc_s.so.1 libstdc++.so.6 libm.so.6 libgomp.so.1 \ + libdl.so.2 librt.so.1 libpthread.so.0; do + cp -arfLv "$LIBDIR/$lib" "$CURDIR/package/lib/$lib" +done + +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "ds4 package contents:" +ls -lah "$CURDIR/package/" "$CURDIR/package/lib/" diff --git a/backend/cpp/ds4/prepare.sh b/backend/cpp/ds4/prepare.sh new file mode 100755 index 000000000..ec76e6fe2 --- /dev/null +++ b/backend/cpp/ds4/prepare.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# Clones the upstream ds4 source at the pinned commit into ./ds4/. Idempotent. +set -e + +DS4_REPO="${DS4_REPO:-https://github.com/antirez/ds4}" +DS4_VERSION="${DS4_VERSION:-ae302c2fa18cc6d9aefc021d0f27ae03c9ad2fc0}" + +if [ -d ds4/.git ]; then + current=$(git -C ds4 rev-parse HEAD 2>/dev/null || echo none) + if [ "$current" = "$DS4_VERSION" ]; then + echo "ds4 already at $DS4_VERSION" + exit 0 + fi + git -C ds4 fetch --depth 1 origin "$DS4_VERSION" + git -C ds4 checkout "$DS4_VERSION" + exit 0 +fi + +mkdir -p ds4 +cd ds4 +git init -q +git remote add origin "$DS4_REPO" +git fetch --depth 1 origin "$DS4_VERSION" +git checkout FETCH_HEAD diff --git a/backend/cpp/ds4/run.sh b/backend/cpp/ds4/run.sh new file mode 100755 index 000000000..36b0431c1 --- /dev/null +++ b/backend/cpp/ds4/run.sh @@ -0,0 +1,9 @@ +#!/bin/bash +# Entry point for the ds4 backend image / BACKEND_BINARY mode. 
+set -e +CURDIR=$(dirname "$(realpath "$0")") +export LD_LIBRARY_PATH="$CURDIR/lib:$LD_LIBRARY_PATH" +if [ -f "$CURDIR/lib/ld.so" ]; then + exec "$CURDIR/lib/ld.so" "$CURDIR/grpc-server" "$@" +fi +exec "$CURDIR/grpc-server" "$@" diff --git a/backend/index.yaml b/backend/index.yaml index 6c8c57b3e..d04a2d6d5 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -72,6 +72,29 @@ nvidia-cuda-12: "cuda12-turboquant" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-turboquant" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-turboquant" +- &ds4 + name: "ds4" + alias: "ds4" + license: mit + description: | + antirez/ds4 - DeepSeek V4 Flash inference engine. Single-model, + optimized for Metal (Darwin) and CUDA (Linux). Requires the GGUFs + published at huggingface.co/antirez/deepseek-v4-gguf. + urls: + - https://github.com/antirez/ds4 + tags: + - text-to-text + - LLM + - CPU + - CUDA + - Metal + capabilities: + default: "cpu-ds4" + nvidia: "cuda13-ds4" + nvidia-cuda-13: "cuda13-ds4" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4" + metal: "metal-ds4" + metal-darwin-arm64: "metal-ds4" - &whispercpp name: "whisper" alias: "whisper" @@ -1127,6 +1150,15 @@ nvidia-cuda-12: "cuda12-turboquant-development" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-turboquant-development" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-turboquant-development" +- !!merge <<: *ds4 + name: "ds4-development" + capabilities: + default: "cpu-ds4-development" + nvidia: "cuda13-ds4-development" + nvidia-cuda-13: "cuda13-ds4-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-ds4-development" + metal: "metal-ds4-development" + metal-darwin-arm64: "metal-ds4-development" - !!merge <<: *stablediffusionggml name: "stablediffusion-ggml-development" capabilities: @@ -1673,6 +1705,47 @@ uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-turboquant" mirrors: - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-turboquant +## ds4 +- !!merge <<: *ds4 + name: "cpu-ds4" + uri: 
"quay.io/go-skynet/local-ai-backends:latest-cpu-ds4" + mirrors: + - localai/localai-backends:latest-cpu-ds4 +- !!merge <<: *ds4 + name: "cpu-ds4-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-ds4" + mirrors: + - localai/localai-backends:master-cpu-ds4 +- !!merge <<: *ds4 + name: "cuda13-ds4" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-ds4" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-ds4 +- !!merge <<: *ds4 + name: "cuda13-ds4-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ds4" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-ds4 +- !!merge <<: *ds4 + name: "cuda13-nvidia-l4t-arm64-ds4" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-ds4" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-ds4 +- !!merge <<: *ds4 + name: "cuda13-nvidia-l4t-arm64-ds4-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-ds4" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-ds4 +- !!merge <<: *ds4 + name: "metal-ds4" + uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-ds4" + mirrors: + - localai/localai-backends:latest-metal-darwin-arm64-ds4 +- !!merge <<: *ds4 + name: "metal-ds4-development" + uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-ds4" + mirrors: + - localai/localai-backends:master-metal-darwin-arm64-ds4 ## whisper - !!merge <<: *whispercpp name: "whisper-development" diff --git a/core/gallery/importers/ds4.go b/core/gallery/importers/ds4.go new file mode 100644 index 000000000..f34f7106f --- /dev/null +++ b/core/gallery/importers/ds4.go @@ -0,0 +1,130 @@ +package importers + +import ( + "encoding/json" + "path/filepath" + "strings" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/gallery" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/downloader" + 
"github.com/mudler/LocalAI/pkg/functions" + "go.yaml.in/yaml/v2" +) + +var _ Importer = &DS4Importer{} + +// DS4Importer detects antirez/ds4 weights - single-model DeepSeek V4 Flash +// inference engine. ds4 only loads the GGUFs published at +// huggingface.co/antirez/deepseek-v4-gguf; auto-detect keys on: +// +// - the repo name itself ("antirez/deepseek-v4-gguf" anywhere in URI) +// - the canonical filename pattern "DeepSeek-V4-Flash-*.gguf" +// +// Must register BEFORE LlamaCPPImporter - both match .gguf, but ds4 is +// more specific and first-match-wins. +type DS4Importer struct{} + +func (i *DS4Importer) Name() string { return "ds4" } +func (i *DS4Importer) Modality() string { return "text" } +func (i *DS4Importer) AutoDetects() bool { return true } + +func (i *DS4Importer) Match(details Details) bool { + preferences, err := details.Preferences.MarshalJSON() + if err != nil { + return false + } + preferencesMap := make(map[string]any) + if len(preferences) > 0 { + _ = json.Unmarshal(preferences, &preferencesMap) + } + + if b, ok := preferencesMap["backend"].(string); ok && b == "ds4" { + return true + } + + if strings.Contains(details.URI, "antirez/deepseek-v4-gguf") { + return true + } + + base := filepath.Base(details.URI) + if strings.HasPrefix(base, "DeepSeek-V4-Flash-") && strings.HasSuffix(base, ".gguf") { + return true + } + + if details.HuggingFace != nil { + for _, file := range details.HuggingFace.Files { + fb := filepath.Base(file.Path) + if strings.HasPrefix(fb, "DeepSeek-V4-Flash-") && strings.HasSuffix(fb, ".gguf") { + return true + } + } + } + + return false +} + +func (i *DS4Importer) Import(details Details) (gallery.ModelConfig, error) { + preferences, err := details.Preferences.MarshalJSON() + if err != nil { + return gallery.ModelConfig{}, err + } + preferencesMap := make(map[string]any) + if len(preferences) > 0 { + _ = json.Unmarshal(preferences, &preferencesMap) + } + + name, ok := preferencesMap["name"].(string) + if !ok { + name = 
filepath.Base(details.URI) + name = strings.TrimSuffix(name, ".gguf") + } + description, ok := preferencesMap["description"].(string) + if !ok { + description = "DeepSeek V4 Flash - antirez/ds4 backend" + } + + modelConfig := config.ModelConfig{ + Name: name, + Description: description, + KnownUsecaseStrings: []string{config.UsecaseChat}, + Backend: "ds4", + PredictionOptions: schema.PredictionOptions{ + BasicModelRequest: schema.BasicModelRequest{ + Model: "ds4flash.gguf", + }, + }, + TemplateConfig: config.TemplateConfig{ + UseTokenizerTemplate: true, + }, + FunctionsConfig: functions.FunctionsConfig{ + GrammarConfig: functions.GrammarConfig{NoGrammar: true}, + // ds4 emits OpenAI-shape tool_calls in ChatDelta natively via + // our DSML parser; the Go-side regex fallback should NOT fire. + AutomaticToolParsingFallback: false, + }, + } + + cfg := gallery.ModelConfig{ + Name: name, + Description: description, + } + + // The file to fetch: derive from the URI. We standardize the local + // filename to "ds4flash.gguf" to match ds4's own convention (its CLI + // defaults to that path), so users can run the model without extra + // config. + uri := downloader.URI(details.URI) + cfg.Files = append(cfg.Files, gallery.File{ + Filename: "ds4flash.gguf", + URI: string(uri), + }) + + out, err := yaml.Marshal(modelConfig) + if err != nil { + return gallery.ModelConfig{}, err + } + cfg.ConfigFile = string(out) + return cfg, nil +} diff --git a/core/gallery/importers/ds4_test.go b/core/gallery/importers/ds4_test.go new file mode 100644 index 000000000..fcdd6ff72 --- /dev/null +++ b/core/gallery/importers/ds4_test.go @@ -0,0 +1,69 @@ +package importers_test + +import ( + "encoding/json" + "strings" + + . "github.com/mudler/LocalAI/core/gallery/importers" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("DS4Importer", func() { + var importer *DS4Importer + + BeforeEach(func() { + importer = &DS4Importer{} + }) + + Context("Match", func() { + It("matches the canonical HuggingFace repo URI", func() { + details := Details{ + URI: "huggingface://antirez/deepseek-v4-gguf/DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf", + } + Expect(importer.Match(details)).To(BeTrue()) + }) + + It("matches when filename has the DeepSeek-V4-Flash prefix", func() { + details := Details{ + URI: "https://example.com/mirror/DeepSeek-V4-Flash-Q4KExperts-F16HC-F16Compressor-F16Indexer-Q8Attn-Q8Shared-Q8Out-chat-v2.gguf", + } + Expect(importer.Match(details)).To(BeTrue()) + }) + + It("matches when backend preference is ds4", func() { + prefs := json.RawMessage(`{"backend": "ds4"}`) + details := Details{ + URI: "https://example.com/some-other.gguf", + Preferences: prefs, + } + Expect(importer.Match(details)).To(BeTrue()) + }) + + It("does not match arbitrary GGUFs (must fall through to llama-cpp)", func() { + details := Details{URI: "huggingface://TheBloke/Llama-2-7B-GGUF/llama-2-7b.Q4_K_M.gguf"} + Expect(importer.Match(details)).To(BeFalse()) + }) + + It("does not match non-GGUF assets", func() { + details := Details{URI: "https://example.com/model.bin"} + Expect(importer.Match(details)).To(BeFalse()) + }) + }) + + Context("Import", func() { + It("emits backend: ds4 and the standard ds4flash.gguf filename", func() { + details := Details{ + URI: "huggingface://antirez/deepseek-v4-gguf/DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf", + } + cfg, err := importer.Import(details) + Expect(err).NotTo(HaveOccurred()) + Expect(cfg.Files).To(HaveLen(1)) + Expect(cfg.Files[0].Filename).To(Equal("ds4flash.gguf")) + Expect(cfg.Files[0].URI).To(Equal(details.URI)) + Expect(strings.Contains(cfg.ConfigFile, "backend: ds4")).To(BeTrue(), + "ConfigFile must specify backend: ds4, got: %s", cfg.ConfigFile) + 
Expect(strings.Contains(cfg.ConfigFile, "use_tokenizer_template: true")).To(BeTrue()) + }) + }) +}) diff --git a/core/gallery/importers/importers.go b/core/gallery/importers/importers.go index dc4408e0a..02606f099 100644 --- a/core/gallery/importers/importers.go +++ b/core/gallery/importers/importers.go @@ -153,6 +153,11 @@ var defaultImporters = []Importer{ // checkpoints may carry tokenizer-adjacent artefacts. &RFDetrImporter{}, // Existing + // DS4Importer must precede LlamaCPPImporter - ds4 weights are GGUFs and + // would otherwise be claimed by the generic .gguf-handling llama-cpp + // importer. Matches only the antirez/deepseek-v4-gguf repo + filename + // pattern, so false-positives against arbitrary GGUFs are impossible. + &DS4Importer{}, &LlamaCPPImporter{}, &MLXImporter{}, &VLLMImporter{}, diff --git a/core/http/endpoints/localai/backend.go b/core/http/endpoints/localai/backend.go index fead4ce35..da5808c26 100644 --- a/core/http/endpoints/localai/backend.go +++ b/core/http/endpoints/localai/backend.go @@ -23,6 +23,8 @@ import ( // backends that should appear in the import form dropdown. 
var knownPrefOnlyBackends = []schema.KnownBackend{ // Text LLM + // ds4: antirez/ds4 - single-model DeepSeek V4 Flash engine; auto-detected via DS4Importer + {Name: "ds4", Modality: "text", AutoDetect: false, Description: "antirez/ds4 DeepSeek V4 Flash engine (auto-detected; pref-only fallback)"}, {Name: "sglang", Modality: "text", AutoDetect: false, Description: "SGLang runtime (preference-only)"}, {Name: "tinygrad", Modality: "text", AutoDetect: false, Description: "tinygrad runtime (preference-only)"}, {Name: "trl", Modality: "text", AutoDetect: false, Description: "Transformers Reinforcement Learning (preference-only)"}, diff --git a/gallery/index.yaml b/gallery/index.yaml index f16255536..5cfe31b47 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -30632,3 +30632,24 @@ - torch_dtype:bf16 parameters: model: Lightricks/LTX-2.3 +- name: deepseek-v4-flash-q2 + description: | + DeepSeek V4 Flash (IQ2XXS GGUF, ~81 GB) - only loadable via the ds4 backend. + Requires >=128 GB RAM. Metal (Darwin) or CUDA (Linux). + See https://github.com/antirez/ds4 for details. + urls: + - https://huggingface.co/antirez/deepseek-v4-gguf + tags: + - deepseek + - ds4 + - gguf + - llm + - chat + overrides: + backend: ds4 + parameters: + model: ds4flash.gguf + files: + - filename: ds4flash.gguf + sha256: 31598c67c8b8744d3bcebcd19aa62253c6dc43cef3b8adf9f593656c9e86fd8c + uri: huggingface://antirez/deepseek-v4-gguf/DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf diff --git a/scripts/build/ds4-darwin.sh b/scripts/build/ds4-darwin.sh new file mode 100755 index 000000000..23017d9a1 --- /dev/null +++ b/scripts/build/ds4-darwin.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Darwin/Metal build for the ds4 backend. Mirrors llama-cpp-darwin.sh: +# native make, otool -L for dylib bundling, then assemble an OCI tar that +# `local-ai backends install` can consume. 
+set -ex + +IMAGE_NAME="${IMAGE_NAME:-localai/ds4-darwin}" + +pushd backend/cpp/ds4 +make NATIVE=false grpc-server package +popd + +mkdir -p build/darwin +mkdir -p build/darwin/lib +mkdir -p backend-images + +cp -rf backend/cpp/ds4/grpc-server build/darwin/ +cp -rf backend/cpp/ds4/run.sh build/darwin/ + +# Apple Silicon: pick up Homebrew-installed protobuf utf8_validity if present. +if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then + ADDITIONAL_LIBS=${ADDITIONAL_LIBS:-$(ls /opt/homebrew/Cellar/protobuf/**/lib/libutf8_validity*.dylib 2>/dev/null)} +else + ADDITIONAL_LIBS=${ADDITIONAL_LIBS:-""} +fi +for file in $ADDITIONAL_LIBS; do + cp -rfv "$file" build/darwin/lib +done + +# Walk dylibs via otool -L and bundle anything that isn't a system framework. +for file in build/darwin/grpc-server; do + LIBS="$(otool -L "$file" | awk 'NR > 1 { system("echo " $1) } ' | xargs echo)" + for lib in $LIBS; do + if [[ "$lib" == *.dylib ]] && [[ -e "$lib" ]]; then + cp -rvf "$lib" build/darwin/lib + fi + done +done + +echo "Bundled libraries:" +ls -la build/darwin/lib + +# Build an OCI tar that local-ai backends install can consume. +# scripts/build/oci-pack.sh is the existing helper used by llama-cpp-darwin +# - if your tree doesn't have it, write one (5 lines: tar + manifest.json). +if [ -f scripts/build/oci-pack.sh ]; then + bash scripts/build/oci-pack.sh build/darwin backend-images/ds4.tar "$IMAGE_NAME" +else + # Fallback: simple tar - local-ai accepts a flat tar in dev environments. + tar -C build/darwin -cvf backend-images/ds4.tar . +fi diff --git a/tests/e2e-backends/backend_test.go b/tests/e2e-backends/backend_test.go index 605f32f70..95f95f666 100644 --- a/tests/e2e-backends/backend_test.go +++ b/tests/e2e-backends/backend_test.go @@ -194,7 +194,18 @@ var _ = Describe("Backend container", Ordered, func() { BeforeAll(func() { image := os.Getenv("BACKEND_IMAGE") - Expect(image).NotTo(BeEmpty(), "BACKEND_IMAGE env var must be set (e.g. 
local-ai-backend:llama-cpp)") + // BACKEND_BINARY is an escape hatch for hardware-gated backends (e.g. ds4) + // where building a full Docker image around an 80+ GB model is impractical. + // Points at a `run.sh` produced by `make -C backend/cpp/ package`. + binary := os.Getenv("BACKEND_BINARY") + Expect(image != "" || binary != "").To(BeTrue(), + "either BACKEND_IMAGE or BACKEND_BINARY env var must be set") + Expect(image != "" && binary != "").To(BeFalse(), + "BACKEND_IMAGE and BACKEND_BINARY are mutually exclusive") + if binary != "" { + Expect(filepath.Base(binary)).To(Equal("run.sh"), + "BACKEND_BINARY must point at a run.sh produced by 'make -C backend/cpp/ package'") + } modelURL := os.Getenv("BACKEND_TEST_MODEL_URL") modelFile = os.Getenv("BACKEND_TEST_MODEL_FILE") @@ -203,7 +214,11 @@ var _ = Describe("Backend container", Ordered, func() { "one of BACKEND_TEST_MODEL_URL, BACKEND_TEST_MODEL_FILE, or BACKEND_TEST_MODEL_NAME must be set") caps = parseCaps() - GinkgoWriter.Printf("Testing image=%q with capabilities=%v\n", image, keys(caps)) + src := image + if src == "" { + src = binary + } + GinkgoWriter.Printf("Testing src=%q with capabilities=%v\n", src, keys(caps)) prompt = os.Getenv("BACKEND_TEST_PROMPT") if prompt == "" { @@ -223,10 +238,13 @@ var _ = Describe("Backend container", Ordered, func() { workDir, err = os.MkdirTemp("", "backend-e2e-*") Expect(err).NotTo(HaveOccurred()) - // Extract the image filesystem so we can run run.sh directly. - binaryDir = filepath.Join(workDir, "rootfs") - Expect(os.MkdirAll(binaryDir, 0o755)).To(Succeed()) - extractImage(image, binaryDir) + if image != "" { + binaryDir = filepath.Join(workDir, "rootfs") + Expect(os.MkdirAll(binaryDir, 0o755)).To(Succeed()) + extractImage(image, binaryDir) + } else { + binaryDir = filepath.Dir(binary) + } Expect(filepath.Join(binaryDir, "run.sh")).To(BeAnExistingFile()) // Download the model once if not provided and no HF name given.