diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index aa16c6494..410cb97ad 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -574,6 +574,19 @@ jobs: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "8" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-sam3-cpp' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "sam3-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "8" @@ -1147,6 +1160,32 @@ jobs: backend: "stablediffusion-ggml" dockerfile: "./backend/Dockerfile.golang" context: "./" + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-sam3-cpp' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "sam3-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-sam3-cpp' + base-image: "ubuntu:24.04" + ubuntu-version: '2404' + runs-on: 'ubuntu-24.04-arm' + backend: "sam3-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -1907,6 +1946,59 @@ jobs: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2404' + # sam3-cpp + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-cpu-sam3-cpp' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: 
"sam3-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'sycl_f32' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f32-sam3-cpp' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "sam3-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'sycl_f16' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-sycl-f16-sam3-cpp' + runs-on: 'ubuntu-latest' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "sam3-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' + - build-type: 'vulkan' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-gpu-vulkan-sam3-cpp' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "sam3-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2404' - build-type: 'sycl_f32' cuda-major-version: "" cuda-minor-version: "" @@ -1959,6 +2051,19 @@ jobs: dockerfile: "./backend/Dockerfile.golang" context: "./" ubuntu-version: '2204' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + skip-drivers: 'false' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-arm64-sam3-cpp' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + runs-on: 'ubuntu-24.04-arm' + backend: "sam3-cpp" + dockerfile: "./backend/Dockerfile.golang" + context: "./" + ubuntu-version: '2204' # whisper - build-type: '' cuda-major-version: "" diff --git a/.github/workflows/bump_deps.yaml b/.github/workflows/bump_deps.yaml index 49e489beb..50bd56c69 100644 --- 
a/.github/workflows/bump_deps.yaml +++ b/.github/workflows/bump_deps.yaml @@ -34,6 +34,10 @@ jobs: variable: "ACESTEP_CPP_VERSION" branch: "master" file: "backend/go/acestep-cpp/Makefile" + - repository: "PABannier/sam3.cpp" + variable: "SAM3_VERSION" + branch: "main" + file: "backend/go/sam3-cpp/Makefile" runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 diff --git a/Makefile b/Makefile index eeffc8e2b..816d256fb 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/mlx-distributed backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts 
backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/acestep-cpp backends/fish-speech backends/voxtral backends/opus backends/trl backends/llama-cpp-quantization backends/kokoros backends/sam3-cpp GOCMD=go GOTEST=$(GOCMD) test @@ -593,6 +593,9 @@ BACKEND_LLAMA_CPP_QUANTIZATION = llama-cpp-quantization|python|.|false|true # Rust backends BACKEND_KOKOROS = kokoros|rust|.|false|true +# C++ backends (Go wrapper with purego) +BACKEND_SAM3_CPP = sam3-cpp|golang|.|false|true + # Helper function to build docker image for a backend # Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG) define docker-build-backend @@ -652,12 +655,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_MLX_DISTRIBUTED))) $(eval $(call generate-docker-build-target,$(BACKEND_TRL))) $(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_QUANTIZATION))) $(eval $(call generate-docker-build-target,$(BACKEND_KOKOROS))) +$(eval $(call generate-docker-build-target,$(BACKEND_SAM3_CPP))) # Pattern rule for docker-save targets docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros +docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm 
docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-fish-speech docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-acestep-cpp docker-build-voxtral docker-build-mlx-distributed docker-build-trl docker-build-llama-cpp-quantization docker-build-kokoros docker-build-sam3-cpp ######################################################## ### Mock Backend for E2E Tests diff --git a/backend/backend.proto b/backend/backend.proto index 9a5eea630..078b2edc1 100644 --- a/backend/backend.proto +++ b/backend/backend.proto @@ -444,6 +444,10 @@ message Message { message DetectOptions { string src = 1; + string prompt = 2; // Text prompt (for SAM 3 PCS mode) + repeated float points = 3; // Point coordinates as [x1, y1, label1, x2, y2, label2, ...] (label: 1=pos, 0=neg) + repeated float boxes = 4; // Box coordinates as [x1, y1, x2, y2, ...] 
+ float threshold = 5; // Detection confidence threshold } message Detection { @@ -453,6 +457,7 @@ message Detection { float height = 4; float confidence = 5; string class_name = 6; + bytes mask = 7; // PNG-encoded binary segmentation mask } message DetectResponse { diff --git a/backend/go/sam3-cpp/.gitignore b/backend/go/sam3-cpp/.gitignore new file mode 100644 index 000000000..c776f8683 --- /dev/null +++ b/backend/go/sam3-cpp/.gitignore @@ -0,0 +1,7 @@ +sources/ +build*/ +package/ +libgosam3*.so +sam3-cpp +test-models/ +test-data/ diff --git a/backend/go/sam3-cpp/CMakeLists.txt b/backend/go/sam3-cpp/CMakeLists.txt new file mode 100644 index 000000000..c43569d50 --- /dev/null +++ b/backend/go/sam3-cpp/CMakeLists.txt @@ -0,0 +1,26 @@ +cmake_minimum_required(VERSION 3.14) +project(gosam3 LANGUAGES C CXX) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# Build ggml as static libraries to avoid runtime .so dependencies +set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static libraries" FORCE) + +set(SAM3_BUILD_EXAMPLES OFF CACHE BOOL "Disable sam3.cpp examples" FORCE) +set(SAM3_BUILD_TESTS OFF CACHE BOOL "Disable sam3.cpp tests" FORCE) + +add_subdirectory(./sources/sam3.cpp) + +add_library(gosam3 MODULE gosam3.cpp) +target_link_libraries(gosam3 PRIVATE sam3 ggml) + +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) + target_link_libraries(gosam3 PRIVATE stdc++fs) +endif() + +target_include_directories(gosam3 PUBLIC + sources/sam3.cpp + sources/sam3.cpp/ggml/include +) + +set_property(TARGET gosam3 PROPERTY CXX_STANDARD 14) +set_target_properties(gosam3 PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) diff --git a/backend/go/sam3-cpp/Makefile b/backend/go/sam3-cpp/Makefile new file mode 100644 index 000000000..689ed490a --- /dev/null +++ b/backend/go/sam3-cpp/Makefile @@ -0,0 +1,122 @@ +CMAKE_ARGS?= +BUILD_TYPE?= +NATIVE?=false + +GOCMD?=go +GO_TAGS?= +JOBS?=$(shell nproc --ignore=1) + +# sam3.cpp 
+SAM3_REPO?=https://github.com/PABannier/sam3.cpp +SAM3_VERSION?=8cc6e62bc740d7972746fcd47465ddf1c2b1e3c3 + +ifeq ($(NATIVE),false) + CMAKE_ARGS+=-DGGML_NATIVE=OFF +endif + +# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically +ifeq ($(BUILD_TYPE),cublas) + CMAKE_ARGS+=-DGGML_CUDA=ON +else ifeq ($(BUILD_TYPE),openblas) + CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS +else ifeq ($(BUILD_TYPE),clblas) + CMAKE_ARGS+=-DGGML_CLBLAST=ON +else ifeq ($(BUILD_TYPE),hipblas) + ROCM_HOME ?= /opt/rocm + ROCM_PATH ?= /opt/rocm + export CXX=$(ROCM_HOME)/llvm/bin/clang++ + export CC=$(ROCM_HOME)/llvm/bin/clang + AMDGPU_TARGETS?=gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201 + CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS) +else ifeq ($(BUILD_TYPE),vulkan) + CMAKE_ARGS+=-DGGML_VULKAN=ON +else ifeq ($(OS),Darwin) + ifneq ($(BUILD_TYPE),metal) + CMAKE_ARGS+=-DGGML_METAL=OFF + else + CMAKE_ARGS+=-DGGML_METAL=ON + CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON + endif +endif + +ifeq ($(BUILD_TYPE),sycl_f16) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DGGML_SYCL_F16=ON +endif + +ifeq ($(BUILD_TYPE),sycl_f32) + CMAKE_ARGS+=-DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx +endif + +sources/sam3.cpp: + git clone --recursive $(SAM3_REPO) sources/sam3.cpp && \ + cd sources/sam3.cpp && \ + git checkout $(SAM3_VERSION) && \ + git submodule update --init --recursive --depth 1 --single-branch + +# Detect OS +UNAME_S := $(shell uname -s) + +# Only build CPU variants on Linux +ifeq ($(UNAME_S),Linux) + VARIANT_TARGETS = libgosam3-avx.so libgosam3-avx2.so libgosam3-avx512.so libgosam3-fallback.so +else + # On non-Linux (e.g., Darwin), build only fallback variant + VARIANT_TARGETS = libgosam3-fallback.so +endif + +sam3-cpp: main.go gosam3.go $(VARIANT_TARGETS) + CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o sam3-cpp 
./ + +package: sam3-cpp + bash package.sh + +build: package + +clean: purge + rm -rf libgosam3*.so sam3-cpp package sources + +purge: + rm -rf build* + +# Build all variants (Linux only) +ifeq ($(UNAME_S),Linux) +libgosam3-avx.so: sources/sam3.cpp + $(MAKE) purge + $(info ${GREEN}I sam3-cpp build info:avx${RESET}) + SO_TARGET=libgosam3-avx.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgosam3-custom + rm -rfv build* + +libgosam3-avx2.so: sources/sam3.cpp + $(MAKE) purge + $(info ${GREEN}I sam3-cpp build info:avx2${RESET}) + SO_TARGET=libgosam3-avx2.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgosam3-custom + rm -rfv build* + +libgosam3-avx512.so: sources/sam3.cpp + $(MAKE) purge + $(info ${GREEN}I sam3-cpp build info:avx512${RESET}) + SO_TARGET=libgosam3-avx512.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) libgosam3-custom + rm -rfv build* +endif + +# Build fallback variant (all platforms) +libgosam3-fallback.so: sources/sam3.cpp + $(MAKE) purge + $(info ${GREEN}I sam3-cpp build info:fallback${RESET}) + SO_TARGET=libgosam3-fallback.so CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) libgosam3-custom + rm -rfv build* + +libgosam3-custom: CMakeLists.txt gosam3.cpp gosam3.h + mkdir -p build-$(SO_TARGET) && \ + cd build-$(SO_TARGET) && \ + cmake .. $(CMAKE_ARGS) && \ + cmake --build . --config Release -j$(JOBS) && \ + cd .. 
&& \ + mv build-$(SO_TARGET)/libgosam3.so ./$(SO_TARGET) + +all: sam3-cpp package diff --git a/backend/go/sam3-cpp/gosam3.cpp b/backend/go/sam3-cpp/gosam3.cpp new file mode 100644 index 000000000..fe4e9e997 --- /dev/null +++ b/backend/go/sam3-cpp/gosam3.cpp @@ -0,0 +1,193 @@ +#include "sam3.h" +#include "gosam3.h" + +#include +#include +#include +#include + +#define STB_IMAGE_WRITE_IMPLEMENTATION +#define STB_IMAGE_WRITE_STATIC +#include "stb_image_write.h" + +// Static state +static std::shared_ptr g_model; +static sam3_state_ptr g_state; +static sam3_result g_result; +static std::vector> g_mask_pngs; + +// Callback for stbi_write_png_to_mem via stbi_write_png_to_func +static void png_write_callback(void *context, void *data, int size) { + auto *buf = static_cast*>(context); + auto *bytes = static_cast(data); + buf->insert(buf->end(), bytes, bytes + size); +} + +// Encode all masks as PNGs after segmentation +static void encode_masks_as_png() { + g_mask_pngs.clear(); + g_mask_pngs.resize(g_result.detections.size()); + + for (size_t i = 0; i < g_result.detections.size(); i++) { + const auto &mask = g_result.detections[i].mask; + if (mask.width > 0 && mask.height > 0 && !mask.data.empty()) { + stbi_write_png_to_func(png_write_callback, &g_mask_pngs[i], + mask.width, mask.height, 1, + mask.data.data(), mask.width); + } + } +} + +extern "C" { + +int sam3_cpp_load_model(const char *model_path, int threads) { + sam3_params params; + params.model_path = model_path; + params.n_threads = threads; + params.use_gpu = true; + + g_model = sam3_load_model(params); + if (!g_model) { + fprintf(stderr, "[sam3-cpp] Failed to load model: %s\n", model_path); + return 1; + } + + g_state = sam3_create_state(*g_model, params); + if (!g_state) { + fprintf(stderr, "[sam3-cpp] Failed to create state\n"); + g_model.reset(); + return 2; + } + + fprintf(stderr, "[sam3-cpp] Model loaded: %s (threads=%d)\n", model_path, threads); + return 0; +} + +int sam3_cpp_encode_image(const char 
*image_path) { + if (!g_model || !g_state) { + fprintf(stderr, "[sam3-cpp] Model not loaded\n"); + return 1; + } + + sam3_image img = sam3_load_image(image_path); + if (img.data.empty()) { + fprintf(stderr, "[sam3-cpp] Failed to load image: %s\n", image_path); + return 2; + } + + if (!sam3_encode_image(*g_state, *g_model, img)) { + fprintf(stderr, "[sam3-cpp] Failed to encode image\n"); + return 3; + } + + return 0; +} + +int sam3_cpp_segment_pvs(float *points, int n_point_triples, + float *boxes, int n_box_quads, + float threshold) { + if (!g_model || !g_state) { + return -1; + } + + sam3_pvs_params pvs_params; + + // Parse points: each triple is [x, y, label]; label > 0.5 is a positive point, anything else a negative one + for (int i = 0; i < n_point_triples; i++) { + float x = points[i * 3]; + float y = points[i * 3 + 1]; + float label = points[i * 3 + 2]; + sam3_point pt = {x, y}; + if (label > 0.5f) { + pvs_params.pos_points.push_back(pt); + } else { + pvs_params.neg_points.push_back(pt); + } + } + + // Parse boxes: each quad is [x1, y1, x2, y2]; only the first box is used, the rest are ignored. NOTE(review): threshold is never copied into pvs_params, so it has no effect in this mode - confirm that is intended + if (n_box_quads > 0) { + pvs_params.box = {boxes[0], boxes[1], boxes[2], boxes[3]}; + pvs_params.use_box = true; + } + + g_result = sam3_segment_pvs(*g_state, *g_model, pvs_params); + encode_masks_as_png(); + + return static_cast(g_result.detections.size()); +} + +int sam3_cpp_segment_pcs(const char *text_prompt, float threshold) { + if (!g_model || !g_state) { + return -1; + } + + // PCS mode requires SAM 3 (full model with text encoder); visual-only or non-SAM3 models are rejected with -1 + if (sam3_is_visual_only(*g_model) || + sam3_get_model_type(*g_model) != SAM3_MODEL_SAM3) { + fprintf(stderr, "[sam3-cpp] PCS mode requires full SAM 3 model\n"); + return -1; + } + + sam3_pcs_params pcs_params; + pcs_params.text_prompt = text_prompt; + pcs_params.score_threshold = threshold > 0 ? 
threshold : 0.5f; + + g_result = sam3_segment_pcs(*g_state, *g_model, pcs_params); + encode_masks_as_png(); + + return static_cast(g_result.detections.size()); +} + +int sam3_cpp_get_n_detections(void) { + return static_cast(g_result.detections.size()); +} + +float sam3_cpp_get_detection_x(int i) { + if (i < 0 || i >= static_cast(g_result.detections.size())) return 0; + return g_result.detections[i].box.x0; +} + +float sam3_cpp_get_detection_y(int i) { + if (i < 0 || i >= static_cast(g_result.detections.size())) return 0; + return g_result.detections[i].box.y0; +} + +float sam3_cpp_get_detection_w(int i) { + if (i < 0 || i >= static_cast(g_result.detections.size())) return 0; + const auto &box = g_result.detections[i].box; + return box.x1 - box.x0; +} + +float sam3_cpp_get_detection_h(int i) { + if (i < 0 || i >= static_cast(g_result.detections.size())) return 0; + const auto &box = g_result.detections[i].box; + return box.y1 - box.y0; +} + +float sam3_cpp_get_detection_score(int i) { + if (i < 0 || i >= static_cast(g_result.detections.size())) return 0; + return g_result.detections[i].score; +} + +int sam3_cpp_get_detection_mask_png(int i, unsigned char *buf, int buf_size) { + if (i < 0 || i >= static_cast(g_mask_pngs.size())) return 0; + + const auto &png = g_mask_pngs[i]; + int size = static_cast(png.size()); + + if (buf == nullptr) { + return size; + } + + int to_copy = size < buf_size ? 
size : buf_size; + if (to_copy > 0) memcpy(buf, png.data(), to_copy); // a zero or negative buf_size must never reach memcpy as a length + return to_copy > 0 ? to_copy : 0; +} + +void sam3_cpp_free_results(void) { + g_result.detections.clear(); + g_mask_pngs.clear(); +} + +} // extern "C" diff --git a/backend/go/sam3-cpp/gosam3.go b/backend/go/sam3-cpp/gosam3.go new file mode 100644 index 000000000..27438a6f8 --- /dev/null +++ b/backend/go/sam3-cpp/gosam3.go @@ -0,0 +1,150 @@ +package main + +import ( + "encoding/base64" + "fmt" + "os" + "path/filepath" + "runtime" + "unsafe" + + "github.com/mudler/LocalAI/pkg/grpc/base" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" +) + +// SAM3 is the LocalAI gRPC backend that runs segmentation through the sam3.cpp C wrapper. +type SAM3 struct { + base.SingleThread +} + +// Bindings to the sam3_cpp_* C functions; main() fills them in from the shared library. +var ( + CppLoadModel func(modelPath string, threads int) int + CppEncodeImage func(imagePath string) int + CppSegmentPVS func(points uintptr, nPointTriples int, boxes uintptr, nBoxQuads int, threshold float32) int + CppSegmentPCS func(textPrompt string, threshold float32) int + CppGetNDetections func() int + CppGetDetectionX func(i int) float32 + CppGetDetectionY func(i int) float32 + CppGetDetectionW func(i int) float32 + CppGetDetectionH func(i int) float32 + CppGetDetectionScore func(i int) float32 + CppGetDetectionMaskPNG func(i int, buf uintptr, bufSize int) int + CppFreeResults func() +) + +// Load resolves the model file (relative names are joined with opts.ModelPath) and loads it, defaulting to 4 threads. +func (s *SAM3) Load(opts *pb.ModelOptions) error { + modelFile := opts.ModelFile + if modelFile == "" { + modelFile = opts.Model + } + + var modelPath string + if filepath.IsAbs(modelFile) { + modelPath = modelFile + } else { + modelPath = filepath.Join(opts.ModelPath, modelFile) + } + + threads := int(opts.Threads) + if threads <= 0 { + threads = 4 + } + + ret := CppLoadModel(modelPath, threads) + if ret != 0 { + return fmt.Errorf("failed to load SAM3 model (error %d): %s", ret, modelPath) + } + + return nil +} + +// Detect decodes the base64 image in opts.Src, encodes it, and segments it with the text prompt when +// one is set (PCS) or with the point/box prompts otherwise (PVS), returning one Detection per result. +func (s *SAM3) Detect(opts *pb.DetectOptions) (pb.DetectResponse, error) { + // Decode base64 image and write to temp file + imgData, err := base64.StdEncoding.DecodeString(opts.Src) + if err != nil { + return pb.DetectResponse{}, fmt.Errorf("failed to decode 
image: %w", err) + } + + tmpFile, err := os.CreateTemp("", "sam3-*.png") + if err != nil { + return pb.DetectResponse{}, fmt.Errorf("failed to create temp file: %w", err) + } + defer os.Remove(tmpFile.Name()) + + if _, err := tmpFile.Write(imgData); err != nil { + tmpFile.Close() + return pb.DetectResponse{}, fmt.Errorf("failed to write temp file: %w", err) + } + tmpFile.Close() + + // Encode image + ret := CppEncodeImage(tmpFile.Name()) + if ret != 0 { + return pb.DetectResponse{}, fmt.Errorf("failed to encode image (error %d)", ret) + } + + threshold := opts.Threshold + if threshold <= 0 { + threshold = 0.5 + } + + // Determine segmentation mode + var nDetections int + if opts.Prompt != "" { + // Text-prompted segmentation (PCS mode, SAM 3 only) + nDetections = CppSegmentPCS(opts.Prompt, threshold) + } else { + // Point/box-prompted segmentation (PVS mode) + var pointsPtr uintptr + var boxesPtr uintptr + nPointTriples := len(opts.Points) / 3 + nBoxQuads := len(opts.Boxes) / 4 + + if nPointTriples > 0 { + pointsPtr = uintptr(unsafe.Pointer(&opts.Points[0])) + } + if nBoxQuads > 0 { + boxesPtr = uintptr(unsafe.Pointer(&opts.Boxes[0])) + } + + nDetections = CppSegmentPVS(pointsPtr, nPointTriples, boxesPtr, nBoxQuads, threshold) + runtime.KeepAlive(opts) // the slices were handed over as bare uintptrs; keep them reachable until the C call has returned + } + + if int32(nDetections) < 0 { // the C side returns a 32-bit int; -1 may arrive as 4294967295 if widened without sign extension + return pb.DetectResponse{}, fmt.Errorf("segmentation failed") + } + + defer CppFreeResults() + + // Build response + detections := make([]*pb.Detection, nDetections) + for i := 0; i < nDetections; i++ { + det := &pb.Detection{ + X: CppGetDetectionX(i), + Y: CppGetDetectionY(i), + Width: CppGetDetectionW(i), + Height: CppGetDetectionH(i), + Confidence: CppGetDetectionScore(i), + ClassName: "segment", + } + + // Get mask PNG + maskSize := CppGetDetectionMaskPNG(i, 0, 0) + if maskSize > 0 { + maskBuf := make([]byte, maskSize) + CppGetDetectionMaskPNG(i, uintptr(unsafe.Pointer(&maskBuf[0])), maskSize) + det.Mask = maskBuf + } + + detections[i] = det + } + + return pb.DetectResponse{ + Detections: detections, + }, 
nil +} diff --git a/backend/go/sam3-cpp/gosam3.h b/backend/go/sam3-cpp/gosam3.h new file mode 100644 index 000000000..bd39718d0 --- /dev/null +++ b/backend/go/sam3-cpp/gosam3.h @@ -0,0 +1,51 @@ +#ifndef GOSAM3_H +#define GOSAM3_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Load model from file. Returns 0 on success, 1 if the model cannot be loaded, 2 if its inference state cannot be created. +int sam3_cpp_load_model(const char *model_path, int threads); + +// Encode an image from file path. Must be called before segmentation. +// Returns 0 on success, 1 if no model is loaded, 2 if the image cannot be loaded, 3 if encoding fails. +int sam3_cpp_encode_image(const char *image_path); + +// Segment with point/box prompts (PVS mode). +// points: flat array of [x, y, label] triples (label > 0.5 = positive, anything else = negative) +// boxes: flat array of [x1, y1, x2, y2] quads; only the first quad is used +// Returns number of detections, or -1 if no model is loaded. threshold is currently not applied in this mode. +int sam3_cpp_segment_pvs(float *points, int n_point_triples, + float *boxes, int n_box_quads, + float threshold); + +// Segment with text prompt (PCS mode, SAM 3 only). A threshold <= 0 selects the default of 0.5. +// Returns number of detections, or -1 if no model is loaded or the model is not a full SAM 3 model. +int sam3_cpp_segment_pcs(const char *text_prompt, float threshold); + +// Number of detections currently held (valid after a segment call, until results are freed). +int sam3_cpp_get_n_detections(void); + +// Get bounding box for detection i (as x, y, width, height); an out-of-range i yields 0. +float sam3_cpp_get_detection_x(int i); +float sam3_cpp_get_detection_y(int i); +float sam3_cpp_get_detection_w(int i); +float sam3_cpp_get_detection_h(int i); + +// Get confidence score for detection i; an out-of-range i yields 0. +float sam3_cpp_get_detection_score(int i); + +// Get mask as PNG-encoded bytes. Returns 0 when i is out of range or no PNG was produced for that detection. +// If buf is NULL, returns the required buffer size. +// Otherwise writes up to buf_size bytes and returns bytes written. +int sam3_cpp_get_detection_mask_png(int i, unsigned char *buf, int buf_size); + +// Free current detection results. 
+void sam3_cpp_free_results(void); + +#ifdef __cplusplus +} +#endif + +#endif // GOSAM3_H diff --git a/backend/go/sam3-cpp/main.go b/backend/go/sam3-cpp/main.go new file mode 100644 index 000000000..c83a59285 --- /dev/null +++ b/backend/go/sam3-cpp/main.go @@ -0,0 +1,56 @@ +package main + +import ( + "flag" + "os" + + "github.com/ebitengine/purego" + grpc "github.com/mudler/LocalAI/pkg/grpc" +) + +var ( + addr = flag.String("addr", "localhost:50051", "the address to connect to") +) + +type LibFuncs struct { + FuncPtr any + Name string +} + +func main() { + // Get library name from environment variable, default to fallback + libName := os.Getenv("SAM3_LIBRARY") + if libName == "" { + libName = "./libgosam3-fallback.so" + } + + gosamLib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL) + if err != nil { + panic(err) + } + + libFuncs := []LibFuncs{ + {&CppLoadModel, "sam3_cpp_load_model"}, + {&CppEncodeImage, "sam3_cpp_encode_image"}, + {&CppSegmentPVS, "sam3_cpp_segment_pvs"}, + {&CppSegmentPCS, "sam3_cpp_segment_pcs"}, + {&CppGetNDetections, "sam3_cpp_get_n_detections"}, + {&CppGetDetectionX, "sam3_cpp_get_detection_x"}, + {&CppGetDetectionY, "sam3_cpp_get_detection_y"}, + {&CppGetDetectionW, "sam3_cpp_get_detection_w"}, + {&CppGetDetectionH, "sam3_cpp_get_detection_h"}, + {&CppGetDetectionScore, "sam3_cpp_get_detection_score"}, + {&CppGetDetectionMaskPNG, "sam3_cpp_get_detection_mask_png"}, + {&CppFreeResults, "sam3_cpp_free_results"}, + } + + for _, lf := range libFuncs { + purego.RegisterLibFunc(lf.FuncPtr, gosamLib, lf.Name) + } + + flag.Parse() + + if err := grpc.StartServer(*addr, &SAM3{}); err != nil { + panic(err) + } +} diff --git a/backend/go/sam3-cpp/package.sh b/backend/go/sam3-cpp/package.sh new file mode 100755 index 000000000..254aef286 --- /dev/null +++ b/backend/go/sam3-cpp/package.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Script to copy the appropriate libraries based on architecture + +set -e + +CURDIR=$(dirname "$(realpath $0)") 
+REPO_ROOT="${CURDIR}/../../.." + +# Create lib directory +mkdir -p $CURDIR/package/lib + +cp -avf $CURDIR/libgosam3-*.so $CURDIR/package/ +cp -avf $CURDIR/sam3-cpp $CURDIR/package/ +cp -fv $CURDIR/run.sh $CURDIR/package/ + +# Detect architecture and copy appropriate libraries +if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then + # x86_64 architecture + echo "Detected x86_64 architecture, copying x86_64 libraries..." + cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so + cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then + # ARM64 architecture + echo "Detected ARM64 architecture, copying ARM64 libraries..." 
+ cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so + cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6 + cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2 + cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1 + cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0 +elif [ $(uname -s) = "Darwin" ]; then + echo "Detected Darwin" +else + echo "Error: Could not detect architecture" + exit 1 +fi + +# Package GPU libraries based on BUILD_TYPE +GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh" +if [ -f "$GPU_LIB_SCRIPT" ]; then + echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..." 
+ source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib" + package_gpu_libs +fi + +echo "Packaging completed successfully" +ls -liah $CURDIR/package/ +ls -liah $CURDIR/package/lib/ diff --git a/backend/go/sam3-cpp/run.sh b/backend/go/sam3-cpp/run.sh new file mode 100755 index 000000000..423ed9199 --- /dev/null +++ b/backend/go/sam3-cpp/run.sh @@ -0,0 +1,52 @@ +#!/bin/bash +set -ex + +# Get the absolute current dir where the script is located +CURDIR=$(dirname "$(realpath $0)") + +cd / + +echo "CPU info:" +if [ "$(uname)" != "Darwin" ]; then + grep -e "model\sname" /proc/cpuinfo | head -1 + grep -e "flags" /proc/cpuinfo | head -1 +fi + +LIBRARY="$CURDIR/libgosam3-fallback.so" + +if [ "$(uname)" != "Darwin" ]; then + if grep -q -e "\savx\s" /proc/cpuinfo ; then + echo "CPU: AVX found OK" + if [ -e $CURDIR/libgosam3-avx.so ]; then + LIBRARY="$CURDIR/libgosam3-avx.so" + fi + fi + + if grep -q -e "\savx2\s" /proc/cpuinfo ; then + echo "CPU: AVX2 found OK" + if [ -e $CURDIR/libgosam3-avx2.so ]; then + LIBRARY="$CURDIR/libgosam3-avx2.so" + fi + fi + + # Check avx 512 + if grep -q -e "\savx512f\s" /proc/cpuinfo ; then + echo "CPU: AVX512F found OK" + if [ -e $CURDIR/libgosam3-avx512.so ]; then + LIBRARY="$CURDIR/libgosam3-avx512.so" + fi + fi +fi + +export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH +export SAM3_LIBRARY=$LIBRARY + +# If there is a lib/ld.so, use it +if [ -f $CURDIR/lib/ld.so ]; then + echo "Using lib/ld.so" + echo "Using library: $LIBRARY" + exec $CURDIR/lib/ld.so $CURDIR/sam3-cpp "$@" +fi + +echo "Using library: $LIBRARY" +exec $CURDIR/sam3-cpp "$@" diff --git a/backend/go/sam3-cpp/test.sh b/backend/go/sam3-cpp/test.sh new file mode 100755 index 000000000..e34c7402e --- /dev/null +++ b/backend/go/sam3-cpp/test.sh @@ -0,0 +1,50 @@ +#!/bin/bash +set -e + +CURDIR=$(dirname "$(realpath $0)") + +echo "Running sam3-cpp backend tests..." + +# The test requires a SAM model in GGML format. +# Uses EdgeTAM Q4_0 (~15MB) for fast CI testing. 
+SAM3_MODEL_DIR="${SAM3_MODEL_DIR:-$CURDIR/test-models}" +SAM3_MODEL_FILE="${SAM3_MODEL_FILE:-edgetam_q4_0.ggml}" +SAM3_MODEL_URL="${SAM3_MODEL_URL:-https://huggingface.co/PABannier/sam3.cpp/resolve/main/edgetam_q4_0.ggml}" + +# Download model if not present (-f makes curl fail on HTTP errors instead of saving the error page as the model) +if [ ! -f "$SAM3_MODEL_DIR/$SAM3_MODEL_FILE" ]; then + echo "Downloading EdgeTAM Q4_0 model for testing..." + mkdir -p "$SAM3_MODEL_DIR" + curl -fL -o "$SAM3_MODEL_DIR/$SAM3_MODEL_FILE" "$SAM3_MODEL_URL" --progress-bar + echo "Model downloaded." +fi + +# Create a test image (64x64 solid red PNG) with a small inline Python script +# This is a minimal valid PNG for testing the pipeline +TEST_IMAGE_DIR="$CURDIR/test-data" +mkdir -p "$TEST_IMAGE_DIR" + +# Generate the test image when python3 is available; without it no image is created +if command -v python3 &> /dev/null; then + python3 -c " +import struct, zlib, base64 +def create_png(width, height, r, g, b): + raw = b'' + for y in range(height): + raw += b'\x00' # filter byte + for x in range(width): + raw += bytes([r, g, b]) + def chunk(ctype, data): + c = ctype + data + return struct.pack('>I', len(data)) + c + struct.pack('>I', zlib.crc32(c) & 0xffffffff) + ihdr = struct.pack('>IIBBBBB', width, height, 8, 2, 0, 0, 0) + return b'\x89PNG\r\n\x1a\n' + chunk(b'IHDR', ihdr) + chunk(b'IDAT', zlib.compress(raw)) + chunk(b'IEND', b'') +with open('$TEST_IMAGE_DIR/test.png', 'wb') as f: + f.write(create_png(64, 64, 255, 0, 0)) +" + echo "Test image created." +fi + +echo "sam3-cpp test setup complete." +echo "Model: $SAM3_MODEL_DIR/$SAM3_MODEL_FILE" +echo "Note: Full integration tests run via the LocalAI test-extra target." 
diff --git a/backend/index.yaml b/backend/index.yaml index c424f5e42..6c482c12f 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -125,6 +125,31 @@ nvidia-cuda-13: "cuda13-rfdetr" nvidia-cuda-12: "cuda12-rfdetr" nvidia-l4t-cuda-12: "nvidia-l4t-arm64-rfdetr" +- &sam3cpp + name: "sam3-cpp" + alias: "sam3-cpp" + license: mit + description: | + Segment Anything Model (SAM 3/2/EdgeTAM) in C/C++ using GGML. + Supports text-prompted and point/box-prompted image segmentation. + urls: + - https://github.com/PABannier/sam3.cpp + tags: + - image-segmentation + - object-detection + - sam3 + - gpu + - cpu + capabilities: + default: "cpu-sam3-cpp" + nvidia: "cuda12-sam3-cpp" + nvidia-cuda-12: "cuda12-sam3-cpp" + nvidia-cuda-13: "cuda13-sam3-cpp" + nvidia-l4t: "nvidia-l4t-arm64-sam3-cpp" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-sam3-cpp" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-sam3-cpp" + intel: "intel-sycl-f32-sam3-cpp" + vulkan: "vulkan-sam3-cpp" - &vllm name: "vllm" license: apache-2.0 @@ -1628,6 +1653,89 @@ uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-rfdetr" mirrors: - localai/localai-backends:master-metal-darwin-arm64-rfdetr +## sam3-cpp +- !!merge <<: *sam3cpp + name: "sam3-cpp-development" + capabilities: + default: "cpu-sam3-cpp-development" + nvidia: "cuda12-sam3-cpp-development" + nvidia-cuda-12: "cuda12-sam3-cpp-development" + nvidia-cuda-13: "cuda13-sam3-cpp-development" + nvidia-l4t: "nvidia-l4t-arm64-sam3-cpp-development" + nvidia-l4t-cuda-12: "nvidia-l4t-arm64-sam3-cpp-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-sam3-cpp-development" + intel: "intel-sycl-f32-sam3-cpp-development" + vulkan: "vulkan-sam3-cpp-development" +- !!merge <<: *sam3cpp + name: "cpu-sam3-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-sam3-cpp" + mirrors: + - localai/localai-backends:latest-cpu-sam3-cpp +- !!merge <<: *sam3cpp + name: "cpu-sam3-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-sam3-cpp" 
+ mirrors: + - localai/localai-backends:master-cpu-sam3-cpp +- !!merge <<: *sam3cpp + name: "cuda12-sam3-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-sam3-cpp" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-sam3-cpp +- !!merge <<: *sam3cpp + name: "cuda12-sam3-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-sam3-cpp" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-sam3-cpp +- !!merge <<: *sam3cpp + name: "cuda13-sam3-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-sam3-cpp" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-sam3-cpp +- !!merge <<: *sam3cpp + name: "cuda13-sam3-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-sam3-cpp" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-sam3-cpp +- !!merge <<: *sam3cpp + name: "nvidia-l4t-arm64-sam3-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-sam3-cpp" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-arm64-sam3-cpp +- !!merge <<: *sam3cpp + name: "nvidia-l4t-arm64-sam3-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-sam3-cpp" + mirrors: + - localai/localai-backends:master-nvidia-l4t-arm64-sam3-cpp +- !!merge <<: *sam3cpp + name: "cuda13-nvidia-l4t-arm64-sam3-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-sam3-cpp" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-sam3-cpp +- !!merge <<: *sam3cpp + name: "cuda13-nvidia-l4t-arm64-sam3-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-sam3-cpp" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-sam3-cpp +- !!merge <<: *sam3cpp + name: "intel-sycl-f32-sam3-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-sam3-cpp" + mirrors: + - 
localai/localai-backends:latest-gpu-intel-sycl-f32-sam3-cpp +- !!merge <<: *sam3cpp + name: "intel-sycl-f32-sam3-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-sam3-cpp" + mirrors: + - localai/localai-backends:master-gpu-intel-sycl-f32-sam3-cpp +- !!merge <<: *sam3cpp + name: "vulkan-sam3-cpp" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-sam3-cpp" + mirrors: + - localai/localai-backends:latest-gpu-vulkan-sam3-cpp +- !!merge <<: *sam3cpp + name: "vulkan-sam3-cpp-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-sam3-cpp" + mirrors: + - localai/localai-backends:master-gpu-vulkan-sam3-cpp ## Rerankers - !!merge <<: *rerankers name: "rerankers-development" diff --git a/core/backend/detection.go b/core/backend/detection.go index c7f866862..1a98c47a9 100644 --- a/core/backend/detection.go +++ b/core/backend/detection.go @@ -13,6 +13,10 @@ import ( func Detection( sourceFile string, + prompt string, + points []float32, + boxes []float32, + threshold float32, loader *model.ModelLoader, appConfig *config.ApplicationConfig, modelConfig config.ModelConfig, @@ -35,7 +39,11 @@ func Detection( } res, err := detectionModel.Detect(context.Background(), &proto.DetectOptions{ - Src: sourceFile, + Src: sourceFile, + Prompt: prompt, + Points: points, + Boxes: boxes, + Threshold: threshold, }) if appConfig.EnableTracing { diff --git a/core/config/model_config.go b/core/config/model_config.go index a4815c766..952398d5b 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -705,7 +705,8 @@ func (c *ModelConfig) GuessUsecases(u ModelConfigUsecase) bool { } if (u & FLAG_DETECTION) == FLAG_DETECTION { - if c.Backend != "rfdetr" { + detectionBackends := []string{"rfdetr", "sam3-cpp"} + if !slices.Contains(detectionBackends, c.Backend) { return false } } diff --git a/core/http/endpoints/localai/detection.go b/core/http/endpoints/localai/detection.go index 3dcbbde6d..a33711dbd 100644 --- 
a/core/http/endpoints/localai/detection.go +++ b/core/http/endpoints/localai/detection.go @@ -1,6 +1,8 @@ package localai import ( + "encoding/base64" + "github.com/labstack/echo/v4" "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" @@ -37,7 +39,7 @@ func DetectionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appC return err } - res, err := backend.Detection(image, ml, appConfig, *cfg) + res, err := backend.Detection(image, input.Prompt, input.Points, input.Boxes, input.Threshold, ml, appConfig, *cfg) if err != nil { return err } @@ -46,12 +48,18 @@ func DetectionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appC Detections: make([]schema.Detection, len(res.Detections)), } for i, detection := range res.Detections { + var mask string + if len(detection.Mask) > 0 { + mask = base64.StdEncoding.EncodeToString(detection.Mask) + } response.Detections[i] = schema.Detection{ - X: detection.X, - Y: detection.Y, - Width: detection.Width, - Height: detection.Height, - ClassName: detection.ClassName, + X: detection.X, + Y: detection.Y, + Width: detection.Width, + Height: detection.Height, + ClassName: detection.ClassName, + Confidence: detection.Confidence, + Mask: mask, } } diff --git a/core/schema/localai.go b/core/schema/localai.go index a8593d30d..65dc7a864 100644 --- a/core/schema/localai.go +++ b/core/schema/localai.go @@ -152,7 +152,11 @@ type SystemInformationResponse struct { type DetectionRequest struct { BasicModelRequest - Image string `json:"image"` // URL or base64-encoded image to analyze + Image string `json:"image"` // URL or base64-encoded image to analyze + Prompt string `json:"prompt,omitempty"` // Text prompt (for SAM 3 PCS mode) + Points []float32 `json:"points,omitempty"` // Point coordinates as [x,y,label,...] triples (label: 1=pos, 0=neg) + Boxes []float32 `json:"boxes,omitempty"` // Box coordinates as [x1,y1,x2,y2,...] 
quads + Threshold float32 `json:"threshold,omitempty"` // Detection confidence threshold } type DetectionResponse struct { @@ -160,11 +164,13 @@ type DetectionResponse struct { } type Detection struct { - X float32 `json:"x"` - Y float32 `json:"y"` - Width float32 `json:"width"` - Height float32 `json:"height"` - ClassName string `json:"class_name"` + X float32 `json:"x"` + Y float32 `json:"y"` + Width float32 `json:"width"` + Height float32 `json:"height"` + ClassName string `json:"class_name"` + Confidence float32 `json:"confidence,omitempty"` + Mask string `json:"mask,omitempty"` // base64-encoded PNG segmentation mask } type ImportModelRequest struct { diff --git a/docs/content/features/object-detection.md b/docs/content/features/object-detection.md index 43e8e7584..90ab2ef9e 100644 --- a/docs/content/features/object-detection.md +++ b/docs/content/features/object-detection.md @@ -5,7 +5,7 @@ weight = 13 url = "/features/object-detection/" +++ -LocalAI supports object detection through various backends. This feature allows you to identify and locate objects within images with high accuracy and real-time performance. Currently, [RF-DETR](https://github.com/roboflow/rf-detr) is available as an implementation. +LocalAI supports object detection and image segmentation through various backends. This feature allows you to identify and locate objects within images with high accuracy and real-time performance. Available backends include [RF-DETR](https://github.com/roboflow/rf-detr) for object detection and [sam3.cpp](https://github.com/PABannier/sam3.cpp) for image segmentation (SAM 3/2/EdgeTAM). 
## Overview @@ -14,6 +14,8 @@ Object detection in LocalAI is implemented through dedicated backends that can i **Key Features:** - Real-time object detection - High accuracy detection with bounding boxes +- Image segmentation with binary masks (SAM backends) +- Text-prompted, point-prompted, and box-prompted segmentation - Support for multiple hardware accelerators (CPU, NVIDIA GPU, Intel GPU, AMD GPU) - Structured detection results with confidence scores - Easy integration through the `/v1/detection` endpoint @@ -45,6 +47,10 @@ The request body should contain: - `image`: The image to analyze, which can be: - A URL to an image - A base64-encoded image +- `prompt` (optional): Text prompt for text-prompted segmentation (SAM 3 only) +- `points` (optional): Point coordinates as `[x, y, label, ...]` triples (label: 1=positive, 0=negative) +- `boxes` (optional): Box coordinates as `[x1, y1, x2, y2, ...]` quads +- `threshold` (optional): Detection confidence threshold (default: 0.5) ### Response Format @@ -78,6 +84,7 @@ Each detection includes: - `width`, `height`: Dimensions of the bounding box - `confidence`: Detection confidence score (0.0 to 1.0) - `class_name`: The detected object class +- `mask` (optional): Base64-encoded PNG binary segmentation mask (SAM backends only) ## Backends @@ -123,6 +130,76 @@ Currently, the following model is available in the [Model Gallery]({{%relref "fe You can browse and install this model through the LocalAI web interface or using the command line. +### SAM3 Backend (sam3-cpp) + +The sam3-cpp backend provides image segmentation using [sam3.cpp](https://github.com/PABannier/sam3.cpp), a portable C++ implementation of Meta's Segment Anything Model. 
It supports multiple model architectures: + +- **SAM 3**: Full model with text encoder for text-prompted detection and segmentation +- **SAM 2 / SAM 2.1**: Hiera backbone models in multiple sizes +- **SAM 3 Visual-Only**: Point/box segmentation without text encoder +- **EdgeTAM**: Ultra-efficient mobile variant (~15MB quantized) + +#### Setup + +1. **Manual Configuration** + + Create a model configuration file in your `models` directory: + + ```yaml + name: sam3 + backend: sam3-cpp + parameters: + model: edgetam_q4_0.ggml + threads: 4 + known_usecases: + - detection + ``` + + Download the model from [Hugging Face](https://huggingface.co/PABannier/sam3.cpp). + +#### Segmentation Modes + +**Point-prompted segmentation** (all models): + +```bash +curl -X POST http://localhost:8080/v1/detection \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sam3", + "image": "data:image/jpeg;base64,...", + "points": [256.0, 256.0, 1.0], + "threshold": 0.5 + }' +``` + +**Box-prompted segmentation** (all models): + +```bash +curl -X POST http://localhost:8080/v1/detection \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sam3", + "image": "data:image/jpeg;base64,...", + "boxes": [100.0, 100.0, 400.0, 400.0], + "threshold": 0.5 + }' +``` + +**Text-prompted segmentation** (SAM 3 full model only): + +```bash +curl -X POST http://localhost:8080/v1/detection \ + -H "Content-Type: application/json" \ + -d '{ + "model": "sam3", + "image": "data:image/jpeg;base64,...", + "prompt": "cat", + "threshold": 0.5 + }' +``` + +The response includes segmentation masks as base64-encoded PNGs in the `mask` field of each detection. + ## Examples ### Basic Object Detection @@ -180,6 +257,7 @@ local-ai run --debug rfdetr-base LocalAI includes a dedicated **object-detection** category for models and backends that specialize in identifying and locating objects within images. 
This category currently includes: - **RF-DETR**: Real-time transformer-based object detection +- **sam3-cpp**: SAM 3/2/EdgeTAM image segmentation Additional object detection models and backends will be added to this category in the future. You can filter models by the `object-detection` tag in the model gallery to find all available object detection models. diff --git a/gallery/index.yaml b/gallery/index.yaml index e01b26288..c76695a5f 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -3134,6 +3134,37 @@ model: rfdetr-base known_usecases: - detection +- &sam3cpp + name: "edgetam" + url: "github:mudler/LocalAI/gallery/virtual.yaml@master" + size: "16MB" + license: apache-2.0 + description: | + EdgeTAM is an ultra-efficient variant of the Segment Anything Model (SAM) for image segmentation. + It uses a RepViT backbone and is only ~16MB quantized (Q4_0), making it ideal for edge deployment. + Supports point-prompted and box-prompted image segmentation via the /v1/detection endpoint. + Powered by sam3.cpp (C/C++ with GGML). + tags: + - image-segmentation + - object-detection + - sam3 + - edgetam + - cpu + - gpu + urls: + - https://github.com/PABannier/sam3.cpp + - https://huggingface.co/PABannier/sam3.cpp + overrides: + backend: sam3-cpp + parameters: + model: edgetam_q4_0.ggml + threads: 4 + known_usecases: + - detection + files: + - filename: edgetam_q4_0.ggml + sha256: a8a35e35fb9a1b6f099c3f35e3024548b0fc979c2a4184642562804192496e09 + uri: huggingface://PABannier/sam3.cpp/edgetam_q4_0.ggml - name: "dream-org_dream-v0-instruct-7b" # chatml url: "github:mudler/LocalAI/gallery/chatml.yaml@master"