WIP

2026-05-24 08:38:02 -04:00 · 2025-07-23 21:18:47 +02:00
98 changed files with 2244 additions and 1635 deletions
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -381,12 +381,24 @@ jobs:
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          # sycl builds
-          - build-type: 'intel'
+          - build-type: 'sycl_f32'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-intel-rerankers'
+            tag-suffix: '-gpu-intel-sycl-f32-rerankers'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "rerankers"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'sycl_f16'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-intel-sycl-f16-rerankers'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
@@ -417,36 +429,60 @@ jobs:
            backend: "llama-cpp"
            dockerfile: "./backend/Dockerfile.llama-cpp"
            context: "./"
-          - build-type: 'intel'
+          - build-type: 'sycl_f32'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-intel-vllm'
+            tag-suffix: '-gpu-intel-sycl-f32-vllm'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "vllm"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
-          - build-type: 'intel'
+          - build-type: 'sycl_f16'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-intel-transformers'
+            tag-suffix: '-gpu-intel-sycl-f16-vllm'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "vllm"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'sycl_f32'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-intel-sycl-f32-transformers'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "transformers"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
-          - build-type: 'intel'
+          - build-type: 'sycl_f16'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-intel-diffusers'
+            tag-suffix: '-gpu-intel-sycl-f16-transformers'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "transformers"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'sycl_f32'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-intel-sycl-f32-diffusers'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
@@ -454,48 +490,96 @@ jobs:
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          # SYCL additional backends
-          - build-type: 'intel'
+          - build-type: 'sycl_f32'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-intel-kokoro'
+            tag-suffix: '-gpu-intel-sycl-f32-kokoro'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "kokoro"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
-          - build-type: 'intel'
+          - build-type: 'sycl_f16'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-intel-faster-whisper'
+            tag-suffix: '-gpu-intel-sycl-f16-kokoro'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "kokoro"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'sycl_f32'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-intel-sycl-f32-faster-whisper'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "faster-whisper"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
-          - build-type: 'intel'
+          - build-type: 'sycl_f16'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-intel-coqui'
+            tag-suffix: '-gpu-intel-sycl-f16-faster-whisper'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "faster-whisper"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'sycl_f32'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-intel-sycl-f32-coqui'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "coqui"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
-          - build-type: 'intel'
+          - build-type: 'sycl_f16'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-intel-bark'
+            tag-suffix: '-gpu-intel-sycl-f16-coqui'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "coqui"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'sycl_f32'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-intel-sycl-f32-bark'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "bark"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'sycl_f16'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-intel-sycl-f16-bark'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
@@ -513,7 +597,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "piper"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          # bark-cpp
          - build-type: ''
@@ -526,7 +610,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "bark-cpp"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: ''
            cuda-major-version: ""
@@ -575,7 +659,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "stablediffusion-ggml"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "12"
@@ -587,7 +671,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "stablediffusion-ggml"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "11"
@@ -599,7 +683,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "stablediffusion-ggml"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'sycl_f32'
            cuda-major-version: ""
@@ -611,7 +695,7 @@ jobs:
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "stablediffusion-ggml"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'sycl_f16'
            cuda-major-version: ""
@@ -623,7 +707,7 @@ jobs:
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "stablediffusion-ggml"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'vulkan'
            cuda-major-version: ""
@@ -635,7 +719,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "stablediffusion-ggml"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "12"
@@ -647,7 +731,7 @@ jobs:
            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
            runs-on: 'ubuntu-24.04-arm'
            backend: "stablediffusion-ggml"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          # whisper
          - build-type: ''
@@ -660,7 +744,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "whisper"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "12"
@@ -672,7 +756,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "whisper"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "11"
@@ -684,7 +768,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "whisper"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'sycl_f32'
            cuda-major-version: ""
@@ -696,7 +780,7 @@ jobs:
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "whisper"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'sycl_f16'
            cuda-major-version: ""
@@ -708,7 +792,7 @@ jobs:
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "whisper"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'vulkan'
            cuda-major-version: ""
@@ -720,7 +804,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "whisper"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'cublas'
            cuda-major-version: "12"
@@ -732,7 +816,7 @@ jobs:
            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
            runs-on: 'ubuntu-24.04-arm'
            backend: "whisper"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          - build-type: 'hipblas'
            cuda-major-version: ""
@@ -744,7 +828,7 @@ jobs:
            runs-on: 'ubuntu-latest'
            skip-drivers: 'false'
            backend: "whisper"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          # silero-vad
          - build-type: ''
@@ -757,7 +841,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "silero-vad"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          # local-store
          - build-type: ''
@@ -770,7 +854,7 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "local-store"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
            context: "./"
          # huggingface
          - build-type: ''
@@ -783,143 +867,8 @@ jobs:
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "huggingface"
-            dockerfile: "./backend/Dockerfile.golang"
+            dockerfile: "./backend/Dockerfile.go"
-            context: "./"
+            context: "./"  
          # rfdetr
          - build-type: ''
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64,linux/arm64'
            tag-latest: 'auto'
            tag-suffix: '-cpu-rfdetr'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "rfdetr"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-nvidia-cuda-12-rfdetr'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "rfdetr"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-nvidia-cuda-11-rfdetr'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "rfdetr"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'intel'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-intel-rfdetr'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "rfdetr"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/arm64'
            skip-drivers: 'true'
            tag-latest: 'auto'
            tag-suffix: '-nvidia-l4t-arm64-rfdetr'
            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
            runs-on: 'ubuntu-24.04-arm'
            backend: "rfdetr"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          # exllama2
          - build-type: ''
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-cpu-exllama2'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "exllama2"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-nvidia-cuda-12-exllama2'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "exllama2"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-nvidia-cuda-11-exllama2'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            backend: "exllama2"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'intel'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-gpu-intel-exllama2'
            runs-on: 'ubuntu-latest'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            skip-drivers: 'false'
            backend: "exllama2"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          - build-type: 'hipblas'
            cuda-major-version: ""
            cuda-minor-version: ""
            platforms: 'linux/amd64'
            skip-drivers: 'true'
            tag-latest: 'auto'
            tag-suffix: '-gpu-hipblas-exllama2'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            runs-on: 'ubuntu-latest'
            backend: "exllama2"
            dockerfile: "./backend/Dockerfile.python"
            context: "./backend"
          # runs out of space on the runner
          # - build-type: 'hipblas'
          #   cuda-major-version: ""
          #   cuda-minor-version: ""
          #   platforms: 'linux/amd64'
          #   tag-latest: 'auto'
          #   tag-suffix: '-gpu-hipblas-rfdetr'
          #   base-image: "rocm/dev-ubuntu-22.04:6.1"
          #   runs-on: 'ubuntu-latest'
          #   skip-drivers: 'false'
          #   backend: "rfdetr"
          #   dockerfile: "./backend/Dockerfile.python"
          #   context: "./backend"
  llama-cpp-darwin:
    runs-on: macOS-14
    strategy:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -21,7 +21,7 @@ jobs:
            variable: "BARKCPP_VERSION"
            branch: "main"
            file: "Makefile"
-          - repository: "leejet/stable-diffusion.cpp"
+          - repository: "richiejp/stable-diffusion.cpp"
            variable: "STABLEDIFFUSION_GGML_VERSION"
            branch: "master"
            file: "backend/go/stablediffusion-ggml/Makefile"
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -39,7 +39,7 @@ jobs:
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-gpu-nvidia-cuda-12'
+            tag-suffix: '-gpu-nvidia-cuda12'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -83,7 +83,7 @@ jobs:
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-nvidia-cuda-11'
+            tag-suffix: '-gpu-nvidia-cuda11'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
@@ -94,7 +94,7 @@ jobs:
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-nvidia-cuda-12'
+            tag-suffix: '-gpu-nvidia-cuda12'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
@@ -103,7 +103,7 @@ jobs:
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-gpu-vulkan'
+            tag-suffix: '-vulkan'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,6 @@ prepare-sources
 /backends
 /backend-images
 /result.yaml
 protoc
 *.log
--- a/6
+++ b/6
@@ -72,12 +72,6 @@ RUN <<EOT bash
    fi
 EOT
 RUN <<EOT bash
    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
        echo "nvidia-l4t" > /run/localai/capability
    fi
 EOT
 # If we are building with clblas support, we need the libraries for the builds
 RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
        apt-get update && \
--- a/141
+++ b/141
@@ -145,7 +145,7 @@ backends/stablediffusion-ggml: docker-build-stablediffusion-ggml docker-save-sta
 backends/whisper: docker-build-whisper docker-save-whisper build
 	./local-ai backends install "ocifile://$(abspath ./backend-images/whisper.tar)"
-
+	
 backends/silero-vad: docker-build-silero-vad docker-save-silero-vad build
 	./local-ai backends install "ocifile://$(abspath ./backend-images/silero-vad.tar)"
@@ -155,9 +155,6 @@ backends/local-store: docker-build-local-store docker-save-local-store build
 backends/huggingface: docker-build-huggingface docker-save-huggingface build
 	./local-ai backends install "ocifile://$(abspath ./backend-images/huggingface.tar)"
 backends/rfdetr: docker-build-rfdetr docker-save-rfdetr build
 	./local-ai backends install "ocifile://$(abspath ./backend-images/rfdetr.tar)"
 ########################################################
 ## AIO tests
 ########################################################
@@ -245,7 +242,10 @@ help: ## Show this help.
 ########################################################
 .PHONY: protogen
-protogen: protogen-go
+protogen: protogen-go protogen-python
 .PHONY: protogen-clean
 protogen-clean: protogen-go-clean protogen-python-clean
 protoc:
 	@OS_NAME=$$(uname -s | tr '[:upper:]' '[:lower:]'); \
@@ -290,6 +290,93 @@ protogen-go-clean:
 	$(RM) pkg/grpc/proto/backend.pb.go pkg/grpc/proto/backend_grpc.pb.go
 	$(RM) bin/*
 .PHONY: protogen-python
 protogen-python: bark-protogen coqui-protogen chatterbox-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
 .PHONY: protogen-python-clean
 protogen-python-clean: bark-protogen-clean coqui-protogen-clean chatterbox-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
 .PHONY: bark-protogen
 bark-protogen:
 	$(MAKE) -C backend/python/bark protogen
 .PHONY: bark-protogen-clean
 bark-protogen-clean:
 	$(MAKE) -C backend/python/bark protogen-clean
 .PHONY: coqui-protogen
 coqui-protogen:
 	$(MAKE) -C backend/python/coqui protogen
 .PHONY: coqui-protogen-clean
 coqui-protogen-clean:
 	$(MAKE) -C backend/python/coqui protogen-clean
 .PHONY: diffusers-protogen
 diffusers-protogen:
 	$(MAKE) -C backend/python/diffusers protogen
 .PHONY: chatterbox-protogen
 chatterbox-protogen:
 	$(MAKE) -C backend/python/chatterbox protogen
 .PHONY: diffusers-protogen-clean
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 .PHONY: chatterbox-protogen-clean
 chatterbox-protogen-clean:
 	$(MAKE) -C backend/python/chatterbox protogen-clean
 .PHONY: faster-whisper-protogen
 faster-whisper-protogen:
 	$(MAKE) -C backend/python/faster-whisper protogen
 .PHONY: faster-whisper-protogen-clean
 faster-whisper-protogen-clean:
 	$(MAKE) -C backend/python/faster-whisper protogen-clean
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
 .PHONY: exllama2-protogen-clean
 exllama2-protogen-clean:
 	$(MAKE) -C backend/python/exllama2 protogen-clean
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
 .PHONY: rerankers-protogen-clean
 rerankers-protogen-clean:
 	$(MAKE) -C backend/python/rerankers protogen-clean
 .PHONY: transformers-protogen
 transformers-protogen:
 	$(MAKE) -C backend/python/transformers protogen
 .PHONY: transformers-protogen-clean
 transformers-protogen-clean:
 	$(MAKE) -C backend/python/transformers protogen-clean
 .PHONY: kokoro-protogen
 kokoro-protogen:
 	$(MAKE) -C backend/python/kokoro protogen
 .PHONY: kokoro-protogen-clean
 kokoro-protogen-clean:
 	$(MAKE) -C backend/python/kokoro protogen-clean
 .PHONY: vllm-protogen
 vllm-protogen:
 	$(MAKE) -C backend/python/vllm protogen
 .PHONY: vllm-protogen-clean
 vllm-protogen-clean:
 	$(MAKE) -C backend/python/vllm protogen-clean
 prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/diffusers
@@ -325,7 +412,7 @@ docker-cuda11:
 		--build-arg GO_TAGS="$(GO_TAGS)" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
 		--build-arg BUILD_TYPE=$(BUILD_TYPE) \
-		-t $(DOCKER_IMAGE)-cuda-11 .
+		-t $(DOCKER_IMAGE)-cuda11 .
 docker-aio:
 	@echo "Building AIO image with base $(BASE_IMAGE) as $(DOCKER_AIO_IMAGE)"
@@ -362,25 +449,19 @@ backend-images:
 	mkdir -p backend-images
 docker-build-llama-cpp:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:llama-cpp -f backend/Dockerfile.llama-cpp .
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg IMAGE_BASE=$(IMAGE_BASE) -t local-ai-backend:llama-cpp -f backend/Dockerfile.llama-cpp .
 docker-build-bark-cpp:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:bark-cpp -f backend/Dockerfile.golang --build-arg BACKEND=bark-cpp .
+	docker build -t local-ai-backend:bark-cpp -f backend/Dockerfile.go --build-arg BACKEND=bark-cpp .
 docker-build-piper:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:piper -f backend/Dockerfile.golang --build-arg BACKEND=piper .
+	docker build -t local-ai-backend:piper -f backend/Dockerfile.go --build-arg BACKEND=piper .
 docker-build-local-store:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:local-store -f backend/Dockerfile.golang --build-arg BACKEND=local-store .
+	docker build -t local-ai-backend:local-store -f backend/Dockerfile.go --build-arg BACKEND=local-store .
 docker-build-huggingface:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:huggingface -f backend/Dockerfile.golang --build-arg BACKEND=huggingface .
+	docker build -t local-ai-backend:huggingface -f backend/Dockerfile.go --build-arg BACKEND=huggingface .
 docker-build-rfdetr:
 	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:rfdetr -f backend/Dockerfile.python --build-arg BACKEND=rfdetr ./backend
 docker-save-rfdetr: backend-images
 	docker save local-ai-backend:rfdetr -o backend-images/rfdetr.tar
 docker-save-huggingface: backend-images
 	docker save local-ai-backend:huggingface -o backend-images/huggingface.tar
@@ -389,7 +470,7 @@ docker-save-local-store: backend-images
 	docker save local-ai-backend:local-store -o backend-images/local-store.tar
 docker-build-silero-vad:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:silero-vad -f backend/Dockerfile.golang --build-arg BACKEND=silero-vad .
+	docker build -t local-ai-backend:silero-vad -f backend/Dockerfile.go --build-arg BACKEND=silero-vad .
 docker-save-silero-vad: backend-images
 	docker save local-ai-backend:silero-vad -o backend-images/silero-vad.tar
@@ -404,46 +485,46 @@ docker-save-bark-cpp: backend-images
 	docker save local-ai-backend:bark-cpp -o backend-images/bark-cpp.tar
 docker-build-stablediffusion-ggml:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:stablediffusion-ggml -f backend/Dockerfile.golang --build-arg BACKEND=stablediffusion-ggml .
+	docker build -t local-ai-backend:stablediffusion-ggml -f backend/Dockerfile.go --build-arg BACKEND=stablediffusion-ggml .
 docker-save-stablediffusion-ggml: backend-images
 	docker save local-ai-backend:stablediffusion-ggml -o backend-images/stablediffusion-ggml.tar
 docker-build-rerankers:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:rerankers -f backend/Dockerfile.python --build-arg BACKEND=rerankers .
+	docker build -t local-ai-backend:rerankers -f backend/Dockerfile.python --build-arg BACKEND=rerankers .
 docker-build-vllm:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:vllm -f backend/Dockerfile.python --build-arg BACKEND=vllm .
+	docker build -t local-ai-backend:vllm -f backend/Dockerfile.python --build-arg BACKEND=vllm .
 docker-build-transformers:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:transformers -f backend/Dockerfile.python --build-arg BACKEND=transformers .
+	docker build -t local-ai-backend:transformers -f backend/Dockerfile.python --build-arg BACKEND=transformers .
 docker-build-diffusers:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:diffusers -f backend/Dockerfile.python --build-arg BACKEND=diffusers .
+	docker build -t local-ai-backend:diffusers -f backend/Dockerfile.python --build-arg BACKEND=diffusers .
 docker-build-kokoro:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro .
+	docker build -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro .
 docker-build-whisper:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:whisper -f backend/Dockerfile.golang --build-arg BACKEND=whisper  .
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:whisper -f backend/Dockerfile.go --build-arg BACKEND=whisper  .
 docker-save-whisper: backend-images
 	docker save local-ai-backend:whisper -o backend-images/whisper.tar
 docker-build-faster-whisper:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:faster-whisper -f backend/Dockerfile.python --build-arg BACKEND=faster-whisper .
+	docker build -t local-ai-backend:faster-whisper -f backend/Dockerfile.python --build-arg BACKEND=faster-whisper .
 docker-build-coqui:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:coqui -f backend/Dockerfile.python --build-arg BACKEND=coqui .
+	docker build -t local-ai-backend:coqui -f backend/Dockerfile.python --build-arg BACKEND=coqui .
 docker-build-bark:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:bark -f backend/Dockerfile.python --build-arg BACKEND=bark .
+	docker build -t local-ai-backend:bark -f backend/Dockerfile.python --build-arg BACKEND=bark .
 docker-build-chatterbox:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:chatterbox -f backend/Dockerfile.python --build-arg BACKEND=chatterbox .
+	docker build -t local-ai-backend:chatterbox -f backend/Dockerfile.python --build-arg BACKEND=chatterbox .
 docker-build-exllama2:
-	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:exllama2 -f backend/Dockerfile.python --build-arg BACKEND=exllama2 .
+	docker build -t local-ai-backend:exllama2 -f backend/Dockerfile.python --build-arg BACKEND=exllama2 .
 docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-exllama2
--- a/README.md
+++ b/README.md
@@ -189,14 +189,10 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```
 > ⚡ **Automatic Backend Detection**: When you install models from the gallery or YAML files, LocalAI automatically detects your system's GPU capabilities (NVIDIA, AMD, Intel) and downloads the appropriate backend. For advanced configuration options, see [GPU Acceleration](https://localai.io/features/gpu-acceleration/#automatic-backend-detection).
 For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
 ## 📰 Latest project news
 - July/August 2025: 🔍 [Object Detection](https://localai.io/features/object-detection/) added to the API featuring [rf-detr](https://github.com/roboflow/rf-detr)
 - July 2025: All backends migrated outside of the main binary. LocalAI is now more lightweight, small, and automatically downloads the required backend to run the model. [Read the release notes](https://github.com/mudler/LocalAI/releases/tag/v3.2.0)
 - June 2025: [Backend management](https://github.com/mudler/LocalAI/pull/5607) has been added. Attention: extras images are going to be deprecated from the next release! Read [the backend management PR](https://github.com/mudler/LocalAI/pull/5607).
 - May 2025: [Audio input](https://github.com/mudler/LocalAI/pull/5466) and [Reranking](https://github.com/mudler/LocalAI/pull/5396) in the llama.cpp backend, [Realtime API](https://github.com/mudler/LocalAI/pull/5392), and support for Gemma, SmolVLM, and more multimodal models (available in the gallery).
 - May 2025: Important: image name changes [See release](https://github.com/mudler/LocalAI/releases/tag/v2.29.0)
@@ -229,7 +225,6 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
 - 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 🔍 [Object Detection](https://localai.io/features/object-detection/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
 - [Agentic capabilities](https://github.com/mudler/LocalAGI)
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
--- a/backend/Dockerfile.llama-cpp
+++ b/backend/Dockerfile.llama-cpp
@@ -11,6 +11,7 @@ ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG GRPC_VERSION=v1.65.0
 ARG CMAKE_FROM_SOURCE=false
 ARG CMAKE_VERSION=3.26.4
 ARG PROTOBUF_VERSION=v21.12
 ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
@@ -49,6 +50,14 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
    make install && \
    rm -rf /build
 RUN git clone --recurse-submodules --branch ${PROTOBUF_VERSION} https://github.com/protocolbuffers/protobuf.git && \
    mkdir -p /build/protobuf/build && \
    cd /build/protobuf/build && \
    cmake -Dprotobuf_BUILD_SHARED_LIBS=ON -Dprotobuf_BUILD_TESTS=OFF .. && \
    make && \
    make install && \
    rm -rf /build
 FROM ${BASE_IMAGE} AS builder
 ARG BACKEND=rerankers
 ARG BUILD_TYPE
@@ -180,21 +189,9 @@ COPY --from=grpc /opt/grpc /usr/local
 COPY . /LocalAI
-## Otherwise just run the normal build
+RUN make -C /LocalAI/backend/cpp/llama-cpp llama-cpp
-RUN <<EOT bash
+RUN make -C /LocalAI/backend/cpp/llama-cpp llama-cpp-grpc
-if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
+RUN make -C /LocalAI/backend/cpp/llama-cpp llama-cpp-rpc-server
        cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-fallback && \
        make llama-cpp-grpc && make llama-cpp-rpc-server; \
    else \
        cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-avx && \
        make llama-cpp-avx2 && \
        make llama-cpp-avx512 && \
        make llama-cpp-fallback && \
        make llama-cpp-grpc && \
        make llama-cpp-rpc-server; \
    fi  
 EOT
 # Copy libraries using a script to handle architecture differences
 RUN make -C /LocalAI/backend/cpp/llama-cpp package
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -20,7 +20,6 @@ service Backend {
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
  rpc Detect(DetectOptions) returns (DetectResponse) {}
  rpc StoresSet(StoresSetOptions) returns (Result) {}
  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
@@ -377,20 +376,3 @@ message Message {
  string role = 1;
  string content = 2;
 }
 message DetectOptions {
  string src = 1;
 }
 message Detection {
  float x = 1;
  float y = 2;
  float width = 3;
  float height = 4;
  float confidence = 5;
  string class_name = 6;
 }
 message DetectResponse {
  repeated Detection Detections = 1;
 }
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -17,6 +17,8 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
    include_directories("${HOMEBREW_DEFAULT_PREFIX}/include")
 endif()
 set(Protobuf_USE_STATIC_LIBS OFF)
 set(gRPC_USE_STATIC_LIBS OFF)
 find_package(absl CONFIG REQUIRED)
 find_package(Protobuf CONFIG REQUIRED)
 find_package(gRPC CONFIG REQUIRED)
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@
-LLAMA_VERSION?=bf78f5439ee8e82e367674043303ebf8e92b4805
+LLAMA_VERSION?=acd6cb1c41676f6bbb25c2a76fa5abeb1719301e
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 CMAKE_ARGS?=
@@ -7,10 +7,9 @@ BUILD_TYPE?=
 NATIVE?=false
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server
 JOBS?=$(shell nproc)
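 # Build shared libs: the ggml dynamic-backend options set below (GGML_BACKEND_DL, GGML_CPU_ALL_VARIANTS) need BUILD_SHARED_LIBS=ON, so gRPC/protobuf must be linked as shared libraries too (we can't mix shared and static)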
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=ON -DLLAMA_CURL=OFF -DGGML_CPU_ALL_VARIANTS=ON -DGGML_BACKEND_DL=ON
 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 ifeq ($(NATIVE),false)
@@ -90,33 +89,12 @@ else
 	LLAMA_VERSION=$(LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
 endif
-llama-cpp-avx2: llama.cpp
+llama-cpp: llama.cpp
-	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-build
-	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-build purge
-	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
+	$(info ${GREEN}I llama-cpp build info:${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS)" $(MAKE) VARIANT="llama-cpp-build" build-llama-cpp-grpc-server
-	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-build/grpc-server llama-cpp
 llama-cpp-avx512: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
 	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
 	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
 llama-cpp-avx: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
 	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
 	$(info ${GREEN}I llama-cpp build info:avx${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
 llama-cpp-fallback: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
 	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
 	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
 llama-cpp-grpc: llama.cpp
 	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
@@ -161,8 +139,8 @@ grpc-server: llama.cpp llama.cpp/tools/grpc-server
 	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
-	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)"
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
 else
-	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)
+	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
 endif
 	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -6,34 +6,9 @@ CURDIR=$(dirname "$(realpath $0)")
 cd /
-echo "CPU info:"
+BINARY=llama-cpp
 grep -e "model\sname" /proc/cpuinfo | head -1
 grep -e "flags" /proc/cpuinfo | head -1
 BINARY=llama-cpp-fallback
 if grep -q -e "\savx\s" /proc/cpuinfo ; then
 	echo "CPU:    AVX    found OK"
 	if [ -e $CURDIR/llama-cpp-avx ]; then
 		BINARY=llama-cpp-avx
 	fi
 fi
 if grep -q -e "\savx2\s" /proc/cpuinfo ; then
 	echo "CPU:    AVX2   found OK"
 	if [ -e $CURDIR/llama-cpp-avx2 ]; then
 		BINARY=llama-cpp-avx2
 	fi
 fi
 # Check avx 512
 if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
 	echo "CPU:    AVX512F found OK"
 	if [ -e $CURDIR/llama-cpp-avx512 ]; then
 		BINARY=llama-cpp-avx512
 	fi
 fi
 ## P2P/GRPC mode
 if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
 	if [ -e $CURDIR/llama-cpp-grpc ]; then
 		BINARY=llama-cpp-grpc
@@ -56,6 +31,3 @@ fi
 echo "Using binary: $BINARY"
 exec $CURDIR/$BINARY "$@"
 # In case we fail execing, just run fallback
 exec $CURDIR/llama-cpp-fallback "$@"
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -18,8 +18,8 @@ GO_TAGS?=
 LD_FLAGS?=
 # stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=eed97a5e1d054f9c1e7ac01982ae480411d4157e
+STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
@@ -91,18 +91,23 @@ endif
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
 GGML_ARCHIVE_DIR := build/ggml/src/
 ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
 ALL_OBJS := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.o')
 # Name of the single merged library
 COMBINED_LIB := libggmlall.a
-# Instead of using the archives generated by GGML, use the object files directly to avoid overwriting objects with the same base name
+# Rule to merge all the .a files into one
 $(COMBINED_LIB): $(ALL_ARCHIVES)
-	@echo "Merging all .o into $(COMBINED_LIB): $(ALL_OBJS)"
+	@echo "Merging all .a into $(COMBINED_LIB)"
 	rm -f $@
-	ar -qc $@ $(ALL_OBJS)
+	mkdir -p merge-tmp
 	for a in $(ALL_ARCHIVES); do \
 		( cd merge-tmp && ar x ../$$a ); \
 	done
 	( cd merge-tmp && ar rcs ../$@ *.o )
 	# Ensure we have a proper index
 	ranlib $@
 	# Clean up
 	rm -rf merge-tmp
 build/libstable-diffusion.a:
 	@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
--- a/backend/go/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/stablediffusion-ggml/gosd.cpp
@@ -53,43 +53,9 @@ sd_ctx_t* sd_c;
 sample_method_t sample_method;
 // Copied from the upstream CLI
 void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
    //SDParams* params = (SDParams*)data;
    const char* level_str;
    if (!log /*|| (!params->verbose && level <= SD_LOG_DEBUG)*/) {
        return;
    }
    switch (level) {
        case SD_LOG_DEBUG:
            level_str = "DEBUG";
            break;
        case SD_LOG_INFO:
            level_str = "INFO";
            break;
        case SD_LOG_WARN:
            level_str = "WARN";
            break;
        case SD_LOG_ERROR:
            level_str = "ERROR";
            break;
        default: /* Potential future-proofing */
            level_str = "?????";
            break;
    }
    fprintf(stderr, "[%-5s] ", level_str);
    fputs(log, stderr);
    fflush(stderr);
 }
 int load_model(char *model, char* options[], int threads, int diff) {
    fprintf (stderr, "Loading model!\n");
    sd_set_log_callback(sd_log_cb, NULL);
    char *stableDiffusionModel = "";
    if (diff == 1 ) {
        stableDiffusionModel = model;
@@ -104,8 +70,6 @@ int load_model(char *model, char* options[], int threads, int diff) {
    char *scheduler = "";
    char *sampler = "";
    fprintf(stderr, "parsing options\n");
    // If options is not NULL, parse options
    for (int i = 0; options[i] != NULL; i++) {
        char *optname = strtok(options[i], ":");
@@ -134,13 +98,10 @@ int load_model(char *model, char* options[], int threads, int diff) {
        }
    }
    fprintf(stderr, "parsed options\n");
    int sample_method_found = -1;
-    for (int m = 0; m < SAMPLE_METHOD_COUNT; m++) {
+    for (int m = 0; m < N_SAMPLE_METHODS; m++) {
        if (!strcmp(sampler, sample_method_str[m])) {
            sample_method_found = m;
            fprintf(stderr, "Found sampler: %s\n", sampler);
        }
    }
    if (sample_method_found == -1) {
@@ -150,7 +111,7 @@ int load_model(char *model, char* options[], int threads, int diff) {
    sample_method = (sample_method_t)sample_method_found;
    int schedule_found            = -1;
-    for (int d = 0; d < SCHEDULE_COUNT; d++) {
+    for (int d = 0; d < N_SCHEDULES; d++) {
        if (!strcmp(scheduler, schedule_str[d])) {
            schedule_found = d;
                fprintf (stderr, "Found scheduler: %s\n", scheduler);
@@ -164,28 +125,30 @@ int load_model(char *model, char* options[], int threads, int diff) {
    }
    schedule_t schedule = (schedule_t)schedule_found;
-
+    
    fprintf (stderr, "Creating context\n");
-    sd_ctx_params_t ctx_params;
+    sd_ctx_t* sd_ctx = new_sd_ctx(model,
-    sd_ctx_params_init(&ctx_params);
+                                  clip_l_path,
-    ctx_params.model_path = model;
+                                  clip_g_path,
-    ctx_params.clip_l_path = clip_l_path;
+                                  t5xxl_path,
-    ctx_params.clip_g_path = clip_g_path;
+                                  stableDiffusionModel,
-    ctx_params.t5xxl_path = t5xxl_path;
+                                  vae_path,
-    ctx_params.diffusion_model_path = stableDiffusionModel;
+                                  "",
-    ctx_params.vae_path = vae_path;
+                                  "",
-    ctx_params.taesd_path = "";
+                                  "",
-    ctx_params.control_net_path = "";
+                                  "",
-    ctx_params.lora_model_dir = "";
+                                  "",
-    ctx_params.embedding_dir = "";
+                                  false,
-    ctx_params.stacked_id_embed_dir = "";
+                                  false,
-    ctx_params.vae_decode_only = false;
+                                  false,
-    ctx_params.vae_tiling = false;
+                                  threads,
-    ctx_params.free_params_immediately = false;
+                                  SD_TYPE_COUNT,
-    ctx_params.n_threads = threads;
+                                  STD_DEFAULT_RNG,
-    ctx_params.rng_type = STD_DEFAULT_RNG;
+                                  schedule,
-    ctx_params.schedule = schedule;
+                                  false,
-    sd_ctx_t* sd_ctx = new_sd_ctx(&ctx_params);
+                                  false,
                                  false,
                                  false);
    if (sd_ctx == NULL) {
        fprintf (stderr, "failed loading model (generic error)\n");
@@ -206,22 +169,29 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
    fprintf (stderr, "Generating image\n");
-    sd_img_gen_params_t p;
+    results = txt2img(sd_c,
-    sd_img_gen_params_init(&p);
+                            text,
-
+                            negativeText,
-    p.prompt = text;
+                            -1, //clip_skip
-    p.negative_prompt = negativeText;
+                            cfg_scale, // sfg_scale
-    p.guidance.txt_cfg = cfg_scale;
+                            3.5f,
-    p.guidance.slg.layers = skip_layers.data();
+			    0, // eta
-    p.guidance.slg.layer_count = skip_layers.size();
+                            width,
-    p.width = width;
+                            height,
-    p.height = height;
+                            sample_method, 
-    p.sample_method = sample_method;
+                            steps,
-    p.sample_steps = steps;
+                            seed,
-    p.seed = seed;
+                            1,
-    p.input_id_images_path = "";
+                            NULL,
-
+                            0.9f,
-    results = generate_image(sd_c, &p);
+                            20.f,
                            false,
                            "",
                            skip_layers.data(),
                            skip_layers.size(),
                            0,
                            0.01,
                            0.2);
    if (results == NULL) {
        fprintf (stderr, "NO results\n");
--- a/backend/go/stablediffusion-ggml/gosd.go
+++ b/backend/go/stablediffusion-ggml/gosd.go
@@ -37,8 +37,8 @@ func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
 	size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
 	length := C.size_t(len(opts.Options))
-	options = (**C.char)(C.malloc((length + 1) * size))
+	options = (**C.char)(C.malloc(length * size))
-	view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0:len(opts.Options) + 1:len(opts.Options) + 1]
+	view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0:len(opts.Options):len(opts.Options)]
 	var diffusionModel int
@@ -66,7 +66,6 @@ func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
 	for i, x := range oo {
 		view[i] = C.CString(x)
 	}
 	view[len(oo)] = nil
 	sd.cfgScale = opts.CFGScale
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -6,7 +6,7 @@ CMAKE_ARGS?=
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=e7bf0294ec9099b5fc21f5ba969805dfb2108cea
+WHISPER_CPP_VERSION?=1f5cf0b2888402d57bb17b2029b2caa97e5f3baf
 export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
 export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -73,28 +73,6 @@
    nvidia-l4t: "nvidia-l4t-arm64-stablediffusion-ggml"
    # metal: "metal-stablediffusion-ggml"
    # darwin-x86: "darwin-x86-stablediffusion-ggml"
 - &rfdetr
  name: "rfdetr"
  alias: "rfdetr"
  license: apache-2.0
  icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4
  description: |
    RF-DETR is a real-time, transformer-based object detection model architecture developed by Roboflow and released under the Apache 2.0 license.
     RF-DETR is the first real-time model to exceed 60 AP on the Microsoft COCO benchmark alongside competitive performance at base sizes. It also achieves state-of-the-art performance on RF100-VL, an object detection benchmark that measures model domain adaptability to real-world problems. RF-DETR is the fastest and most accurate model for its size when compared to current real-time object detection models.
    RF-DETR is small enough to run on the edge using Inference, making it an ideal model for deployments that need both strong accuracy and real-time performance.
  urls:
    - https://github.com/roboflow/rf-detr
  tags:
    - object-detection
    - rfdetr
    - gpu
    - cpu
  capabilities:
    nvidia: "cuda12-rfdetr"
    intel: "intel-rfdetr"
    #amd: "rocm-rfdetr"
    nvidia-l4t: "nvidia-l4t-arm64-rfdetr"
    default: "cpu-rfdetr"
 - &vllm
  name: "vllm"
  license: apache-2.0
@@ -126,13 +104,13 @@
  capabilities:
    nvidia: "cuda12-vllm"
    amd: "rocm-vllm"
-    intel: "intel-vllm"
+    intel: "intel-sycl-f16-vllm"
 - &rerankers
  name: "rerankers"
  alias: "rerankers"
  capabilities:
    nvidia: "cuda12-rerankers"
-    intel: "intel-rerankers"
+    intel: "intel-sycl-f16-rerankers"
    amd: "rocm-rerankers"
 - &transformers
  name: "transformers"
@@ -149,7 +127,7 @@
    - multimodal
  capabilities:
    nvidia: "cuda12-transformers"
-    intel: "intel-transformers"
+    intel: "intel-sycl-f16-transformers"
    amd: "rocm-transformers"
 - &diffusers
  name: "diffusers"
@@ -166,7 +144,7 @@
  alias: "diffusers"
  capabilities:
    nvidia: "cuda12-diffusers"
-    intel: "intel-diffusers"
+    intel: "intel-sycl-f32-diffusers"
    amd: "rocm-diffusers"
 - &exllama2
  name: "exllama2"
@@ -182,7 +160,8 @@
  alias: "exllama2"
  capabilities:
    nvidia: "cuda12-exllama2"
-    intel: "intel-exllama2"
+    intel: "intel-sycl-f32-exllama2"
    amd: "rocm-exllama2"
 - &faster-whisper
  icon: https://avatars.githubusercontent.com/u/1520500?s=200&v=4
  description: |
@@ -197,7 +176,7 @@
  name: "faster-whisper"
  capabilities:
    nvidia: "cuda12-faster-whisper"
-    intel: "intel-faster-whisper"
+    intel: "intel-sycl-f32-faster-whisper"
    amd: "rocm-faster-whisper"
 - &kokoro
  icon: https://avatars.githubusercontent.com/u/166769057?v=4
@@ -215,7 +194,7 @@
  name: "kokoro"
  capabilities:
    nvidia: "cuda12-kokoro"
-    intel: "intel-kokoro"
+    intel: "intel-sycl-f32-kokoro"
    amd: "rocm-kokoro"
 - &coqui
  urls:
@@ -236,7 +215,7 @@
  alias: "coqui"
  capabilities:
    nvidia: "cuda12-coqui"
-    intel: "intel-coqui"
+    intel: "intel-sycl-f32-coqui"
    amd: "rocm-coqui"
  icon: https://avatars.githubusercontent.com/u/1338804?s=200&v=4
 - &bark
@@ -252,7 +231,7 @@
  alias: "bark"
  capabilities:
    cuda: "cuda12-bark"
-    intel: "intel-bark"
+    intel: "intel-sycl-f32-bark"
    rocm: "rocm-bark"
  icon: https://avatars.githubusercontent.com/u/99442120?s=200&v=4
 - &barkcpp
@@ -279,8 +258,6 @@
  icon: https://github.com/PABannier/bark.cpp/raw/main/assets/banner.png
  name: "bark-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-bark-cpp"
  mirrors:
    - localai/localai-backends:latest-bark-cpp
  alias: "bark-cpp"
 - &chatterbox
  urls:
@@ -303,8 +280,6 @@
  urls:
    - https://github.com/rhasspy/piper
    - https://github.com/mudler/go-piper
  mirrors:
    - localai/localai-backends:latest-piper
  license: MIT
  description: |
     A fast, local neural text to speech system
@@ -317,8 +292,6 @@
  icon: https://user-images.githubusercontent.com/12515440/89997349-b3523080-dc94-11ea-9906-ca2e8bc50535.png
  urls:
    - https://github.com/snakers4/silero-vad
  mirrors:
    - localai/localai-backends:latest-cpu-silero-vad
  description: |
    Silero VAD: pre-trained enterprise-grade Voice Activity Detector.
    Silero VAD is a voice activity detection model that can be used to detect whether a given audio contains speech or not.
@@ -330,8 +303,6 @@
 - &local-store
  name: "local-store"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-local-store"
  mirrors:
    - localai/localai-backends:latest-cpu-local-store
  urls:
    - https://github.com/mudler/LocalAI
  description: |
@@ -345,8 +316,6 @@
 - &huggingface
  name: "huggingface"
  uri: "quay.io/go-skynet/local-ai-backends:latest-huggingface"
  mirrors:
    - localai/localai-backends:latest-huggingface
  icon: https://huggingface.co/front/assets/huggingface_logo-noborder.svg
  urls:
    - https://huggingface.co/docs/hub/en/api
@@ -359,721 +328,469 @@
 - !!merge <<: *huggingface
  name: "huggingface-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-huggingface"
  mirrors:
    - localai/localai-backends:master-huggingface
 - !!merge <<: *local-store
  name: "local-store-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-local-store"
  mirrors:
    - localai/localai-backends:master-cpu-local-store
 - !!merge <<: *silero-vad
  name: "silero-vad-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-silero-vad"
  mirrors:
    - localai/localai-backends:master-cpu-silero-vad
 - !!merge <<: *piper
  name: "piper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-piper"
  mirrors:
    - localai/localai-backends:master-piper
 ## llama-cpp
 - !!merge <<: *llamacpp
  name: "darwin-x86-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-darwin-x86-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-darwin-x86-llama-cpp
 - !!merge <<: *llamacpp
  name: "darwin-x86-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-darwin-x86-llama-cpp"
  mirrors:
    - localai/localai-backends:master-darwin-x86-llama-cpp
 - !!merge <<: *llamacpp
  name: "nvidia-l4t-arm64-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-llama-cpp
 - !!merge <<: *llamacpp
  name: "nvidia-l4t-arm64-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-llama-cpp"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-arm64-llama-cpp
 - !!merge <<: *llamacpp
  name: "cpu-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-cpu-llama-cpp
 - !!merge <<: *llamacpp
  name: "cpu-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-llama-cpp"
  mirrors:
    - localai/localai-backends:master-cpu-llama-cpp
 - !!merge <<: *llamacpp
  name: "cuda11-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-llama-cpp
 - !!merge <<: *llamacpp
  name: "cuda12-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-llama-cpp
 - !!merge <<: *llamacpp
  name: "rocm-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-llama-cpp
 - !!merge <<: *llamacpp
  name: "intel-sycl-f32-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-llama-cpp
 - !!merge <<: *llamacpp
  name: "intel-sycl-f16-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f16-llama-cpp
 - !!merge <<: *llamacpp
  name: "vulkan-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-gpu-vulkan-llama-cpp
 - !!merge <<: *llamacpp
  name: "vulkan-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-llama-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-llama-cpp
 - !!merge <<: *llamacpp
  name: "metal-llama-cpp"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-llama-cpp"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-llama-cpp
 - !!merge <<: *llamacpp
  name: "metal-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-llama-cpp"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-llama-cpp
 - !!merge <<: *llamacpp
  name: "cuda11-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-llama-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-llama-cpp
 - !!merge <<: *llamacpp
  name: "cuda12-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-llama-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-llama-cpp
 - !!merge <<: *llamacpp
  name: "rocm-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-llama-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-llama-cpp
 - !!merge <<: *llamacpp
  name: "intel-sycl-f32-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-llama-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-llama-cpp
 - !!merge <<: *llamacpp
  name: "intel-sycl-f16-llama-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-llama-cpp"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f16-llama-cpp
 ## whisper
 - !!merge <<: *whispercpp
  name: "nvidia-l4t-arm64-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-whisper"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-whisper
 - !!merge <<: *whispercpp
  name: "nvidia-l4t-arm64-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-whisper"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-arm64-whisper
 - !!merge <<: *whispercpp
  name: "cpu-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-whisper"
  mirrors:
    - localai/localai-backends:latest-cpu-whisper
 - !!merge <<: *whispercpp
  name: "cpu-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-whisper"
  mirrors:
    - localai/localai-backends:master-cpu-whisper
 - !!merge <<: *whispercpp
  name: "cuda11-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-whisper"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-whisper
 - !!merge <<: *whispercpp
  name: "cuda12-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-whisper"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-whisper
 - !!merge <<: *whispercpp
  name: "rocm-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-whisper"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-whisper
 - !!merge <<: *whispercpp
  name: "intel-sycl-f32-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-whisper"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-whisper
 - !!merge <<: *whispercpp
  name: "intel-sycl-f16-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-whisper"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f16-whisper
 - !!merge <<: *whispercpp
  name: "vulkan-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-whisper"
  mirrors:
    - localai/localai-backends:latest-gpu-vulkan-whisper
 - !!merge <<: *whispercpp
  name: "vulkan-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-whisper"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-whisper
 - !!merge <<: *whispercpp
  name: "metal-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-whisper"
  mirrors:
    - localai/localai-backends:latest-metal-darwin-arm64-whisper
 - !!merge <<: *whispercpp
  name: "metal-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-whisper"
  mirrors:
    - localai/localai-backends:master-metal-darwin-arm64-whisper
 - !!merge <<: *whispercpp
  name: "cuda11-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-whisper"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-whisper
 - !!merge <<: *whispercpp
  name: "cuda12-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-whisper"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-whisper
 - !!merge <<: *whispercpp
  name: "rocm-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-whisper"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-whisper
 - !!merge <<: *whispercpp
  name: "intel-sycl-f32-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-whisper"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-whisper
 - !!merge <<: *whispercpp
  name: "intel-sycl-f16-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-whisper"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f16-whisper
 ## stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:latest-cpu-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cpu-stablediffusion-ggml-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:master-cpu-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "vulkan-stablediffusion-ggml"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-vulkan-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:latest-gpu-vulkan-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "vulkan-stablediffusion-ggml-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-vulkan-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:master-gpu-vulkan-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cuda12-stablediffusion-ggml"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "intel-sycl-f32-stablediffusion-ggml"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-stablediffusion-ggml"
  # Same mirror layout as the sibling stablediffusion-ggml entries.
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "intel-sycl-f16-stablediffusion-ggml"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f16-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cuda11-stablediffusion-ggml"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cuda12-stablediffusion-ggml-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "intel-sycl-f32-stablediffusion-ggml-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "intel-sycl-f16-stablediffusion-ggml-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f16-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "cuda11-stablediffusion-ggml-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "nvidia-l4t-arm64-stablediffusion-ggml-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-arm64-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:master-nvidia-l4t-arm64-stablediffusion-ggml
 - !!merge <<: *stablediffusionggml
  name: "nvidia-l4t-arm64-stablediffusion-ggml"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-stablediffusion-ggml"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-stablediffusion-ggml
 # vllm
 - !!merge <<: *vllm
  name: "vllm-development"
  capabilities:
    nvidia: "cuda12-vllm-development"
    amd: "rocm-vllm-development"
-    intel: "intel-vllm-development"
+    intel: "intel-sycl-f16-vllm-development"
 - !!merge <<: *vllm
  name: "cuda12-vllm"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-vllm
 - !!merge <<: *vllm
  name: "rocm-vllm"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-vllm"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-vllm
 - !!merge <<: *vllm
-  name: "intel-vllm"
+  name: "intel-sycl-f32-vllm"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-vllm"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-vllm"
-  mirrors:
+- !!merge <<: *vllm
-    - localai/localai-backends:latest-gpu-intel-vllm
+  name: "intel-sycl-f16-vllm"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-vllm"
 - !!merge <<: *vllm
  name: "cuda12-vllm-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-vllm"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-vllm
 - !!merge <<: *vllm
  name: "rocm-vllm-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-vllm"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-vllm
 - !!merge <<: *vllm
-  name: "intel-vllm-development"
+  name: "intel-sycl-f32-vllm-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-vllm"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-vllm"
-  mirrors:
+- !!merge <<: *vllm
-    - localai/localai-backends:master-gpu-intel-vllm
+  name: "intel-sycl-f16-vllm-development"
-# rfdetr
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-vllm"
 - !!merge <<: *rfdetr
  name: "rfdetr-development"
  capabilities:
    nvidia: "cuda12-rfdetr-development"
    intel: "intel-rfdetr-development"
    #amd: "rocm-rfdetr-development"
    nvidia-l4t: "nvidia-l4t-arm64-rfdetr-development"
    default: "cpu-rfdetr-development"
 - !!merge <<: *rfdetr
  name: "cuda12-rfdetr"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-rfdetr"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-rfdetr
 - !!merge <<: *rfdetr
  name: "intel-rfdetr"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-rfdetr"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-rfdetr
 # - !!merge <<: *rfdetr
 #   name: "rocm-rfdetr"
 #   uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-hipblas-rfdetr"
 #   mirrors:
 #     - localai/localai-backends:latest-gpu-hipblas-rfdetr
 - !!merge <<: *rfdetr
  name: "nvidia-l4t-arm64-rfdetr"
  uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-arm64-rfdetr"
  mirrors:
    - localai/localai-backends:latest-nvidia-l4t-arm64-rfdetr
 - !!merge <<: *rfdetr
  name: "cpu-rfdetr"
  uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-rfdetr"
  mirrors:
    - localai/localai-backends:latest-cpu-rfdetr
 - !!merge <<: *rfdetr
  name: "cuda12-rfdetr-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-rfdetr"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-rfdetr
 - !!merge <<: *rfdetr
  name: "intel-rfdetr-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-rfdetr"
  mirrors:
    - localai/localai-backends:master-gpu-intel-rfdetr
 # - !!merge <<: *rfdetr
 #   name: "rocm-rfdetr-development"
 #   uri: "quay.io/go-skynet/local-ai-backends:master-gpu-hipblas-rfdetr"
 #   mirrors:
 #     - localai/localai-backends:master-gpu-hipblas-rfdetr
 - !!merge <<: *rfdetr
  name: "cpu-rfdetr-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-cpu-rfdetr"
  mirrors:
    - localai/localai-backends:master-cpu-rfdetr
 - !!merge <<: *rfdetr
  name: "intel-rfdetr"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-rfdetr"
  mirrors:
    - localai/localai-backends:latest-gpu-intel-rfdetr
 ## Rerankers
 - !!merge <<: *rerankers
  name: "rerankers-development"
  capabilities:
    nvidia: "cuda12-rerankers-development"
-    intel: "intel-rerankers-development"
+    intel: "intel-sycl-f16-rerankers-development"
    amd: "rocm-rerankers-development"
 - !!merge <<: *rerankers
  name: "cuda11-rerankers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-rerankers"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-rerankers
 - !!merge <<: *rerankers
  name: "cuda12-rerankers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-rerankers"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-rerankers
 - !!merge <<: *rerankers
-  name: "intel-rerankers"
+  name: "intel-sycl-f32-rerankers"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-rerankers"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-rerankers"
-  mirrors:
+- !!merge <<: *rerankers
-    - localai/localai-backends:latest-gpu-intel-rerankers
+  name: "intel-sycl-f16-rerankers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-rerankers"
 - !!merge <<: *rerankers
  name: "rocm-rerankers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-rerankers"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-rerankers
 - !!merge <<: *rerankers
  name: "cuda11-rerankers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-rerankers"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-rerankers
 - !!merge <<: *rerankers
  name: "cuda12-rerankers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-rerankers"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-rerankers
 - !!merge <<: *rerankers
  name: "rocm-rerankers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-rerankers"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-rerankers
 - !!merge <<: *rerankers
-  name: "intel-rerankers-development"
+  name: "intel-sycl-f32-rerankers-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-rerankers"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-rerankers"
-  mirrors:
+- !!merge <<: *rerankers
-    - localai/localai-backends:master-gpu-intel-rerankers
+  name: "intel-sycl-f16-rerankers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-rerankers"
 ## Transformers
 - !!merge <<: *transformers
  name: "transformers-development"
  capabilities:
    nvidia: "cuda12-transformers-development"
-    intel: "intel-transformers-development"
+    intel: "intel-sycl-f16-transformers-development"
    amd: "rocm-transformers-development"
 - !!merge <<: *transformers
  name: "cuda12-transformers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-transformers"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-transformers
 - !!merge <<: *transformers
  name: "rocm-transformers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-transformers"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-transformers
 - !!merge <<: *transformers
-  name: "intel-transformers"
+  name: "intel-sycl-f32-transformers"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-transformers"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-transformers"
-  mirrors:
+- !!merge <<: *transformers
-    - localai/localai-backends:latest-gpu-intel-transformers
+  name: "intel-sycl-f16-transformers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-transformers"
 - !!merge <<: *transformers
  name: "cuda11-transformers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-transformers"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-transformers
 - !!merge <<: *transformers
  name: "cuda11-transformers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-transformers"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-transformers
 - !!merge <<: *transformers
  name: "cuda12-transformers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-transformers"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-transformers
 - !!merge <<: *transformers
  name: "rocm-transformers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-transformers"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-transformers
 - !!merge <<: *transformers
-  name: "intel-transformers-development"
+  name: "intel-sycl-f32-transformers-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-transformers"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-transformers"
-  mirrors:
+- !!merge <<: *transformers
-    - localai/localai-backends:master-gpu-intel-transformers
+  name: "intel-sycl-f16-transformers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-transformers"
 ## Diffusers
 - !!merge <<: *diffusers
  name: "diffusers-development"
  capabilities:
    nvidia: "cuda12-diffusers-development"
-    intel: "intel-diffusers-development"
+    intel: "intel-sycl-f32-diffusers-development"
    amd: "rocm-diffusers-development"
 - !!merge <<: *diffusers
  name: "cuda12-diffusers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-diffusers"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-diffusers
 - !!merge <<: *diffusers
  name: "rocm-diffusers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-diffusers"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-diffusers
 - !!merge <<: *diffusers
  name: "cuda11-diffusers"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-diffusers"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-diffusers
 - !!merge <<: *diffusers
-  name: "intel-diffusers"
+  name: "intel-sycl-f32-diffusers"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-diffusers"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-diffusers"
  # Mirror tag follows the sycl-f32 image tag used by uri.
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-diffusers
 - !!merge <<: *diffusers
  name: "cuda11-diffusers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-diffusers"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-diffusers
 - !!merge <<: *diffusers
  name: "cuda12-diffusers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-diffusers"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-diffusers
 - !!merge <<: *diffusers
  name: "rocm-diffusers-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-diffusers"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-diffusers
 - !!merge <<: *diffusers
-  name: "intel-diffusers-development"
+  name: "intel-sycl-f32-diffusers-development"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-diffusers"
+  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-diffusers"
  # Mirror tag follows the sycl-f32 image tag used by uri.
  mirrors:
    - localai/localai-backends:master-gpu-intel-sycl-f32-diffusers
  ## exllama2
 - !!merge <<: *exllama2
  name: "exllama2-development"
  capabilities:
    nvidia: "cuda12-exllama2-development"
-    intel: "intel-exllama2-development"
+    intel: "intel-sycl-f32-exllama2-development"
    amd: "rocm-exllama2-development"
 - !!merge <<: *exllama2
  name: "cuda11-exllama2"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-exllama2"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-exllama2
 - !!merge <<: *exllama2
  name: "cuda12-exllama2"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-exllama2"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-exllama2
 - !!merge <<: *exllama2
  name: "cuda11-exllama2-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-exllama2"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-exllama2
 - !!merge <<: *exllama2
  name: "cuda12-exllama2-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-exllama2"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-exllama2
 ## kokoro
 - !!merge <<: *kokoro
  name: "kokoro-development"
  # Each capability value names another gallery entry; the Intel kokoro entries are named "sycl-*".
  capabilities:
    nvidia: "cuda12-kokoro-development"
-    intel: "intel-kokoro-development"
+    intel: "sycl-f32-kokoro-development"
    amd: "rocm-kokoro-development"
 - !!merge <<: *kokoro
  name: "cuda11-kokoro-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-kokoro"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-kokoro
 - !!merge <<: *kokoro
  name: "cuda12-kokoro-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-kokoro"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-kokoro
 - !!merge <<: *kokoro
  name: "rocm-kokoro-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-kokoro"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-kokoro
 - !!merge <<: *kokoro
-  name: "intel-kokoro"
+  name: "sycl-f32-kokoro"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-kokoro"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-kokoro"
  # Mirror tag follows the sycl-f32 image tag used by uri.
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-kokoro
 - !!merge <<: *kokoro
-  name: "intel-kokoro-development"
+  name: "sycl-f16-kokoro"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-kokoro"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-kokoro"
-  mirrors:
+- !!merge <<: *kokoro
-    - localai/localai-backends:master-gpu-intel-kokoro
+  name: "sycl-f16-kokoro-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-kokoro"
 - !!merge <<: *kokoro
  name: "sycl-f32-kokoro-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-kokoro"
 - !!merge <<: *kokoro
  name: "cuda11-kokoro"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-kokoro"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-kokoro
 - !!merge <<: *kokoro
  name: "cuda12-kokoro"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-kokoro"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-kokoro
 - !!merge <<: *kokoro
  name: "rocm-kokoro"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-kokoro"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-kokoro
 ## faster-whisper
 - !!merge <<: *faster-whisper
  name: "faster-whisper-development"
  # Each capability value names another gallery entry; the Intel faster-whisper entries are named "sycl-*".
  capabilities:
    nvidia: "cuda12-faster-whisper-development"
-    intel: "intel-faster-whisper-development"
+    intel: "sycl-f32-faster-whisper-development"
    amd: "rocm-faster-whisper-development"
 - !!merge <<: *faster-whisper
  name: "cuda11-faster-whisper"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-faster-whisper"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-faster-whisper
 - !!merge <<: *faster-whisper
  name: "cuda12-faster-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-faster-whisper"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-faster-whisper
 - !!merge <<: *faster-whisper
  name: "rocm-faster-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-faster-whisper"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-faster-whisper
 - !!merge <<: *faster-whisper
-  name: "intel-faster-whisper"
+  name: "sycl-f32-faster-whisper"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-faster-whisper"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-faster-whisper"
  # Mirror tag follows the sycl-f32 image tag used by uri.
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-faster-whisper
 - !!merge <<: *faster-whisper
-  name: "intel-faster-whisper-development"
+  name: "sycl-f16-faster-whisper"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-faster-whisper"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-faster-whisper"
-  mirrors:
+- !!merge <<: *faster-whisper
-    - localai/localai-backends:master-gpu-intel-faster-whisper
+  name: "sycl-f32-faster-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-faster-whisper"
 - !!merge <<: *faster-whisper
  name: "sycl-f16-faster-whisper-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-faster-whisper"
 ## coqui
 - !!merge <<: *coqui
  name: "coqui-development"
  # Each capability value names another gallery entry; the Intel coqui entries are named "sycl-*".
  capabilities:
    nvidia: "cuda12-coqui-development"
-    intel: "intel-coqui-development"
+    intel: "sycl-f32-coqui-development"
    amd: "rocm-coqui-development"
 - !!merge <<: *coqui
  name: "cuda11-coqui"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-coqui"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-coqui
 - !!merge <<: *coqui
  name: "cuda12-coqui"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-coqui"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-coqui
 - !!merge <<: *coqui
  name: "cuda11-coqui-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-coqui"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-coqui
 - !!merge <<: *coqui
  name: "cuda12-coqui-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-coqui"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-coqui
 - !!merge <<: *coqui
  name: "rocm-coqui-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-coqui"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-coqui
 - !!merge <<: *coqui
-  name: "intel-coqui"
+  name: "sycl-f32-coqui"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-coqui"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-coqui"
  # Mirror tag follows the sycl-f32 image tag used by uri.
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-coqui
 - !!merge <<: *coqui
-  name: "intel-coqui-development"
+  name: "sycl-f16-coqui"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-coqui"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-coqui"
-  mirrors:
+- !!merge <<: *coqui
-    - localai/localai-backends:master-gpu-intel-coqui
+  name: "sycl-f32-coqui-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-coqui"
 - !!merge <<: *coqui
  name: "sycl-f16-coqui-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-coqui"
 - !!merge <<: *coqui
  name: "rocm-coqui"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-coqui"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-coqui
 ## bark
 - !!merge <<: *bark
  name: "bark-development"
  # Each capability value names another gallery entry; the Intel bark entries are named "sycl-*".
  capabilities:
    nvidia: "cuda12-bark-development"
-    intel: "intel-bark-development"
+    intel: "sycl-f32-bark-development"
    amd: "rocm-bark-development"
 - !!merge <<: *bark
  name: "cuda11-bark-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-bark"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-bark
 - !!merge <<: *bark
  name: "cuda11-bark"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-bark"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-bark
 - !!merge <<: *bark
  name: "rocm-bark-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-bark"
  mirrors:
    - localai/localai-backends:master-gpu-rocm-hipblas-bark
 - !!merge <<: *bark
-  name: "intel-bark"
+  name: "sycl-f32-bark"
-  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-bark"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f32-bark"
  # Mirror tag follows the sycl-f32 image tag used by uri.
  mirrors:
    - localai/localai-backends:latest-gpu-intel-sycl-f32-bark
 - !!merge <<: *bark
-  name: "intel-bark-development"
+  name: "sycl-f16-bark"
-  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-bark"
+  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-sycl-f16-bark"
-  mirrors:
+- !!merge <<: *bark
-    - localai/localai-backends:master-gpu-intel-bark
+  name: "sycl-f32-bark-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f32-bark"
 - !!merge <<: *bark
  name: "sycl-f16-bark-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-sycl-f16-bark"
 - !!merge <<: *bark
  name: "cuda12-bark"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-bark"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-bark
 - !!merge <<: *bark
  name: "rocm-bark"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-bark"
  mirrors:
    - localai/localai-backends:latest-gpu-rocm-hipblas-bark
 - !!merge <<: *bark
  name: "cuda12-bark-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-bark"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-bark
 - !!merge <<: *barkcpp
  name: "bark-cpp-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-bark-cpp"
@@ -1086,20 +803,12 @@
 - !!merge <<: *chatterbox
  name: "cuda12-chatterbox-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-chatterbox"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-12-chatterbox
 - !!merge <<: *chatterbox
  name: "cuda11-chatterbox"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-11-chatterbox"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-11-chatterbox
 - !!merge <<: *chatterbox
  name: "cuda11-chatterbox-development"
  uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-11-chatterbox"
  mirrors:
    - localai/localai-backends:master-gpu-nvidia-cuda-11-chatterbox
 - !!merge <<: *chatterbox
  name: "cuda12-chatterbox"
  uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-chatterbox"
  mirrors:
    - localai/localai-backends:latest-gpu-nvidia-cuda-12-chatterbox
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -111,7 +111,7 @@ function ensureVenv() {
 #  - requirements-${BUILD_TYPE}.txt
 #  - requirements-${BUILD_PROFILE}.txt
 #
-# BUILD_PROFILE is a pore specific version of BUILD_TYPE, ex: cuda-11 or cuda-12
+# BUILD_PROFILE is a more specific version of BUILD_TYPE, ex: cuda11 or cuda12
 # it can also include some options that we do not have BUILD_TYPES for, ex: intel
 #
 # NOTE: for BUILD_PROFILE==intel, this function does NOT automatically use the Intel python package index.
--- a/backend/python/common/template/protogen.sh
+++ b/backend/python/common/template/protogen.sh
@@ -8,6 +8,4 @@ else
    source $backend_dir/../common/libbackend.sh
 fi
 ensureVenv
 python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/rfdetr/Makefile
+++ b/backend/python/rfdetr/Makefile
@@ -1,20 +0,0 @@
 # Build helpers for the rfdetr Python backend.
 # Running plain `make` is the same as `make install`.
 .DEFAULT_GOAL := install
 # install: run install.sh, then generate the gRPC Python stubs.
 .PHONY: install
 install:
 	bash install.sh
 	$(MAKE) protogen
 # protogen: ensure both generated stub files exist.
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 # protogen-clean: delete the generated stub files.
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 # Both stub files are produced by a single protogen.sh run.
 backend_pb2_grpc.py backend_pb2.py:
 	bash protogen.sh
 # clean: remove generated stubs plus the virtualenv and Python bytecode cache.
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/rfdetr/backend.py
+++ b/backend/python/rfdetr/backend.py
@@ -1,174 +0,0 @@
 #!/usr/bin/env python3
 """
 gRPC server for RFDETR object detection models.
 """
 from concurrent import futures
 import argparse
 import signal
 import sys
 import os
 import time
 import base64
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 import requests
 import supervision as sv
 from inference import get_model
 from PIL import Image
 from io import BytesIO
 # Length of one sleep interval in the server's keep-alive loop: 24 hours, in seconds.
 _ONE_DAY_IN_SECONDS = 24 * 60 * 60
 # Size of the gRPC thread pool; taken from PYTHON_GRPC_MAX_WORKERS when set, else 1.
 MAX_WORKERS = int(os.getenv('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer for the RFDETR backend service.
    Holds at most one loaded model: LoadModel stores it on the instance and
    Detect runs inference against it.
    """
    def __init__(self):
        # Both stay None until LoadModel succeeds.
        self.model = None
        self.model_name = None
    def Health(self, request, context):
        """
        Report that the backend process is alive.
        Args:
            request: A HealthMessage object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Reply object whose message is the bytes b"OK".
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        Load the RFDETR model named by request.Model into memory.
        Args:
            request: A ModelOptions object that contains the model parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A Result object; success is False (with the error text in message)
            when loading raised an exception.
        """
        model_name = request.Model
        try:
            # Load the RFDETR model
            self.model = get_model(model_name)
            self.model_name = model_name
            print(f'Loaded RFDETR model: {model_name}')
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Failed to load model: {err}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def Detect(self, request, context):
        """
        Run object detection on a base64-encoded image.
        Args:
            request: A DetectOptions object; request.src holds the base64 image data.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A DetectResponse with one Detection per prediction. The response is
            empty when no model is loaded or when decoding/inference fails.
        """
        if self.model is None:
            print("Model is None")
            return backend_pb2.DetectResponse()
        try:
            # Log only the payload size: the payload itself can be as large as
            # the server's message limit and must not be dumped into the logs.
            print(f"Decoding image ({len(request.src)} base64 characters)")
            image_data = base64.b64decode(request.src)
            image = Image.open(BytesIO(image_data))
            # Perform inference; only the first result is used, with a 0.5 confidence threshold
            predictions = self.model.infer(image, confidence=0.5)[0]
            # Convert to proto format
            proto_detections = []
            for pred in predictions.predictions:
                print(f"Prediction: {pred}")
                proto_detections.append(backend_pb2.Detection(
                    x=float(pred.x),
                    y=float(pred.y),
                    width=float(pred.width),
                    height=float(pred.height),
                    confidence=float(pred.confidence),
                    class_name=pred.class_name
                ))
            return backend_pb2.DetectResponse(Detections=proto_detections)
        except Exception as err:
            # Best effort: report the failure in the log and return no detections.
            print(f"Detection error: {err}")
            return backend_pb2.DetectResponse()
    def Status(self, request, context):
        """
        Report whether a model has been loaded.
        Args:
            request: A HealthMessage object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.
        Returns:
            A StatusResponse whose state is READY once a model is loaded and
            UNINITIALIZED otherwise.
        """
        state = backend_pb2.StatusResponse.READY if self.model is not None else backend_pb2.StatusResponse.UNINITIALIZED
        return backend_pb2.StatusResponse(state=state)
 def serve(address):
    """
    Start the RFDETR gRPC server on ``address`` and block until it is stopped.
    SIGINT and SIGTERM stop the server immediately and exit the process.
    """
    message_limit = 50 * 1024 * 1024  # 50MB
    channel_options = [
        ('grpc.max_message_length', message_limit),
        ('grpc.max_send_message_length', message_limit),
        ('grpc.max_receive_message_length', message_limit),
    ]
    worker_pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    server = grpc.server(worker_pool, options=channel_options)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("[RFDETR] Server started. Listening on: " + address, file=sys.stderr)
    # Shared handler for both termination signals
    def shutdown(sig, frame):
        print("[RFDETR] Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, shutdown)
    # Keep the main thread alive; the server runs on the worker pool.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
 if __name__ == "__main__":
    # Command-line entry point: parse the bind address, log the parsed arguments, start serving.
    cli = argparse.ArgumentParser(description="Run the RFDETR gRPC server.")
    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    args = cli.parse_args()
    print(f"[RFDETR] startup: {args}", file=sys.stderr)
    serve(args.addr)
--- a/backend/python/rfdetr/install.sh
+++ b/backend/python/rfdetr/install.sh
@@ -1,19 +0,0 @@
 #!/bin/bash
 set -e
 # Locate libbackend.sh: either in a "common" directory next to this script,
 # or in the sibling "../common" directory. Expansions are quoted so that
 # install paths containing spaces do not get word-split.
 backend_dir=$(dirname "$0")
 if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
 else
    source "$backend_dir/../common/libbackend.sh"
 fi
 # This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
 # This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
 # We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index.
 # The --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index.
 if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 # installRequirements is not defined here; it is expected to come from the sourced libbackend.sh.
 installRequirements
--- a/backend/python/rfdetr/protogen.sh
+++ b/backend/python/rfdetr/protogen.sh
@@ -1,13 +0,0 @@
 #!/bin/bash
 set -e
 # Locate libbackend.sh: either in a "common" directory next to this script,
 # or in the sibling "../common" directory. Expansions are quoted so that
 # paths containing spaces do not get word-split.
 backend_dir=$(dirname "$0")
 if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
 else
    source "$backend_dir/../common/libbackend.sh"
 fi
 # ensureVenv is not defined here; it is expected to come from the sourced libbackend.sh.
 ensureVenv
 # Generate the Python protobuf and gRPC stubs for backend.proto into the current directory.
 python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/rfdetr/requirements-cpu.txt
+++ b/backend/python/rfdetr/requirements-cpu.txt
@@ -1,7 +0,0 @@
 rfdetr
 opencv-python
 accelerate
 peft
 inference
 torch==2.7.1
 optimum-quanto
--- a/backend/python/rfdetr/requirements-cublas11.txt
+++ b/backend/python/rfdetr/requirements-cublas11.txt
@@ -1,8 +0,0 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.7.1+cu118
 rfdetr
 opencv-python
 accelerate
 inference
 peft
 optimum-quanto
--- a/backend/python/rfdetr/requirements-cublas12.txt
+++ b/backend/python/rfdetr/requirements-cublas12.txt
@@ -1,7 +0,0 @@
 torch==2.7.1
 rfdetr
 opencv-python
 accelerate
 inference
 peft
 optimum-quanto
--- a/backend/python/rfdetr/requirements-hipblas.txt
+++ b/backend/python/rfdetr/requirements-hipblas.txt
@@ -1,9 +0,0 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.3
 torch==2.7.1+rocm6.3
 torchvision==0.22.1+rocm6.3
 rfdetr
 opencv-python
 accelerate
 inference
 peft
 optimum-quanto
--- a/backend/python/rfdetr/requirements-intel.txt
+++ b/backend/python/rfdetr/requirements-intel.txt
@@ -1,13 +0,0 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch==2.3.110+xpu
 torch==2.3.1+cxx11.abi
 torchvision==0.18.1+cxx11.abi
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 setuptools
 rfdetr
 inference
 opencv-python
 accelerate
 peft
 optimum-quanto
--- a/backend/python/rfdetr/requirements.txt
+++ b/backend/python/rfdetr/requirements.txt
@@ -1,3 +0,0 @@
 grpcio==1.71.0
 protobuf
 grpcio-tools
--- a/backend/python/rfdetr/run.sh
+++ b/backend/python/rfdetr/run.sh
@@ -1,9 +0,0 @@
 #!/bin/bash
 # Locate libbackend.sh: either in a "common" directory next to this script,
 # or in the sibling "../common" directory. Expansions are quoted so that
 # paths containing spaces do not get word-split.
 backend_dir=$(dirname "$0")
 if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
 else
    source "$backend_dir/../common/libbackend.sh"
 fi
 # Forward the arguments exactly as received; "$@" keeps each argument intact
 # even when it contains whitespace. startBackend is expected to come from libbackend.sh.
 startBackend "$@"
--- a/backend/python/rfdetr/test.sh
+++ b/backend/python/rfdetr/test.sh
@@ -1,11 +0,0 @@
 #!/bin/bash
 set -e
 # Locate libbackend.sh: either in a "common" directory next to this script,
 # or in the sibling "../common" directory. Expansions are quoted so that
 # paths containing spaces do not get word-split.
 backend_dir=$(dirname "$0")
 if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
 else
    source "$backend_dir/../common/libbackend.sh"
 fi
 # runUnittests is not defined here; it is expected to come from the sourced libbackend.sh.
 runUnittests
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -2,8 +2,8 @@ package application
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 )
 type Application struct {
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -10,8 +10,8 @@ import (
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/internal"
 	coreStartup "github.com/mudler/LocalAI/core/startup"
 	"github.com/mudler/LocalAI/pkg/model"
 	pkgStartup "github.com/mudler/LocalAI/pkg/startup"
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/rs/zerolog/log"
 )
@@ -55,11 +55,11 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}
 	}
-	if err := coreStartup.InstallModels(options.Galleries, options.BackendGalleries, options.ModelPath, options.BackendsPath, options.EnforcePredownloadScans, options.AutoloadBackendGalleries, nil, options.ModelsURL...); err != nil {
+	if err := pkgStartup.InstallModels(options.Galleries, options.BackendGalleries, options.ModelPath, options.BackendsPath, options.EnforcePredownloadScans, options.AutoloadBackendGalleries, nil, options.ModelsURL...); err != nil {
 		log.Error().Err(err).Msg("error installing models")
 	}
-	if err := coreStartup.InstallExternalBackends(options.BackendGalleries, options.BackendsPath, nil, options.ExternalBackends...); err != nil {
+	if err := pkgStartup.InstallExternalBackends(options.BackendGalleries, options.BackendsPath, nil, options.ExternalBackends...); err != nil {
 		log.Error().Err(err).Msg("error installing external backends")
 	}
--- a/core/backend/detection.go
+++ b/core/backend/detection.go
@@ -1,34 +0,0 @@
 package backend
 import (
 	"context"
 	"fmt"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 // Detection loads the model described by backendConfig and asks it to run
 // detection on sourceFile, returning the backend's response unchanged.
 //
 // It returns the loader error if loading fails, a "could not load detection
 // model" error if the loader yields a nil model, and otherwise whatever the
 // model's Detect call returns.
 func Detection(
 	sourceFile string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig,
 ) (*proto.DetectResponse, error) {
 	detectionModel, err := loader.Load(ModelOptions(backendConfig, appConfig)...)
 	if err != nil {
 		return nil, err
 	}
 	defer loader.Close()
 	if detectionModel == nil {
 		return nil, fmt.Errorf("could not load detection model")
 	}
 	request := &proto.DetectOptions{
 		Src: sourceFile,
 	}
 	return detectionModel.Detect(context.Background(), request)
 }
--- a/core/cli/backends.go
+++ b/core/cli/backends.go
@@ -8,7 +8,7 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
-	"github.com/mudler/LocalAI/core/startup"
+	"github.com/mudler/LocalAI/pkg/startup"
 	"github.com/rs/zerolog/log"
 	"github.com/schollz/progressbar/v3"
 )
--- a/core/cli/models.go
+++ b/core/cli/models.go
@@ -9,8 +9,8 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/startup"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/startup"
 	"github.com/rs/zerolog/log"
 	"github.com/schollz/progressbar/v3"
 )
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -25,6 +25,7 @@ type RunCMD struct {
 	ModelsPath                   string        `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 	GeneratedContentPath         string        `env:"LOCALAI_GENERATED_CONTENT_PATH,GENERATED_CONTENT_PATH" type:"path" default:"/tmp/generated/content" help:"Location for generated content (e.g. images, audio, videos)" group:"storage"`
 	UploadPath                   string        `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
 	ConfigPath                   string        `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
 	LocalaiConfigDir             string        `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
 	LocalaiConfigDirPollInterval time.Duration `env:"LOCALAI_CONFIG_DIR_POLL_INTERVAL" help:"Typically the config path picks up changes automatically, but if your system has broken fsnotify events, set this to an interval to poll the LocalAI Config Dir (example: 1m)" group:"storage"`
 	// The alias on this option is there to preserve functionality with the old `--config-file` parameter
@@ -87,6 +88,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel),
 		config.WithGeneratedContentDir(r.GeneratedContentPath),
 		config.WithUploadDir(r.UploadPath),
 		config.WithConfigsDir(r.ConfigPath),
 		config.WithDynamicConfigDir(r.LocalaiConfigDir),
 		config.WithDynamicConfigDirPollInterval(r.LocalaiConfigDirPollInterval),
 		config.WithF16(r.F16),
--- a/core/cli/util.go
+++ b/core/cli/util.go
@@ -72,7 +72,7 @@ func (u *CreateOCIImageCMD) Run(ctx *cliContext.Context) error {
 }
 func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
-	if len(u.Args) == 0 {
+	if u.Args == nil || len(u.Args) == 0 {
 		return fmt.Errorf("no GGUF file provided")
 	}
 	// We try to guess only if we don't have a template defined already
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -21,7 +21,8 @@ type ApplicationConfig struct {
 	Debug                               bool
 	GeneratedContentDir                 string
-	UploadDir string
+	ConfigsDir string
 	UploadDir  string
 	DynamicConfigsDir             string
 	DynamicConfigsDirPollInterval time.Duration
@@ -301,6 +302,12 @@ func WithUploadDir(uploadDir string) AppOption {
 	}
 }
 // WithConfigsDir returns an AppOption that stores configsDir in
 // ApplicationConfig.ConfigsDir.
 func WithConfigsDir(configsDir string) AppOption {
 	applyConfigsDir := func(cfg *ApplicationConfig) {
 		cfg.ConfigsDir = configsDir
 	}
 	return applyConfigsDir
 }
 func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.DynamicConfigsDir = dynamicConfigsDir
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -458,7 +458,6 @@ const (
 	FLAG_TOKENIZE         BackendConfigUsecases = 0b001000000000
 	FLAG_VAD              BackendConfigUsecases = 0b010000000000
 	FLAG_VIDEO            BackendConfigUsecases = 0b100000000000
 	FLAG_DETECTION        BackendConfigUsecases = 0b1000000000000
 	// Common Subsets
 	FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
@@ -480,7 +479,6 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
 		"FLAG_VAD":              FLAG_VAD,
 		"FLAG_LLM":              FLAG_LLM,
 		"FLAG_VIDEO":            FLAG_VIDEO,
 		"FLAG_DETECTION":        FLAG_DETECTION,
 	}
 }
@@ -574,12 +572,6 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
 		}
 	}
 	if (u & FLAG_DETECTION) == FLAG_DETECTION {
 		if c.Backend != "rfdetr" {
 			return false
 		}
 	}
 	if (u & FLAG_SOUND_GENERATION) == FLAG_SOUND_GENERATION {
 		if c.Backend != "transformers-musicgen" {
 			return false
--- a/core/gallery/backend_types.go
+++ b/core/gallery/backend_types.go
@@ -2,8 +2,7 @@ package gallery
 import (
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/system"
+	"github.com/mudler/LocalAI/core/system"
 	"github.com/rs/zerolog/log"
 )
 // BackendMetadata represents the metadata stored in a JSON file for each installed backend
@@ -24,7 +23,6 @@ type GalleryBackend struct {
 	Metadata        `json:",inline" yaml:",inline"`
 	Alias           string            `json:"alias,omitempty" yaml:"alias,omitempty"`
 	URI             string            `json:"uri,omitempty" yaml:"uri,omitempty"`
 	Mirrors         []string          `json:"mirrors,omitempty" yaml:"mirrors,omitempty"`
 	CapabilitiesMap map[string]string `json:"capabilities,omitempty" yaml:"capabilities,omitempty"`
 }
@@ -35,11 +33,9 @@ func (backend *GalleryBackend) FindBestBackendFromMeta(systemState *system.Syste
 	realBackend := backend.CapabilitiesMap[systemState.Capability(backend.CapabilitiesMap)]
 	if realBackend == "" {
 		log.Debug().Str("backend", backend.Name).Str("reportedCapability", systemState.Capability(backend.CapabilitiesMap)).Msg("No backend found for reported capability")
 		return nil
 	}
 	log.Debug().Str("backend", backend.Name).Str("reportedCapability", systemState.Capability(backend.CapabilitiesMap)).Msg("Found backend for reported capability")
 	return backends.FindByName(realBackend)
 }
--- a/core/gallery/backends.go
+++ b/core/gallery/backends.go
@@ -8,9 +8,9 @@ import (
 	"time"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/system"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/system"
 	"github.com/rs/zerolog/log"
 )
@@ -146,18 +146,7 @@ func InstallBackend(basePath string, config *GalleryBackend, downloadStatus func
 	uri := downloader.URI(config.URI)
 	if err := uri.DownloadFile(backendPath, "", 1, 1, downloadStatus); err != nil {
-		success := false
+		return fmt.Errorf("failed to download backend %q: %v", config.URI, err)
 		// Try to download from mirrors
 		for _, mirror := range config.Mirrors {
 			if err := downloader.URI(mirror).DownloadFile(backendPath, "", 1, 1, downloadStatus); err == nil {
 				success = true
 				break
 			}
 		}
 		if !success {
 			return fmt.Errorf("failed to download backend %q: %v", config.URI, err)
 		}
 	}
 	// Create metadata for the backend
--- a/core/gallery/backends_test.go
+++ b/core/gallery/backends_test.go
@@ -7,7 +7,7 @@ import (
 	"runtime"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/pkg/system"
+	"github.com/mudler/LocalAI/core/system"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"gopkg.in/yaml.v2"
--- a/core/gallery/gallery.go
+++ b/core/gallery/gallery.go
@@ -95,7 +95,7 @@ func FindGalleryElement[T GalleryElement](models []T, name string, basePath stri
 	if !strings.Contains(name, "@") {
 		for _, m := range models {
-			if strings.EqualFold(strings.ToLower(m.GetName()), strings.ToLower(name)) {
+			if strings.EqualFold(m.GetName(), name) {
 				model = m
 				break
 			}
@@ -103,7 +103,7 @@ func FindGalleryElement[T GalleryElement](models []T, name string, basePath stri
 	} else {
 		for _, m := range models {
-			if strings.EqualFold(strings.ToLower(name), strings.ToLower(fmt.Sprintf("%s@%s", m.GetGallery().Name, m.GetName()))) {
+			if strings.EqualFold(name, fmt.Sprintf("%s@%s", m.GetGallery().Name, m.GetName())) {
 				model = m
 				break
 			}
--- a/core/gallery/models.go
+++ b/core/gallery/models.go
@@ -10,8 +10,8 @@ import (
 	"dario.cat/mergo"
 	"github.com/mudler/LocalAI/core/config"
 	lconfig "github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/system"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/system"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -10,8 +10,10 @@ import (
 	"github.com/dave-gray101/v2keyauth"
 	"github.com/gofiber/websocket/v2"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/LocalAI/core/http/endpoints/localai"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/http/routes"
@@ -197,6 +199,11 @@ func API(application *application.Application) (*fiber.App, error) {
 		router.Use(csrf.New())
 	}
 	// Load config jsons
 	utils.LoadConfig(application.ApplicationConfig().UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles)
 	utils.LoadConfig(application.ApplicationConfig().ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants)
 	utils.LoadConfig(application.ApplicationConfig().ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles)
 	galleryService := services.NewGalleryService(application.ApplicationConfig(), application.ModelLoader())
 	err = galleryService.Start(application.ApplicationConfig().Context, application.BackendLoader())
 	if err != nil {
--- a/core/http/endpoints/localai/detection.go
+++ b/core/http/endpoints/localai/detection.go
@@ -1,59 +0,0 @@
 package localai
 import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
 )
 // DetectionEndpoint is the LocalAI Detection endpoint https://localai.io/docs/api-reference/detection
 // @Summary Detects objects in the input image.
 // @Param request body schema.DetectionRequest true "query params"
 // @Success 200 {object} schema.DetectionResponse "Response"
 // @Router /v1/detection [post]
 //
 // The handler reads the parsed request and the model config from the fiber
 // locals, converts the input image to base64, runs backend.Detection and maps
 // every returned detection into the schema response.
 func DetectionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		req, isRequest := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.DetectionRequest)
 		if !isRequest || req.Model == "" {
 			return fiber.ErrBadRequest
 		}
 		modelCfg, isConfig := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
 		if !isConfig || modelCfg == nil {
 			return fiber.ErrBadRequest
 		}
 		log.Debug().Str("image", req.Image).Str("modelFile", "modelFile").Str("backend", modelCfg.Backend).Msg("Detection")
 		encodedImage, err := utils.GetContentURIAsBase64(req.Image)
 		if err != nil {
 			return err
 		}
 		result, err := backend.Detection(encodedImage, ml, appConfig, *modelCfg)
 		if err != nil {
 			return err
 		}
 		// Pre-size the slice so the JSON output is an array even with zero detections.
 		detections := make([]schema.Detection, 0, len(result.Detections))
 		for _, found := range result.Detections {
 			detections = append(detections, schema.Detection{
 				X:         found.X,
 				Y:         found.Y,
 				Width:     found.Width,
 				Height:    found.Height,
 				ClassName: found.ClassName,
 			})
 		}
 		return c.JSON(schema.DetectionResponse{Detections: detections})
 	}
 }
--- a/core/http/endpoints/localai/gallery.go
+++ b/core/http/endpoints/localai/gallery.go
@@ -15,10 +15,9 @@ import (
 )
 type ModelGalleryEndpointService struct {
-	galleries        []config.Gallery
+	galleries      []config.Gallery
-	backendGalleries []config.Gallery
+	modelPath      string
-	modelPath        string
+	galleryApplier *services.GalleryService
 	galleryApplier   *services.GalleryService
 }
 type GalleryModel struct {
@@ -26,12 +25,11 @@ type GalleryModel struct {
 	gallery.GalleryModel
 }
-func CreateModelGalleryEndpointService(galleries []config.Gallery, backendGalleries []config.Gallery, modelPath string, galleryApplier *services.GalleryService) ModelGalleryEndpointService {
+func CreateModelGalleryEndpointService(galleries []config.Gallery, modelPath string, galleryApplier *services.GalleryService) ModelGalleryEndpointService {
 	return ModelGalleryEndpointService{
-		galleries:        galleries,
+		galleries:      galleries,
-		backendGalleries: backendGalleries,
+		modelPath:      modelPath,
-		modelPath:        modelPath,
+		galleryApplier: galleryApplier,
 		galleryApplier:   galleryApplier,
 	}
 }
@@ -81,7 +79,6 @@ func (mgs *ModelGalleryEndpointService) ApplyModelGalleryEndpoint() func(c *fibe
 			ID:                 uuid.String(),
 			GalleryElementName: input.ID,
 			Galleries:          mgs.galleries,
 			BackendGalleries:   mgs.backendGalleries,
 		}
 		return c.JSON(schema.GalleryResponse{ID: uuid.String(), StatusURL: fmt.Sprintf("%smodels/jobs/%s", utils.BaseURL(c), uuid.String())})
--- a/core/http/endpoints/openai/assistant.go
+++ b/core/http/endpoints/openai/assistant.go
@@ -0,0 +1,522 @@
 package openai
 import (
 	"fmt"
 	"net/http"
 	"sort"
 	"strconv"
 	"strings"
 	"sync/atomic"
 	"time"
 	"github.com/gofiber/fiber/v2"
 	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
 )
 // ToolType names the kind of tool that can be enabled on an assistant.
 type ToolType string
 const (
 	// Tool kinds, spelled as they appear in the JSON "type" field of a Tool.
 	CodeInterpreter ToolType = "code_interpreter"
 	Retrieval       ToolType = "retrieval"
 	Function        ToolType = "function"
 	// Size limits for assistant fields. These are untyped constants.
 	// NOTE(review): only MaxFileIdSize is referenced in the visible code; confirm
 	// where (or whether) the remaining limits are enforced.
 	MaxCharacterInstructions  = 32768
 	MaxCharacterDescription   = 512
 	MaxCharacterName          = 256
 	MaxToolsSize              = 128
 	MaxFileIdSize             = 20
 	MaxCharacterMetadataKey   = 64
 	MaxCharacterMetadataValue = 512
 )
 // Tool is a single tool entry attached to an assistant; only its kind is recorded.
 type Tool struct {
 	Type ToolType `json:"type"` // One of the ToolType values.
 }
 // Assistant represents the structure of an assistant object from the OpenAI API.
 // Instances are kept in the package-level Assistants slice.
 type Assistant struct {
 	ID           string            `json:"id"`                     // The unique identifier of the assistant ("asst_" followed by a number).
 	Object       string            `json:"object"`                 // Object type, which is "assistant".
 	Created      int64             `json:"created"`                // The Unix time (seconds) at which the assistant was created.
 	Model        string            `json:"model"`                  // The model ID used by the assistant.
 	Name         string            `json:"name,omitempty"`         // The name of the assistant.
 	Description  string            `json:"description,omitempty"`  // The description of the assistant.
 	Instructions string            `json:"instructions,omitempty"` // The system instructions that the assistant uses.
 	Tools        []Tool            `json:"tools,omitempty"`        // A list of tools enabled on the assistant.
 	FileIDs      []string          `json:"file_ids,omitempty"`     // A list of file IDs attached to this assistant.
 	Metadata     map[string]string `json:"metadata,omitempty"`     // Set of key-value pairs attached to the assistant.
 }
 var (
 	// Assistants is the in-memory list of all assistants. It starts as an empty,
 	// non-nil slice so that it encodes to "[]" rather than "null" in JSON.
 	Assistants           = []Assistant{} // better to return empty array instead of "null"
 	// AssistantsConfigFile is the file name under the configs directory where
 	// Assistants is persisted.
 	AssistantsConfigFile = "assistants.json"
 )
 // AssistantRequest is the JSON body accepted when creating or modifying an
 // assistant. Its fields mirror the caller-settable fields of Assistant.
 type AssistantRequest struct {
 	Model        string            `json:"model"`                  // The model ID the assistant should use.
 	Name         string            `json:"name,omitempty"`         // Optional assistant name.
 	Description  string            `json:"description,omitempty"`  // Optional assistant description.
 	Instructions string            `json:"instructions,omitempty"` // Optional system instructions.
 	Tools        []Tool            `json:"tools,omitempty"`        // Optional list of tools to enable.
 	FileIDs      []string          `json:"file_ids,omitempty"`     // Optional list of attached file IDs.
 	Metadata     map[string]string `json:"metadata,omitempty"`     // Optional key-value pairs.
 }
 // CreateAssistantEndpoint is the OpenAI Assistant API endpoint https://platform.openai.com/docs/api-reference/assistants/createAssistant
 // @Summary Create an assistant with a model and instructions.
 // @Param request body AssistantRequest true "query params"
 // @Success 200 {object} Assistant "Response"
 // @Router /v1/assistants [post]
 //
 // The handler rejects bodies that cannot be parsed and models that are not
 // known (both with 400), then appends a new Assistant to the package-level
 // Assistants slice, persists that slice and returns the new assistant.
 func CreateAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		request := new(AssistantRequest)
 		if err := c.BodyParser(request); err != nil {
 			// A zerolog event is only written once Msg/Send is called.
 			log.Warn().Err(err).Msg("Unable to parse AssistantRequest")
 			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Cannot parse JSON"})
 		}
 		if !modelExists(cl, ml, request.Model) {
 			log.Warn().Msgf("Model: %s was not found in list of models.", request.Model)
 			return c.Status(fiber.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Model %q not found", request.Model)))
 		}
 		// Normalise absent collections to empty values instead of nil.
 		if request.Tools == nil {
 			request.Tools = []Tool{}
 		}
 		if request.FileIDs == nil {
 			request.FileIDs = []string{}
 		}
 		if request.Metadata == nil {
 			request.Metadata = make(map[string]string)
 		}
 		id := "asst_" + strconv.FormatInt(generateRandomID(), 10)
 		assistant := Assistant{
 			ID:           id,
 			Object:       "assistant",
 			Created:      time.Now().Unix(),
 			Model:        request.Model,
 			Name:         request.Name,
 			Description:  request.Description,
 			Instructions: request.Instructions,
 			Tools:        request.Tools,
 			FileIDs:      request.FileIDs,
 			Metadata:     request.Metadata,
 		}
 		Assistants = append(Assistants, assistant)
 		utils.SaveConfig(appConfig.ConfigsDir, AssistantsConfigFile, Assistants)
 		return c.Status(fiber.StatusOK).JSON(assistant)
 	}
 }
 // currentId is the last numeric assistant ID handed out; it is only touched
 // through sync/atomic.
 var currentId int64 = 0
 // generateRandomID returns the next numeric assistant ID. Despite the name the
 // IDs are sequential: each call atomically increments currentId and returns the
 // incremented value, so concurrent callers never observe the same ID.
 func generateRandomID() int64 {
 	// Use the value returned by AddInt64; re-reading currentId afterwards would
 	// be an unsynchronised read that can see another goroutine's increment.
 	return atomic.AddInt64(&currentId, 1)
 }
 // ListAssistantsEndpoint is the OpenAI Assistant API endpoint to list assistants https://platform.openai.com/docs/api-reference/assistants/listAssistants
 // @Summary List available assistants
 // @Param limit query int false "Limit the number of assistants returned"
 // @Param order query string false "Order of assistants returned"
 // @Param after query string false "Return assistants created after the given ID"
 // @Param before query string false "Return assistants created before the given ID"
 // @Success 200 {object} []Assistant "Response"
 // @Router /v1/assistants [get]
 //
 // The handler sorts by creation time ("asc", otherwise descending), applies the
 // optional after/before cursors and then truncates to limit (default 20).
 // A non-numeric or negative limit is answered with 400.
 func ListAssistantsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		// Sort and filter a private copy: assigning the slice would alias the
 		// package-level Assistants and reorder it for every other handler.
 		returnAssistants := make([]Assistant, len(Assistants))
 		copy(returnAssistants, Assistants)
 		// Parse query parameters
 		limitQuery := c.Query("limit", "20")
 		orderQuery := c.Query("order", "desc")
 		afterQuery := c.Query("after")
 		beforeQuery := c.Query("before")
 		// Convert string limit to integer; a negative limit would panic when slicing below.
 		limit, err := strconv.Atoi(limitQuery)
 		if err != nil || limit < 0 {
 			return c.Status(http.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Invalid limit query value: %s", limitQuery)))
 		}
 		// Sort assistants
 		sort.SliceStable(returnAssistants, func(i, j int) bool {
 			if orderQuery == "asc" {
 				return returnAssistants[i].Created < returnAssistants[j].Created
 			}
 			return returnAssistants[i].Created > returnAssistants[j].Created
 		})
 		// After and before cursors
 		if afterQuery != "" {
 			returnAssistants = filterAssistantsAfterID(returnAssistants, afterQuery)
 		}
 		if beforeQuery != "" {
 			returnAssistants = filterAssistantsBeforeID(returnAssistants, beforeQuery)
 		}
 		// Apply limit
 		if limit < len(returnAssistants) {
 			returnAssistants = returnAssistants[:limit]
 		}
 		return c.JSON(returnAssistants)
 	}
 }
 // filterAssistantsBeforeID keeps only the assistants whose numeric ID is
 // strictly lower than the numeric part of id, preserving the input order.
 //
 // id may be given either as a full assistant ID ("asst_7") or as the bare
 // number ("7"). If id has no valid numeric part the input slice is returned
 // unchanged; assistants whose own ID cannot be parsed are skipped.
 func filterAssistantsBeforeID(assistants []Assistant, id string) []Assistant {
 	// Strip the same prefix that is stripped from the stored IDs below, otherwise
 	// a real assistant ID used as cursor never parses and nothing is filtered.
 	idInt, err := strconv.Atoi(strings.TrimPrefix(id, "asst_"))
 	if err != nil {
 		return assistants // Return original slice if invalid id format is provided
 	}
 	var filteredAssistants []Assistant
 	for _, assistant := range assistants {
 		aid, err := strconv.Atoi(strings.TrimPrefix(assistant.ID, "asst_"))
 		if err != nil {
 			continue // Skip if invalid id in assistant
 		}
 		if aid < idInt {
 			filteredAssistants = append(filteredAssistants, assistant)
 		}
 	}
 	return filteredAssistants
 }
 // filterAssistantsAfterID keeps only the assistants whose numeric ID is
 // strictly greater than the numeric part of id, preserving the input order.
 //
 // id may be given either as a full assistant ID ("asst_7") or as the bare
 // number ("7"). If id has no valid numeric part the input slice is returned
 // unchanged; assistants whose own ID cannot be parsed are skipped.
 func filterAssistantsAfterID(assistants []Assistant, id string) []Assistant {
 	// Strip the same prefix that is stripped from the stored IDs below, otherwise
 	// a real assistant ID used as cursor never parses and nothing is filtered.
 	idInt, err := strconv.Atoi(strings.TrimPrefix(id, "asst_"))
 	if err != nil {
 		return assistants // Return original slice if invalid id format is provided
 	}
 	var filteredAssistants []Assistant
 	for _, assistant := range assistants {
 		aid, err := strconv.Atoi(strings.TrimPrefix(assistant.ID, "asst_"))
 		if err != nil {
 			continue // Skip if invalid id in assistant
 		}
 		if aid > idInt {
 			filteredAssistants = append(filteredAssistants, assistant)
 		}
 	}
 	return filteredAssistants
 }
 // modelExists reports whether modelName appears in the list returned by
 // services.ListModels. A listing error is treated as "not found".
 func modelExists(cl *config.BackendConfigLoader, ml *model.ModelLoader, modelName string) (found bool) {
 	available, err := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 	if err != nil {
 		return false
 	}
 	for _, candidate := range available {
 		if candidate == modelName {
 			return true
 		}
 	}
 	return false
 }
 // DeleteAssistantEndpoint is the OpenAI Assistant API endpoint to delete assistants https://platform.openai.com/docs/api-reference/assistants/deleteAssistant
 // @Summary Delete assistants
 // @Success 200 {object} schema.DeleteAssistantResponse "Response"
 // @Router /v1/assistants/{assistant_id} [delete]
 //
 // The handler removes the first assistant whose ID matches the path parameter,
 // persists the remaining list and reports Deleted: true. When no assistant
 // matches it answers 404 with Deleted: false.
 func DeleteAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		assistantID := c.Params("assistant_id")
 		if assistantID == "" {
 			return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
 		}
 		// Locate the assistant first, then act on the result.
 		position := -1
 		for idx := range Assistants {
 			if Assistants[idx].ID == assistantID {
 				position = idx
 				break
 			}
 		}
 		if position < 0 {
 			log.Warn().Msgf("Unable to find assistant %s for deletion", assistantID)
 			return c.Status(fiber.StatusNotFound).JSON(schema.DeleteAssistantResponse{
 				ID:      assistantID,
 				Object:  "assistant.deleted",
 				Deleted: false,
 			})
 		}
 		Assistants = append(Assistants[:position], Assistants[position+1:]...)
 		utils.SaveConfig(appConfig.ConfigsDir, AssistantsConfigFile, Assistants)
 		return c.Status(fiber.StatusOK).JSON(schema.DeleteAssistantResponse{
 			ID:      assistantID,
 			Object:  "assistant.deleted",
 			Deleted: true,
 		})
 	}
 }
 // GetAssistantEndpoint is the OpenAI Assistant API endpoint to get assistants https://platform.openai.com/docs/api-reference/assistants/getAssistant
 // @Summary Get assistant data
 // @Success 200 {object} Assistant "Response"
 // @Router /v1/assistants/{assistant_id} [get]
 //
 // The handler returns the first assistant whose ID matches the path parameter,
 // or 404 with a sanitized message when none matches.
 func GetAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		assistantID := c.Params("assistant_id")
 		if assistantID == "" {
 			return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
 		}
 		for idx := range Assistants {
 			if Assistants[idx].ID != assistantID {
 				continue
 			}
 			return c.Status(fiber.StatusOK).JSON(Assistants[idx])
 		}
 		notFound := fmt.Sprintf("Unable to find assistant with id: %s", assistantID)
 		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(notFound))
 	}
 }
 // AssistantFile records that an uploaded file has been attached to an assistant.
 type AssistantFile struct {
 	ID          string `json:"id"`           // ID of the attached file (copied from the uploaded file).
 	Object      string `json:"object"`       // Object type; set to "assistant.file" on creation.
 	CreatedAt   int64  `json:"created_at"`   // Unix time (seconds) at which the attachment was created.
 	AssistantID string `json:"assistant_id"` // ID of the assistant the file is attached to.
 }
 var (
 	// AssistantFiles is the in-memory list of all file attachments, across all assistants.
 	AssistantFiles           []AssistantFile
 	// AssistantsFileConfigFile is the file name under the configs directory where
 	// AssistantFiles is persisted.
 	AssistantsFileConfigFile = "assistantsFile.json"
 )
 // CreateAssistantFileEndpoint attaches an already-uploaded file to an assistant.
 //
 // The assistant is taken from the "assistant_id" path parameter and the file
 // from the request body's file ID. On success the file ID is appended to the
 // stored assistant, an AssistantFile record is added, both lists are persisted
 // and the new record is returned. It answers 400 for an unparsable body, a
 // missing assistant_id or an assistant that already holds MaxFileIdSize files,
 // and 404 when the assistant or the file is unknown.
 func CreateAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		request := new(schema.AssistantFileRequest)
 		if err := c.BodyParser(request); err != nil {
 			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Cannot parse JSON"})
 		}
 		assistantID := c.Params("assistant_id")
 		if assistantID == "" {
 			return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
 		}
 		for i := range Assistants {
 			// Work through a pointer into the slice: ranging by value would append
 			// the file ID to a throwaway copy and leave the stored assistant unchanged.
 			assistant := &Assistants[i]
 			if assistant.ID != assistantID {
 				continue
 			}
 			// ">=" so that at most MaxFileIdSize files can ever be attached.
 			if len(assistant.FileIDs) >= MaxFileIdSize {
 				return c.Status(fiber.StatusBadRequest).SendString(fmt.Sprintf("Max files %d for assistant %s reached.", MaxFileIdSize, assistant.Name))
 			}
 			for _, file := range UploadedFiles {
 				if file.ID != request.FileID {
 					continue
 				}
 				assistant.FileIDs = append(assistant.FileIDs, request.FileID)
 				assistantFile := AssistantFile{
 					ID:          file.ID,
 					Object:      "assistant.file",
 					CreatedAt:   time.Now().Unix(),
 					AssistantID: assistant.ID,
 				}
 				AssistantFiles = append(AssistantFiles, assistantFile)
 				// The assistant itself changed too, so persist both lists.
 				utils.SaveConfig(appConfig.ConfigsDir, AssistantsConfigFile, Assistants)
 				utils.SaveConfig(appConfig.ConfigsDir, AssistantsFileConfigFile, AssistantFiles)
 				return c.Status(fiber.StatusOK).JSON(assistantFile)
 			}
 			return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find file_id: %s", request.FileID)))
 		}
 		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find %q", assistantID)))
 	}
 }
 // ListAssistantFilesEndpoint lists the files attached to one assistant.
 //
 // The assistant is taken from the "assistant_id" path parameter. Query
 // parameters: "limit" (1-100, default 20; out-of-range or non-numeric values
 // fall back to 20) and "order" ("asc", otherwise descending by creation time).
 // The response is a list object with "data", "first_id", "last_id" and
 // "has_more"; "data" is always a JSON array, even when empty.
 func ListAssistantFilesEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		assistantID := c.Params("assistant_id")
 		if assistantID == "" {
 			return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
 		}
 		limitQuery := c.Query("limit", "20")
 		order := c.Query("order", "desc")
 		limit, err := strconv.Atoi(limitQuery)
 		if err != nil || limit < 1 || limit > 100 {
 			limit = 20 // Default to 20 if there's an error or the limit is out of bounds
 		}
 		// Collect only this assistant's files into a fresh slice. This both scopes
 		// the listing to the requested assistant and keeps the sort below from
 		// reordering the package-level AssistantFiles.
 		files := make([]AssistantFile, 0, len(AssistantFiles))
 		for _, file := range AssistantFiles {
 			if file.AssistantID == assistantID {
 				files = append(files, file)
 			}
 		}
 		// Sort files by CreatedAt depending on the order query parameter
 		sort.Slice(files, func(i, j int) bool {
 			if order == "asc" {
 				return files[i].CreatedAt < files[j].CreatedAt
 			}
 			return files[i].CreatedAt > files[j].CreatedAt // default to "desc"
 		})
 		// Limit the number of files returned
 		hasMore := len(files) > limit
 		limitedFiles := files
 		if hasMore {
 			limitedFiles = files[:limit]
 		}
 		firstID, lastID := "", ""
 		if len(limitedFiles) > 0 {
 			firstID = limitedFiles[0].ID
 			lastID = limitedFiles[len(limitedFiles)-1].ID
 		}
 		response := map[string]interface{}{
 			"object":   "list",
 			"data":     limitedFiles,
 			"first_id": firstID,
 			"last_id":  lastID,
 			"has_more": hasMore,
 		}
 		return c.Status(fiber.StatusOK).JSON(response)
 	}
 }
 // ModifyAssistantEndpoint returns a handler that replaces the mutable fields of
 // the assistant named by the "assistant_id" route parameter with the values
 // from the JSON request body. ID, Object and Created are carried over from the
 // existing record.
 //
 // It answers 400 for an unparsable body or an empty assistant_id, 404 when no
 // assistant has that ID, and 200 with the updated assistant otherwise.
 func ModifyAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		request := new(AssistantRequest)
 		if err := c.BodyParser(request); err != nil {
 			// A zerolog event is only written once Msg/Send is called.
 			log.Warn().Err(err).Msg("Unable to parse AssistantRequest")
 			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{"error": "Cannot parse JSON"})
 		}
 		assistantID := c.Params("assistant_id")
 		if assistantID == "" {
 			return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id is required")
 		}
 		for i, assistant := range Assistants {
 			if assistant.ID != assistantID {
 				continue
 			}
 			newAssistant := Assistant{
 				ID:           assistantID,
 				Object:       assistant.Object,
 				Created:      assistant.Created,
 				Model:        request.Model,
 				Name:         request.Name,
 				Description:  request.Description,
 				Instructions: request.Instructions,
 				Tools:        request.Tools,
 				FileIDs:      request.FileIDs, // todo: should probably verify fileids exist
 				Metadata:     request.Metadata,
 			}
 			// Remove the old record and append the new one; the modified
 			// assistant therefore moves to the end of the slice.
 			Assistants = append(Assistants[:i], Assistants[i+1:]...)
 			Assistants = append(Assistants, newAssistant)
 			utils.SaveConfig(appConfig.ConfigsDir, AssistantsConfigFile, Assistants)
 			return c.Status(fiber.StatusOK).JSON(newAssistant)
 		}
 		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant with id: %s", assistantID)))
 	}
 }
 // DeleteAssistantFileEndpoint returns a handler that detaches the file named
 // by the "file_id" route parameter from the assistant named by "assistant_id".
 //
 // Only the requested file is touched: its ID is removed from the assistant's
 // FileIDs and the matching AssistantFiles record (same assistant and same file
 // ID) is dropped. If either was present the handler answers 200 with
 // Deleted: true; otherwise it answers 404 with Deleted: false. An empty
 // assistant_id or file_id yields a 400.
 func DeleteAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		assistantID := c.Params("assistant_id")
 		fileId := c.Params("file_id")
 		if assistantID == "" || fileId == "" {
 			return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id and file_id are required")
 		}
 		deleted := schema.DeleteAssistantFileResponse{
 			ID:      fileId,
 			Object:  "assistant.file.deleted",
 			Deleted: true,
 		}
 		assistantFound := false
 		for i := range Assistants {
 			if Assistants[i].ID != assistantID {
 				continue
 			}
 			assistantFound = true
 			// First remove the requested file ID (and only that one) from the assistant.
 			detached := false
 			for j, id := range Assistants[i].FileIDs {
 				if id == fileId {
 					Assistants[i].FileIDs = append(Assistants[i].FileIDs[:j], Assistants[i].FileIDs[j+1:]...)
 					utils.SaveConfig(appConfig.ConfigsDir, AssistantsConfigFile, Assistants)
 					detached = true
 					break
 				}
 			}
 			if !detached {
 				log.Warn().Msgf("Unable to locate file_id: %s in assistants: %s. Continuing to delete assistant file.", fileId, assistantID)
 			}
 			// Then drop the association record for exactly this assistant/file pair.
 			for k, assistantFile := range AssistantFiles {
 				if assistantFile.AssistantID == assistantID && assistantFile.ID == fileId {
 					AssistantFiles = append(AssistantFiles[:k], AssistantFiles[k+1:]...)
 					utils.SaveConfig(appConfig.ConfigsDir, AssistantsFileConfigFile, AssistantFiles)
 					return c.Status(fiber.StatusOK).JSON(deleted)
 				}
 			}
 			if detached {
 				// The ID was attached to the assistant but had no association record.
 				return c.Status(fiber.StatusOK).JSON(deleted)
 			}
 			break
 		}
 		if assistantFound {
 			log.Warn().Msgf("Unable to find file_id: %s for assistant: %s", fileId, assistantID)
 		} else {
 			log.Warn().Msgf("Unable to find assistant: %s", assistantID)
 		}
 		return c.Status(fiber.StatusNotFound).JSON(schema.DeleteAssistantFileResponse{
 			ID:      fileId,
 			Object:  "assistant.file.deleted",
 			Deleted: false,
 		})
 	}
 }
 // GetAssistantFileEndpoint returns a handler that looks up a single
 // AssistantFile by the "assistant_id" and "file_id" route parameters.
 //
 // It answers 200 with the record when one matches both IDs, 404 when the
 // assistant has records but none with that file ID, 404 when no record exists
 // for the assistant at all, and 400 when either parameter is empty.
 func GetAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		assistantID := c.Params("assistant_id")
 		fileId := c.Params("file_id")
 		if assistantID == "" || fileId == "" {
 			return c.Status(fiber.StatusBadRequest).SendString("parameter assistant_id and file_id are required")
 		}
 		// Scan every record of the assistant; giving up at the first record
 		// with a different file ID would hide all but the assistant's first file.
 		assistantKnown := false
 		for _, assistantFile := range AssistantFiles {
 			if assistantFile.AssistantID != assistantID {
 				continue
 			}
 			assistantKnown = true
 			if assistantFile.ID == fileId {
 				return c.Status(fiber.StatusOK).JSON(assistantFile)
 			}
 		}
 		if assistantKnown {
 			return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId)))
 		}
 		return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID)))
 	}
 }
--- a/core/http/endpoints/openai/assistant_test.go
+++ b/core/http/endpoints/openai/assistant_test.go
@@ -0,0 +1,460 @@
 package openai
 import (
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/stretchr/testify/assert"
 )
 // configsDir is the directory the tests in this file use as
 // ApplicationConfig.ConfigsDir; tearDown removes the persisted assistant
 // config files from it.
 var configsDir string = "/tmp/localai/configs"
 // MockLoader is a placeholder type holding a list of model names.
 // NOTE(review): nothing in the visible part of this file references it — confirm it is still needed.
 type MockLoader struct {
 	models []string
 }
 // tearDown builds a cleanup callback (suitable for t.Cleanup) that resets the
 // package-level UploadedFiles, Assistants and AssistantFiles slices to empty
 // values and removes the persisted assistant config files from configsDir.
 // Errors from removing the files are deliberately ignored: the files may not exist.
 func tearDown() func() {
 	reset := func() {
 		UploadedFiles = []schema.File{}
 		Assistants = []Assistant{}
 		AssistantFiles = []AssistantFile{}
 		for _, name := range []string{AssistantsConfigFile, AssistantsFileConfigFile} {
 			_ = os.Remove(filepath.Join(configsDir, name))
 		}
 	}
 	return reset
 }
 // TestAssistantEndpoints wires the assistant, file and assistant-file handlers
 // into one fiber app and exercises them through app.Test in a series of
 // subtests. The handlers keep their state in package-level slices
 // (Assistants, AssistantFiles, UploadedFiles), so each subtest is responsible
 // for cleaning up what it creates.
 func TestAssistantEndpoints(t *testing.T) {
 	// Preparing the mocked objects
 	cl := &config.BackendConfigLoader{}
 	//configsDir := "/tmp/localai/configs"
 	modelPath := "/tmp/localai/model"
 	var ml = model.NewModelLoader(modelPath, false)
 	appConfig := &config.ApplicationConfig{
 		ConfigsDir:    configsDir,
 		UploadLimitMB: 10,
 		UploadDir:     "test_dir",
 		ModelPath:     modelPath,
 	}
 	// Start from an empty configs dir and make sure the model file named in the requests exists.
 	_ = os.RemoveAll(appConfig.ConfigsDir)
 	_ = os.MkdirAll(appConfig.ConfigsDir, 0750)
 	_ = os.MkdirAll(modelPath, 0750)
 	os.Create(filepath.Join(modelPath, "ggml-gpt4all-j"))
 	app := fiber.New(fiber.Config{
 		BodyLimit: 20 * 1024 * 1024, // sets the limit to 20MB.
 	})
 	// Create a Test Server
 	app.Get("/assistants", ListAssistantsEndpoint(cl, ml, appConfig))
 	app.Post("/assistants", CreateAssistantEndpoint(cl, ml, appConfig))
 	app.Delete("/assistants/:assistant_id", DeleteAssistantEndpoint(cl, ml, appConfig))
 	app.Get("/assistants/:assistant_id", GetAssistantEndpoint(cl, ml, appConfig))
 	app.Post("/assistants/:assistant_id", ModifyAssistantEndpoint(cl, ml, appConfig))
 	app.Post("/files", UploadFilesEndpoint(cl, appConfig))
 	app.Get("/assistants/:assistant_id/files", ListAssistantFilesEndpoint(cl, ml, appConfig))
 	app.Post("/assistants/:assistant_id/files", CreateAssistantFileEndpoint(cl, ml, appConfig))
 	app.Delete("/assistants/:assistant_id/files/:file_id", DeleteAssistantFileEndpoint(cl, ml, appConfig))
 	app.Get("/assistants/:assistant_id/files/:file_id", GetAssistantFileEndpoint(cl, ml, appConfig))
 	// Creating an assistant echoes the request fields back and adds one entry to Assistants.
 	t.Run("CreateAssistantEndpoint", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		ar := &AssistantRequest{
 			Model:        "ggml-gpt4all-j",
 			Name:         "3.5-turbo",
 			Description:  "Test Assistant",
 			Instructions: "You are computer science teacher answering student questions",
 			Tools:        []Tool{{Type: Function}},
 			FileIDs:      nil,
 			Metadata:     nil,
 		}
 		resultAssistant, resp, err := createAssistant(app, *ar)
 		assert.NoError(t, err)
 		assert.Equal(t, fiber.StatusOK, resp.StatusCode)
 		assert.Equal(t, 1, len(Assistants))
 		//t.Cleanup(cleanupAllAssistants(t, app, []string{resultAssistant.ID}))
 		assert.Equal(t, ar.Name, resultAssistant.Name)
 		assert.Equal(t, ar.Model, resultAssistant.Model)
 		assert.Equal(t, ar.Tools, resultAssistant.Tools)
 		assert.Equal(t, ar.Description, resultAssistant.Description)
 		assert.Equal(t, ar.Instructions, resultAssistant.Instructions)
 		assert.Equal(t, ar.FileIDs, resultAssistant.FileIDs)
 		assert.Equal(t, ar.Metadata, resultAssistant.Metadata)
 	})
 	// Creates four assistants, then runs a table of list queries against them.
 	t.Run("ListAssistantsEndpoint", func(t *testing.T) {
 		var ids []string
 		var resultAssistant []Assistant
 		for i := 0; i < 4; i++ {
 			ar := &AssistantRequest{
 				Model:        "ggml-gpt4all-j",
 				Name:         fmt.Sprintf("3.5-turbo-%d", i),
 				Description:  fmt.Sprintf("Test Assistant - %d", i),
 				Instructions: fmt.Sprintf("You are computer science teacher answering student questions - %d", i),
 				Tools:        []Tool{{Type: Function}},
 				FileIDs:      []string{"fid-1234"},
 				Metadata:     map[string]string{"meta": "data"},
 			}
 			//var err error
 			ra, _, err := createAssistant(app, *ar)
 			// Because we create the assistants so fast all end up with the same created time.
 			time.Sleep(time.Second)
 			resultAssistant = append(resultAssistant, ra)
 			assert.NoError(t, err)
 			ids = append(ids, resultAssistant[i].ID)
 		}
 		t.Cleanup(cleanupAllAssistants(t, app, ids))
 		// The expected results below are slices of the global Assistants as it
 		// stands after the four creations above, so the table must be built
 		// only once those requests have completed.
 		tests := []struct {
 			name                 string
 			reqURL               string
 			expectedStatus       int
 			expectedResult       []Assistant
 			expectedStringResult string
 		}{
 			{
 				name:           "Valid Usage - limit only",
 				reqURL:         "/assistants?limit=2",
 				expectedStatus: http.StatusOK,
 				expectedResult: Assistants[:2], // Expecting the first two assistants
 			},
 			{
 				name:           "Valid Usage - order asc",
 				reqURL:         "/assistants?order=asc",
 				expectedStatus: http.StatusOK,
 				expectedResult: Assistants, // Expecting all assistants in ascending order
 			},
 			{
 				name:           "Valid Usage - order desc",
 				reqURL:         "/assistants?order=desc",
 				expectedStatus: http.StatusOK,
 				expectedResult: []Assistant{Assistants[3], Assistants[2], Assistants[1], Assistants[0]}, // Expecting all assistants in descending order
 			},
 			{
 				name:           "Valid Usage - after specific ID",
 				reqURL:         "/assistants?after=2",
 				expectedStatus: http.StatusOK,
 				// Note this is correct because it's put in descending order already
 				expectedResult: Assistants[:3], // Expecting assistants after (excluding) ID 2
 			},
 			{
 				name:           "Valid Usage - before specific ID",
 				reqURL:         "/assistants?before=4",
 				expectedStatus: http.StatusOK,
 				// NOTE(review): the query uses before=4 while the trailing comment mentions ID 3 — confirm which is intended.
 				expectedResult: Assistants[2:], // Expecting assistants before (excluding) ID 3.
 			},
 			{
 				name:                 "Invalid Usage - non-integer limit",
 				reqURL:               "/assistants?limit=two",
 				expectedStatus:       http.StatusBadRequest,
 				expectedStringResult: "Invalid limit query value: two",
 			},
 			{
 				name:           "Invalid Usage - non-existing id in after",
 				reqURL:         "/assistants?after=100",
 				expectedStatus: http.StatusOK,
 				expectedResult: []Assistant(nil), // Expecting empty list as there are no IDs above 100
 			},
 		}
 		for _, tt := range tests {
 			t.Run(tt.name, func(t *testing.T) {
 				request := httptest.NewRequest(http.MethodGet, tt.reqURL, nil)
 				response, err := app.Test(request)
 				assert.NoError(t, err)
 				assert.Equal(t, tt.expectedStatus, response.StatusCode)
 				// Non-200 responses are compared as plain text, 200 responses as a decoded assistant list.
 				if tt.expectedStatus != fiber.StatusOK {
 					all, _ := io.ReadAll(response.Body)
 					assert.Equal(t, tt.expectedStringResult, string(all))
 				} else {
 					var result []Assistant
 					err = json.NewDecoder(response.Body).Decode(&result)
 					assert.NoError(t, err)
 					assert.Equal(t, tt.expectedResult, result)
 				}
 			})
 		}
 	})
 	// Deleting the only assistant must leave Assistants empty.
 	t.Run("DeleteAssistantEndpoint", func(t *testing.T) {
 		ar := &AssistantRequest{
 			Model:        "ggml-gpt4all-j",
 			Name:         "3.5-turbo",
 			Description:  "Test Assistant",
 			Instructions: "You are computer science teacher answering student questions",
 			Tools:        []Tool{{Type: Function}},
 			FileIDs:      nil,
 			Metadata:     nil,
 		}
 		resultAssistant, _, err := createAssistant(app, *ar)
 		assert.NoError(t, err)
 		target := fmt.Sprintf("/assistants/%s", resultAssistant.ID)
 		deleteReq := httptest.NewRequest(http.MethodDelete, target, nil)
 		_, err = app.Test(deleteReq)
 		assert.NoError(t, err)
 		assert.Equal(t, 0, len(Assistants))
 	})
 	// Fetching an assistant by ID returns a record with the same ID.
 	t.Run("GetAssistantEndpoint", func(t *testing.T) {
 		ar := &AssistantRequest{
 			Model:        "ggml-gpt4all-j",
 			Name:         "3.5-turbo",
 			Description:  "Test Assistant",
 			Instructions: "You are computer science teacher answering student questions",
 			Tools:        []Tool{{Type: Function}},
 			FileIDs:      nil,
 			Metadata:     nil,
 		}
 		resultAssistant, _, err := createAssistant(app, *ar)
 		assert.NoError(t, err)
 		t.Cleanup(cleanupAllAssistants(t, app, []string{resultAssistant.ID}))
 		target := fmt.Sprintf("/assistants/%s", resultAssistant.ID)
 		request := httptest.NewRequest(http.MethodGet, target, nil)
 		response, err := app.Test(request)
 		assert.NoError(t, err)
 		var getAssistant Assistant
 		err = json.NewDecoder(response.Body).Decode(&getAssistant)
 		assert.NoError(t, err)
 		assert.Equal(t, resultAssistant.ID, getAssistant.ID)
 	})
 	// Modifying an assistant keeps its ID and replaces the other compared fields.
 	t.Run("ModifyAssistantEndpoint", func(t *testing.T) {
 		ar := &AssistantRequest{
 			Model:        "ggml-gpt4all-j",
 			Name:         "3.5-turbo",
 			Description:  "Test Assistant",
 			Instructions: "You are computer science teacher answering student questions",
 			Tools:        []Tool{{Type: Function}},
 			FileIDs:      nil,
 			Metadata:     nil,
 		}
 		resultAssistant, _, err := createAssistant(app, *ar)
 		assert.NoError(t, err)
 		modifiedAr := &AssistantRequest{
 			Model:        "ggml-gpt4all-j",
 			Name:         "4.0-turbo",
 			Description:  "Modified Test Assistant",
 			Instructions: "You are math teacher answering student questions",
 			Tools:        []Tool{{Type: CodeInterpreter}},
 			FileIDs:      nil,
 			Metadata:     nil,
 		}
 		modifiedArJson, err := json.Marshal(modifiedAr)
 		assert.NoError(t, err)
 		target := fmt.Sprintf("/assistants/%s", resultAssistant.ID)
 		request := httptest.NewRequest(http.MethodPost, target, strings.NewReader(string(modifiedArJson)))
 		request.Header.Set(fiber.HeaderContentType, "application/json")
 		modifyResponse, err := app.Test(request)
 		assert.NoError(t, err)
 		var getAssistant Assistant
 		err = json.NewDecoder(modifyResponse.Body).Decode(&getAssistant)
 		assert.NoError(t, err)
 		t.Cleanup(cleanupAllAssistants(t, app, []string{getAssistant.ID}))
 		assert.Equal(t, resultAssistant.ID, getAssistant.ID) // IDs should match even if contents change
 		assert.Equal(t, modifiedAr.Tools, getAssistant.Tools)
 		assert.Equal(t, modifiedAr.Name, getAssistant.Name)
 		assert.Equal(t, modifiedAr.Instructions, getAssistant.Instructions)
 		assert.Equal(t, modifiedAr.Description, getAssistant.Description)
 	})
 	// Attaching an uploaded file yields an AssistantFile bound to the assistant.
 	t.Run("CreateAssistantFileEndpoint", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		file, assistant, err := createFileAndAssistant(t, app, appConfig)
 		assert.NoError(t, err)
 		afr := schema.AssistantFileRequest{FileID: file.ID}
 		af, _, err := createAssistantFile(app, afr, assistant.ID)
 		assert.NoError(t, err)
 		assert.Equal(t, assistant.ID, af.AssistantID)
 	})
 	// NOTE(review): this subtest only creates an assistant file; it never calls the list route — confirm whether a GET on /assistants/:assistant_id/files was intended.
 	t.Run("ListAssistantFilesEndpoint", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		file, assistant, err := createFileAndAssistant(t, app, appConfig)
 		assert.NoError(t, err)
 		afr := schema.AssistantFileRequest{FileID: file.ID}
 		af, _, err := createAssistantFile(app, afr, assistant.ID)
 		assert.NoError(t, err)
 		assert.Equal(t, assistant.ID, af.AssistantID)
 	})
 	// Fetching an assistant file returns the record created just before.
 	t.Run("GetAssistantFileEndpoint", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		file, assistant, err := createFileAndAssistant(t, app, appConfig)
 		assert.NoError(t, err)
 		afr := schema.AssistantFileRequest{FileID: file.ID}
 		af, _, err := createAssistantFile(app, afr, assistant.ID)
 		assert.NoError(t, err)
 		t.Cleanup(cleanupAssistantFile(t, app, af.ID, af.AssistantID))
 		target := fmt.Sprintf("/assistants/%s/files/%s", assistant.ID, file.ID)
 		request := httptest.NewRequest(http.MethodGet, target, nil)
 		response, err := app.Test(request)
 		assert.NoError(t, err)
 		var assistantFile AssistantFile
 		err = json.NewDecoder(response.Body).Decode(&assistantFile)
 		assert.NoError(t, err)
 		assert.Equal(t, af.ID, assistantFile.ID)
 		assert.Equal(t, af.AssistantID, assistantFile.AssistantID)
 	})
 	// Deleting the assistant file (via the cleanup helper, invoked immediately) must empty AssistantFiles.
 	t.Run("DeleteAssistantFileEndpoint", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		file, assistant, err := createFileAndAssistant(t, app, appConfig)
 		assert.NoError(t, err)
 		afr := schema.AssistantFileRequest{FileID: file.ID}
 		af, _, err := createAssistantFile(app, afr, assistant.ID)
 		assert.NoError(t, err)
 		cleanupAssistantFile(t, app, af.ID, af.AssistantID)()
 		assert.Empty(t, AssistantFiles)
 	})
 }
 // createFileAndAssistant creates one assistant through the test app and uploads
 // one file ("test.txt", purpose "fine-tune"), registering t.Cleanup callbacks
 // that delete both again. It returns the uploaded file and the assistant, or
 // the error from creating the assistant.
 func createFileAndAssistant(t *testing.T, app *fiber.App, o *config.ApplicationConfig) (schema.File, Assistant, error) {
 	req := AssistantRequest{
 		Model:        "ggml-gpt4all-j",
 		Name:         "3.5-turbo",
 		Description:  "Test Assistant",
 		Instructions: "You are computer science teacher answering student questions",
 		Tools:        []Tool{{Type: Function}},
 		FileIDs:      nil,
 		Metadata:     nil,
 	}
 	created, _, createErr := createAssistant(app, req)
 	if createErr != nil {
 		return schema.File{}, Assistant{}, createErr
 	}
 	t.Cleanup(cleanupAllAssistants(t, app, []string{created.ID}))
 	uploaded := CallFilesUploadEndpointWithCleanup(t, app, "test.txt", "file", "fine-tune", 5, o)
 	removeUpload := func() {
 		_, delErr := CallFilesDeleteEndpoint(t, app, uploaded.ID)
 		assert.NoError(t, delErr)
 	}
 	t.Cleanup(removeUpload)
 	return uploaded, created, nil
 }
 // createAssistantFile POSTs afr to /assistants/{assistantId}/files on the test
 // app and decodes the response body into an AssistantFile.
 //
 // It returns the decoded record, the HTTP response (nil only when the request
 // could not be built) and any marshal, transport or decode error. The response
 // body is consumed and closed before returning; status and headers stay usable.
 func createAssistantFile(app *fiber.App, afr schema.AssistantFileRequest, assistantId string) (AssistantFile, *http.Response, error) {
 	afrJson, err := json.Marshal(afr)
 	if err != nil {
 		return AssistantFile{}, nil, err
 	}
 	target := fmt.Sprintf("/assistants/%s/files", assistantId)
 	request := httptest.NewRequest(http.MethodPost, target, strings.NewReader(string(afrJson)))
 	request.Header.Set(fiber.HeaderContentType, "application/json")
 	request.Header.Set("OpenAi-Beta", "assistants=v1")
 	resp, err := app.Test(request)
 	if err != nil {
 		return AssistantFile{}, resp, err
 	}
 	// Always release the body, including on the decode-error path.
 	defer resp.Body.Close()
 	var assistantFile AssistantFile
 	if err := json.NewDecoder(resp.Body).Decode(&assistantFile); err != nil {
 		return AssistantFile{}, resp, err
 	}
 	return assistantFile, resp, nil
 }
 // createAssistant POSTs ar to /assistants on the test app and decodes the
 // response body into an Assistant.
 //
 // It returns the decoded assistant, the HTTP response (nil only when the
 // request could not be built) and any marshal, transport or decode error. The
 // response body is consumed and closed before returning; status and headers
 // stay usable.
 func createAssistant(app *fiber.App, ar AssistantRequest) (Assistant, *http.Response, error) {
 	assistant, err := json.Marshal(ar)
 	if err != nil {
 		return Assistant{}, nil, err
 	}
 	request := httptest.NewRequest(http.MethodPost, "/assistants", strings.NewReader(string(assistant)))
 	request.Header.Set(fiber.HeaderContentType, "application/json")
 	request.Header.Set("OpenAi-Beta", "assistants=v1")
 	resp, err := app.Test(request)
 	if err != nil {
 		return Assistant{}, resp, err
 	}
 	// Always release the body, including on the decode-error path.
 	defer resp.Body.Close()
 	var resultAssistant Assistant
 	err = json.NewDecoder(resp.Body).Decode(&resultAssistant)
 	return resultAssistant, resp, err
 }
 // cleanupAllAssistants builds a cleanup callback that issues a DELETE on
 // /assistants/{id} for every ID in ids and fails the test if a request cannot
 // be performed.
 func cleanupAllAssistants(t *testing.T, app *fiber.App, ids []string) func() {
 	deleteAll := func() {
 		for idx := range ids {
 			id := ids[idx]
 			req := httptest.NewRequest(http.MethodDelete, fmt.Sprintf("/assistants/%s", id), nil)
 			if _, reqErr := app.Test(req); reqErr != nil {
 				t.Fatalf("Failed to delete assistant %s: %v", id, reqErr)
 			}
 		}
 	}
 	return deleteAll
 }
 // cleanupAssistantFile builds a callback that DELETEs
 // /assistants/{assistantId}/files/{fileId} on the test app and asserts that
 // the decoded response reports Deleted == true.
 func cleanupAssistantFile(t *testing.T, app *fiber.App, fileId, assistantId string) func() {
 	return func() {
 		target := fmt.Sprintf("/assistants/%s/files/%s", assistantId, fileId)
 		request := httptest.NewRequest(http.MethodDelete, target, nil)
 		request.Header.Set(fiber.HeaderContentType, "application/json")
 		request.Header.Set("OpenAi-Beta", "assistants=v1")
 		resp, err := app.Test(request)
 		// assert.NoError only records the failure; stop here so a failed
 		// request does not turn into a nil dereference on resp.Body.
 		if !assert.NoError(t, err) {
 			return
 		}
 		defer resp.Body.Close()
 		var dafr schema.DeleteAssistantFileResponse
 		err = json.NewDecoder(resp.Body).Decode(&dafr)
 		assert.NoError(t, err)
 		assert.True(t, dafr.Deleted)
 	}
 }
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -15,8 +15,8 @@ import (
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
@@ -175,7 +175,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 		textContentToReturn = ""
 		id = uuid.New().String()
 		created = int(time.Now().Unix())
-
+		
 		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
 		if !ok || input.Model == "" {
 			return fiber.ErrBadRequest
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -15,9 +15,9 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )
--- a/core/http/endpoints/openai/edit.go
+++ b/core/http/endpoints/openai/edit.go
@@ -12,8 +12,8 @@ import (
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/rs/zerolog/log"
 )
--- a/core/http/endpoints/openai/files.go
+++ b/core/http/endpoints/openai/files.go
@@ -0,0 +1,194 @@
 package openai
 import (
 	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
 	"sync/atomic"
 	"time"
 	"github.com/microcosm-cc/bluemonday"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/pkg/utils"
 )
 // UploadedFiles is the in-memory list of uploaded file records; the handlers
 // in this file append to it, search it and remove from it directly.
 var UploadedFiles []schema.File
 // UploadedFilesFile is the file name passed to utils.SaveConfig when
 // UploadedFiles is persisted.
 const UploadedFilesFile = "uploadedFiles.json"
 // UploadFilesEndpoint https://platform.openai.com/docs/api-reference/files/create
 //
 // The returned handler stores the multipart form file "file" in
 // appConfig.UploadDir under a sanitized name, records it in UploadedFiles and
 // persists that list. It answers 400 when the file exceeds
 // appConfig.UploadLimitMB, when the "purpose" form value is missing, or when a
 // file with the same sanitized name already exists; 500 when saving fails; and
 // 200 with the new schema.File otherwise.
 func UploadFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		file, err := c.FormFile("file")
 		if err != nil {
 			return err
 		}
 		// Check the file size. Widen before multiplying so the byte limit
 		// cannot overflow a narrower integer type.
 		limitBytes := int64(appConfig.UploadLimitMB) * 1024 * 1024
 		if file.Size > limitBytes {
 			return c.Status(fiber.StatusBadRequest).SendString(fmt.Sprintf("File size %d exceeds upload limit %d", file.Size, appConfig.UploadLimitMB))
 		}
 		purpose := c.FormValue("purpose", "") //TODO put in purpose dirs
 		if purpose == "" {
 			return c.Status(fiber.StatusBadRequest).SendString("Purpose is not defined")
 		}
 		// Sanitize the filename to prevent directory traversal
 		filename := utils.SanitizeFileName(file.Filename)
 		savePath := filepath.Join(appConfig.UploadDir, filename)
 		// Check if file already exists
 		if _, err := os.Stat(savePath); !os.IsNotExist(err) {
 			return c.Status(fiber.StatusBadRequest).SendString("File already exists")
 		}
 		err = c.SaveFile(file, savePath)
 		if err != nil {
 			return c.Status(fiber.StatusInternalServerError).SendString("Failed to save file: " + bluemonday.StrictPolicy().Sanitize(err.Error()))
 		}
 		f := schema.File{
 			ID:        fmt.Sprintf("file-%d", getNextFileId()),
 			Object:    "file",
 			Bytes:     int(file.Size),
 			CreatedAt: time.Now(),
 			// Record the sanitized name: it is the name the file has on disk, and
 			// the delete/content handlers join it with UploadDir. Storing the raw
 			// client-supplied name would let "../" sequences escape UploadDir.
 			Filename: filename,
 			Purpose:  purpose,
 		}
 		UploadedFiles = append(UploadedFiles, f)
 		utils.SaveConfig(appConfig.UploadDir, UploadedFilesFile, UploadedFiles)
 		return c.Status(fiber.StatusOK).JSON(f)
 	}
 }
 // currentFileId is the last numeric ID handed out by getNextFileId.
 var currentFileId int64 = 0
 // getNextFileId atomically increments the file ID counter and returns the new
 // value. The result of AddInt64 is returned directly so that concurrent
 // callers can never observe each other's increments.
 func getNextFileId() int64 {
 	return atomic.AddInt64(&currentFileId, 1)
 }
 // ListFilesEndpoint https://platform.openai.com/docs/api-reference/files/list
 //
 // The returned handler answers with every entry of UploadedFiles, or only
 // those whose Purpose equals the optional "purpose" query parameter.
 // @Summary List files.
 // @Success 200 {object} schema.ListFiles "Response"
 // @Router /v1/files [get]
 func ListFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	handler := func(c *fiber.Ctx) error {
 		var result schema.ListFiles
 		wanted := c.Query("purpose")
 		switch wanted {
 		case "":
 			// No filter requested: hand back the full list as-is.
 			result.Data = UploadedFiles
 		default:
 			for idx := range UploadedFiles {
 				if UploadedFiles[idx].Purpose == wanted {
 					result.Data = append(result.Data, UploadedFiles[idx])
 				}
 			}
 		}
 		result.Object = "list"
 		return c.Status(fiber.StatusOK).JSON(result)
 	}
 	return handler
 }
 // getFileFromRequest resolves the "file_id" route parameter to an entry of
 // UploadedFiles. It returns a pointer to a copy of the matching record, or an
 // error when the parameter is empty or no record has that ID.
 func getFileFromRequest(c *fiber.Ctx) (*schema.File, error) {
 	wantedID := c.Params("file_id")
 	if len(wantedID) == 0 {
 		return nil, fmt.Errorf("file_id parameter is required")
 	}
 	for idx := range UploadedFiles {
 		if UploadedFiles[idx].ID != wantedID {
 			continue
 		}
 		// Copy the record so the caller's pointer does not alias the slice element.
 		match := UploadedFiles[idx]
 		return &match, nil
 	}
 	return nil, fmt.Errorf("unable to find file id %s", wantedID)
 }
 // GetFilesEndpoint is the OpenAI API endpoint to get files https://platform.openai.com/docs/api-reference/files/retrieve
 //
 // The returned handler answers with the record resolved by getFileFromRequest
 // as JSON, or with a 500 and the sanitized lookup error text.
 // @Summary Returns information about a specific file.
 // @Success 200 {object} schema.File "Response"
 // @Router /v1/files/{file_id} [get]
 func GetFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	handler := func(c *fiber.Ctx) error {
 		found, lookupErr := getFileFromRequest(c)
 		if lookupErr == nil {
 			return c.JSON(found)
 		}
 		message := bluemonday.StrictPolicy().Sanitize(lookupErr.Error())
 		return c.Status(fiber.StatusInternalServerError).SendString(message)
 	}
 	return handler
 }
 // DeleteStatus is the JSON body returned by DeleteFilesEndpoint after a file
 // has been removed. The fields carry no json tags, so they are encoded under
 // their Go names.
 type DeleteStatus struct {
 	// Id is the ID of the deleted file record.
 	Id      string
 	// Object is set to "file" by DeleteFilesEndpoint.
 	Object  string
 	// Deleted is set to true by DeleteFilesEndpoint.
 	Deleted bool
 }
 // DeleteFilesEndpoint is the OpenAI API endpoint to delete files https://platform.openai.com/docs/api-reference/files/delete
 //
 // The returned handler removes the file named by the "file_id" route parameter
 // from appConfig.UploadDir and from UploadedFiles, then persists the list. A
 // file that is already missing on disk is not treated as an error. Lookup or
 // removal failures yield a 500 with a sanitized message.
 // @Summary Delete a file.
 // @Success 200 {object} DeleteStatus "Response"
 // @Router /v1/files/{file_id} [delete]
 func DeleteFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		file, err := getFileFromRequest(c)
 		if err != nil {
 			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
 		}
 		// Uploads are saved under the sanitized name, so resolve the same path
 		// here. This also keeps a record carrying "../" in Filename from
 		// deleting anything outside UploadDir.
 		err = os.Remove(filepath.Join(appConfig.UploadDir, utils.SanitizeFileName(file.Filename)))
 		if err != nil {
 			// If the file doesn't exist then we should just continue to remove it
 			if !errors.Is(err, os.ErrNotExist) {
 				return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to delete file: %s, %v", file.Filename, err)))
 			}
 		}
 		// Remove upload from list
 		for i, f := range UploadedFiles {
 			if f.ID == file.ID {
 				UploadedFiles = append(UploadedFiles[:i], UploadedFiles[i+1:]...)
 				break
 			}
 		}
 		utils.SaveConfig(appConfig.UploadDir, UploadedFilesFile, UploadedFiles)
 		return c.JSON(DeleteStatus{
 			Id:      file.ID,
 			Object:  "file",
 			Deleted: true,
 		})
 	}
 }
 // GetFilesContentsEndpoint is the OpenAI API endpoint to get files content https://platform.openai.com/docs/api-reference/files/retrieve-contents
 //
 // The returned handler reads the file named by the "file_id" route parameter
 // from appConfig.UploadDir and sends its bytes as the response body. Lookup or
 // read failures yield a 500 with a sanitized message.
 // @Summary Returns the contents of the specified file.
 // @Success	200		{string}	binary				"file"
 // @Router /v1/files/{file_id}/content [get]
 func GetFilesContentsEndpoint(cm *config.BackendConfigLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		file, err := getFileFromRequest(c)
 		if err != nil {
 			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
 		}
 		// Uploads are saved under the sanitized name, so resolve the same path
 		// here. This also keeps a record carrying "../" in Filename from
 		// reading anything outside UploadDir.
 		fileContents, err := os.ReadFile(filepath.Join(appConfig.UploadDir, utils.SanitizeFileName(file.Filename)))
 		if err != nil {
 			return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
 		}
 		return c.Send(fileContents)
 	}
 }
--- a/core/http/endpoints/openai/files_test.go
+++ b/core/http/endpoints/openai/files_test.go
@@ -0,0 +1,301 @@
 package openai
 import (
 	"encoding/json"
 	"fmt"
 	"io"
 	"mime/multipart"
 	"net/http"
 	"net/http/httptest"
 	"os"
 	"path/filepath"
 	"strings"
 	"github.com/rs/zerolog/log"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/gofiber/fiber/v2"
 	utils2 "github.com/mudler/LocalAI/pkg/utils"
 	"github.com/stretchr/testify/assert"
 	"testing"
 )
 func startUpApp() (app *fiber.App, option *config.ApplicationConfig, loader *config.BackendConfigLoader) {
 	// Mocked configuration: 10MB upload limit, uploads stored under "test_dir".
 	option = &config.ApplicationConfig{
 		UploadLimitMB: 10,
 		UploadDir:     "test_dir",
 	}
 	loader = &config.BackendConfigLoader{}
 	// Start from an empty upload directory; a failure to remove it is ignored.
 	_ = os.RemoveAll(option.UploadDir)
 	// The body limit (20MB) is larger than the upload limit so the endpoint's own size check is what rejects big files.
 	app = fiber.New(fiber.Config{BodyLimit: 20 * 1024 * 1024})
 	// Register every files route on the test app.
 	app.Post("/files", UploadFilesEndpoint(loader, option))
 	app.Get("/files", ListFilesEndpoint(loader, option))
 	app.Get("/files/:file_id", GetFilesEndpoint(loader, option))
 	app.Delete("/files/:file_id", DeleteFilesEndpoint(loader, option))
 	app.Get("/files/:file_id/content", GetFilesContentsEndpoint(loader, option))
 	return app, option, loader
 }
 func TestUploadFileExceedSizeLimit(t *testing.T) {
 	// Preparing the mocked objects
 	loader := &config.BackendConfigLoader{}
 	option := &config.ApplicationConfig{
 		UploadLimitMB: 10,
 		UploadDir:     "test_dir",
 	}
 	_ = os.RemoveAll(option.UploadDir)
 	app := fiber.New(fiber.Config{
 		BodyLimit: 20 * 1024 * 1024, // sets the limit to 20MB.
 	})
 	// Create a Test Server
 	app.Post("/files", UploadFilesEndpoint(loader, option))
 	app.Get("/files", ListFilesEndpoint(loader, option))
 	app.Get("/files/:file_id", GetFilesEndpoint(loader, option))
 	app.Delete("/files/:file_id", DeleteFilesEndpoint(loader, option))
 	app.Get("/files/:file_id/content", GetFilesContentsEndpoint(loader, option))
 	t.Run("UploadFilesEndpoint file size exceeds limit", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		resp, err := CallFilesUploadEndpoint(t, app, "foo.txt", "file", "fine-tune", 11, option)
 		assert.NoError(t, err)
 		assert.Equal(t, fiber.StatusBadRequest, resp.StatusCode)
 		assert.Contains(t, bodyToString(resp, t), "exceeds upload limit")
 	})
 	t.Run("UploadFilesEndpoint purpose not defined", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		resp, _ := CallFilesUploadEndpoint(t, app, "foo.txt", "file", "", 5, option)
 		assert.Equal(t, fiber.StatusBadRequest, resp.StatusCode)
 		assert.Contains(t, bodyToString(resp, t), "Purpose is not defined")
 	})
 	t.Run("UploadFilesEndpoint file already exists", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		f1 := CallFilesUploadEndpointWithCleanup(t, app, "foo.txt", "file", "fine-tune", 5, option)
 		resp, err := CallFilesUploadEndpoint(t, app, "foo.txt", "file", "fine-tune", 5, option)
 		fmt.Println(f1)
 		fmt.Printf("ERror: %v\n", err)
 		fmt.Printf("resp: %+v\n", resp)
 		assert.Equal(t, fiber.StatusBadRequest, resp.StatusCode)
 		assert.Contains(t, bodyToString(resp, t), "File already exists")
 	})
 	t.Run("UploadFilesEndpoint file uploaded successfully", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		file := CallFilesUploadEndpointWithCleanup(t, app, "test.txt", "file", "fine-tune", 5, option)
 		// Check if file exists in the disk
 		testName := strings.Split(t.Name(), "/")[1]
 		fileName := testName + "-test.txt"
 		filePath := filepath.Join(option.UploadDir, utils2.SanitizeFileName(fileName))
 		_, err := os.Stat(filePath)
 		assert.False(t, os.IsNotExist(err))
 		assert.Equal(t, file.Bytes, 5242880)
 		assert.NotEmpty(t, file.CreatedAt)
 		assert.Equal(t, file.Filename, fileName)
 		assert.Equal(t, file.Purpose, "fine-tune")
 	})
 	t.Run("ListFilesEndpoint without purpose parameter", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		resp, err := CallListFilesEndpoint(t, app, "")
 		assert.NoError(t, err)
 		assert.Equal(t, 200, resp.StatusCode)
 		listFiles := responseToListFile(t, resp)
 		if len(listFiles.Data) != len(UploadedFiles) {
 			t.Errorf("Expected %v files, got %v files", len(UploadedFiles), len(listFiles.Data))
 		}
 	})
 	t.Run("ListFilesEndpoint with valid purpose parameter", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		_ = CallFilesUploadEndpointWithCleanup(t, app, "test.txt", "file", "fine-tune", 5, option)
 		resp, err := CallListFilesEndpoint(t, app, "fine-tune")
 		assert.NoError(t, err)
 		listFiles := responseToListFile(t, resp)
 		if len(listFiles.Data) != 1 {
 			t.Errorf("Expected 1 file, got %v files", len(listFiles.Data))
 		}
 	})
 	t.Run("ListFilesEndpoint with invalid query parameter", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		resp, err := CallListFilesEndpoint(t, app, "not-so-fine-tune")
 		assert.NoError(t, err)
 		assert.Equal(t, 200, resp.StatusCode)
 		listFiles := responseToListFile(t, resp)
 		if len(listFiles.Data) != 0 {
 			t.Errorf("Expected 0 file, got %v files", len(listFiles.Data))
 		}
 	})
 	t.Run("GetFilesContentsEndpoint get file content", func(t *testing.T) {
 		t.Cleanup(tearDown())
 		req := httptest.NewRequest("GET", "/files", nil)
 		resp, _ := app.Test(req)
 		assert.Equal(t, 200, resp.StatusCode)
 		var listFiles schema.ListFiles
 		if err := json.Unmarshal(bodyToByteArray(resp, t), &listFiles); err != nil {
 			t.Errorf("Failed to decode response: %v", err)
 			return
 		}
 		if len(listFiles.Data) != 0 {
 			t.Errorf("Expected 0 file, got %v files", len(listFiles.Data))
 		}
 	})
 }
 func CallListFilesEndpoint(t *testing.T, app *fiber.App, purpose string) (*http.Response, error) {
 	// List every file by default; add the purpose filter only when one is given.
 	target := "/files"
 	if purpose != "" {
 		target = fmt.Sprintf("/files?purpose=%s", purpose)
 	}
 	return app.Test(httptest.NewRequest("GET", target, nil))
 }
 // CallFilesContentEndpoint issues a GET against the file-content route for fileId
 // and returns the raw response from the test app.
 func CallFilesContentEndpoint(t *testing.T, app *fiber.App, fileId string) (*http.Response, error) {
 	// The content handler is registered on "/files/:file_id/content"; a "?file_id=" query
 	// on "/files" would hit the list endpoint instead.
 	request := httptest.NewRequest(http.MethodGet, "/files/"+fileId+"/content", nil)
 	return app.Test(request)
 }
 func CallFilesUploadEndpoint(t *testing.T, app *fiber.App, fileName, tag, purpose string, fileSize int, appConfig *config.ApplicationConfig) (*http.Response, error) {
 	// The fixture name is prefixed with the subtest name (second segment of t.Name()),
 	// so this helper must be called from inside a subtest.
 	subtest := strings.Split(t.Name(), "/")[1]
 	// Generate a fixture file of fileSize MB to upload.
 	fixture := createTestFile(t, subtest+"-"+fileName, fileSize, appConfig)
 	// Wrap the fixture in a multipart form and POST it to the upload route.
 	payload, form := newMultipartFile(fixture.Name(), tag, purpose)
 	uploadReq := httptest.NewRequest(http.MethodPost, "/files", payload)
 	uploadReq.Header.Set(fiber.HeaderContentType, form.FormDataContentType())
 	return app.Test(uploadReq)
 }
 func CallFilesUploadEndpointWithCleanup(t *testing.T, app *fiber.App, fileName, tag, purpose string, fileSize int, appConfig *config.ApplicationConfig) schema.File {
 	// Same upload request as CallFilesUploadEndpoint, but the response is asserted
 	// error-free and decoded into a schema.File.
 	resp, err := CallFilesUploadEndpoint(t, app, fileName, tag, purpose, fileSize, appConfig)
 	assert.NoError(t, err)
 	// Despite the name, no delete-endpoint cleanup is registered here (that logic was
 	// disabled); the only cleanup is the one createTestFile registers for the fixture.
 	return responseToFile(t, resp)
 }
 func CallFilesDeleteEndpoint(t *testing.T, app *fiber.App, fileId string) (*http.Response, error) {
 	// DELETE /files/<fileId> against the test app.
 	deleteReq := httptest.NewRequest(http.MethodDelete, "/files/"+fileId, nil)
 	return app.Test(deleteReq)
 }
 // newMultipartFile builds an in-memory multipart/form-data body holding the file at
 // filePath under the form field tag (using the file's base name) and, when purpose
 // is non-empty, a "purpose" field. It returns a reader over the encoded body and the
 // writer, whose boundary/content type describe that body.
 func newMultipartFile(filePath, tag, purpose string) (*strings.Reader, *multipart.Writer) {
 	var payload strings.Builder
 	form := multipart.NewWriter(&payload)
 	// Errors from opening, copying and closing are not checked by this helper.
 	src, _ := os.Open(filePath)
 	defer src.Close()
 	dst, _ := form.CreateFormFile(tag, filepath.Base(filePath))
 	_, _ = io.Copy(dst, src)
 	if purpose != "" {
 		_ = form.WriteField("purpose", purpose)
 	}
 	// Closing the writer emits the terminating boundary.
 	_ = form.Close()
 	return strings.NewReader(payload.String()), form
 }
 // createTestFile makes sure option.UploadDir exists, then creates a file called name
 // (relative to the working directory) filled with sizeMB MB of 'a' bytes and returns
 // its handle. Any failure aborts the test. A cleanup is registered that closes the
 // handle, removes the file and removes the upload directory.
 func createTestFile(t *testing.T, name string, sizeMB int, option *config.ApplicationConfig) *os.File {
 	t.Helper()
 	if err := os.MkdirAll(option.UploadDir, 0750); err != nil {
 		t.Fatalf("Error MKDIR: %v", err)
 	}
 	file, err := os.Create(name)
 	if err != nil {
 		// Stop here: writing through a nil handle below would only hide the real failure.
 		t.Fatalf("Error creating test file %q: %v", name, err)
 	}
 	// Registered before the write so the file is cleaned up even if the write fails.
 	t.Cleanup(func() {
 		// Close first so the descriptor is not leaked; callers only use file.Name().
 		_ = file.Close()
 		_ = os.Remove(name)
 		_ = os.RemoveAll(option.UploadDir)
 	})
 	if _, err := file.WriteString(strings.Repeat("a", sizeMB*1024*1024)); err != nil { // sizeMB MB File
 		t.Fatalf("Error writing test file %q: %v", name, err)
 	}
 	return file
 }
 func bodyToString(resp *http.Response, t *testing.T) string {
 	// Read the whole response body and hand it back as text.
 	raw := bodyToByteArray(resp, t)
 	return string(raw)
 }
 // bodyToByteArray reads resp.Body to the end, closes it and returns the bytes read.
 // A read error aborts the test.
 func bodyToByteArray(resp *http.Response, t *testing.T) []byte {
 	t.Helper()
 	// The body is fully consumed here, so release it instead of leaving it open.
 	defer resp.Body.Close()
 	bodyBytes, err := io.ReadAll(resp.Body)
 	if err != nil {
 		t.Fatal(err)
 	}
 	return bodyBytes
 }
 func responseToFile(t *testing.T, resp *http.Response) schema.File {
 	// Decode the JSON response body into a schema.File; a decode failure marks the
 	// test as failed and the (possibly partially filled) value is still returned.
 	var decoded schema.File
 	payload := strings.NewReader(bodyToString(resp, t))
 	if err := json.NewDecoder(payload).Decode(&decoded); err != nil {
 		t.Errorf("Failed to decode response: %s", err)
 	}
 	return decoded
 }
 // responseToListFile decodes the JSON response body into a schema.ListFiles.
 // A decode failure is logged and marks the test as failed; the (possibly empty)
 // value is still returned.
 func responseToListFile(t *testing.T, resp *http.Response) schema.ListFiles {
 	t.Helper()
 	var listFiles schema.ListFiles
 	responseToString := bodyToString(resp, t)
 	err := json.NewDecoder(strings.NewReader(responseToString)).Decode(&listFiles)
 	if err != nil {
 		log.Error().Err(err).Msg("failed to decode response")
 		// Fail the test too, as responseToFile does: otherwise an undecodable body
 		// looks like an empty list and "expected 0 files" checks pass by accident.
 		t.Errorf("Failed to decode response: %s", err)
 	}
 	return listFiles
 }
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -16,12 +16,12 @@ import (
 	"github.com/mudler/LocalAI/core/application"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai/types"
 	"github.com/mudler/LocalAI/core/templates"
 	laudio "github.com/mudler/LocalAI/pkg/audio"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/sound"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"google.golang.org/grpc"
@@ -29,8 +29,8 @@ import (
 )
 const (
-	localSampleRate  = 16000
+	localSampleRate       = 16000
-	remoteSampleRate = 24000
+	remoteSampleRate      = 24000
 )
 // A model can be "emulated" that is: transcribe audio to text -> feed text to the LLM -> generate audio as result
@@ -210,9 +210,9 @@ func registerRealtime(application *application.Application) func(c *websocket.Co
 					// TODO: Need some way to pass this to the backend
 					Threshold: 0.5,
 					// TODO: This is ignored and the amount of padding is random at present
-					PrefixPaddingMs:   30,
+					PrefixPaddingMs: 30,
 					SilenceDurationMs: 500,
-					CreateResponse:    func() *bool { t := true; return &t }(),
+					CreateResponse: func() *bool { t := true; return &t }(),
 				},
 			},
 			InputAudioTranscription: &types.InputAudioTranscription{
@@ -233,7 +233,7 @@ func registerRealtime(application *application.Application) func(c *websocket.Co
 		// TODO: The API has no way to configure the VAD model or other models that make up a pipeline to fake any-to-any
 		//       So possibly we could have a way to configure a composite model that can be used in situations where any-to-any is expected
 		pipeline := config.Pipeline{
-			VAD:           "silero-vad",
+			VAD: "silero-vad",
 			Transcription: session.InputAudioTranscription.Model,
 		}
@@ -567,8 +567,8 @@ func updateTransSession(session *Session, update *types.ClientSession, cl *confi
 	trCur := session.InputAudioTranscription
 	if trUpd != nil && trUpd.Model != "" && trUpd.Model != trCur.Model {
-		pipeline := config.Pipeline{
+		pipeline := config.Pipeline {
-			VAD:           "silero-vad",
+			VAD: "silero-vad",
 			Transcription: trUpd.Model,
 		}
@@ -684,7 +684,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 				sendEvent(c, types.InputAudioBufferClearedEvent{
 					ServerEventBase: types.ServerEventBase{
 						EventID: "event_TODO",
-						Type:    types.ServerEventTypeInputAudioBufferCleared,
+						Type: types.ServerEventTypeInputAudioBufferCleared,
 					},
 				})
@@ -697,7 +697,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 				sendEvent(c, types.InputAudioBufferSpeechStartedEvent{
 					ServerEventBase: types.ServerEventBase{
 						EventID: "event_TODO",
-						Type:    types.ServerEventTypeInputAudioBufferSpeechStarted,
+						Type: types.ServerEventTypeInputAudioBufferSpeechStarted,
 					},
 					AudioStartMs: time.Now().Sub(startTime).Milliseconds(),
 				})
@@ -719,7 +719,7 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 				sendEvent(c, types.InputAudioBufferSpeechStoppedEvent{
 					ServerEventBase: types.ServerEventBase{
 						EventID: "event_TODO",
-						Type:    types.ServerEventTypeInputAudioBufferSpeechStopped,
+						Type: types.ServerEventTypeInputAudioBufferSpeechStopped,
 					},
 					AudioEndMs: time.Now().Sub(startTime).Milliseconds(),
 				})
@@ -728,9 +728,9 @@ func handleVAD(cfg *config.BackendConfig, evaluator *templates.Evaluator, sessio
 				sendEvent(c, types.InputAudioBufferCommittedEvent{
 					ServerEventBase: types.ServerEventBase{
 						EventID: "event_TODO",
-						Type:    types.ServerEventTypeInputAudioBufferCommitted,
+						Type: types.ServerEventTypeInputAudioBufferCommitted,
 					},
-					ItemID:         generateItemID(),
+					ItemID: generateItemID(),
 					PreviousItemID: "TODO",
 				})
@@ -833,9 +833,9 @@ func commitUtterance(ctx context.Context, utt []byte, cfg *config.BackendConfig,
 func runVAD(ctx context.Context, session *Session, adata []int16) ([]*proto.VADSegment, error) {
 	soundIntBuffer := &audio.IntBuffer{
-		Format:         &audio.Format{SampleRate: localSampleRate, NumChannels: 1},
+		Format: &audio.Format{SampleRate: localSampleRate, NumChannels: 1},
 		SourceBitDepth: 16,
-		Data:           sound.ConvertInt16ToInt(adata),
+		Data: sound.ConvertInt16ToInt(adata),
 	}
 	float32Data := soundIntBuffer.AsFloat32Buffer().Data
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -11,9 +11,9 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -23,7 +23,7 @@ func RegisterLocalAIRoutes(router *fiber.App,
 	// LocalAI API endpoints
 	if !appConfig.DisableGalleryEndpoint {
-		modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.BackendGalleries, appConfig.ModelPath, galleryService)
+		modelGalleryEndpointService := localai.CreateModelGalleryEndpointService(appConfig.Galleries, appConfig.ModelPath, galleryService)
 		router.Post("/models/apply", modelGalleryEndpointService.ApplyModelGalleryEndpoint())
 		router.Post("/models/delete/:name", modelGalleryEndpointService.DeleteModelGalleryEndpoint())
@@ -41,11 +41,6 @@ func RegisterLocalAIRoutes(router *fiber.App,
 		router.Get("/backends/jobs/:uuid", backendGalleryEndpointService.GetOpStatusEndpoint())
 	}
 	router.Post("/v1/detection",
 		requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_DETECTION)),
 		requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.DetectionRequest) }),
 		localai.DetectionEndpoint(cl, ml, appConfig))
 	router.Post("/tts",
 		requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)),
 		requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TTSRequest) }),
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -54,6 +54,38 @@ func RegisterOpenAIRoutes(app *fiber.App,
 	app.Post("/completions", completionChain...)
 	app.Post("/v1/engines/:model/completions", completionChain...)
 	// assistant
 	app.Get("/v1/assistants", openai.ListAssistantsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Get("/assistants", openai.ListAssistantsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Post("/v1/assistants", openai.CreateAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Post("/assistants", openai.CreateAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Delete("/v1/assistants/:assistant_id", openai.DeleteAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Delete("/assistants/:assistant_id", openai.DeleteAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Get("/v1/assistants/:assistant_id", openai.GetAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Get("/assistants/:assistant_id", openai.GetAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Post("/v1/assistants/:assistant_id", openai.ModifyAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Post("/assistants/:assistant_id", openai.ModifyAssistantEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Get("/v1/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Get("/assistants/:assistant_id/files", openai.ListAssistantFilesEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Post("/v1/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Post("/assistants/:assistant_id/files", openai.CreateAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Delete("/v1/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Delete("/assistants/:assistant_id/files/:file_id", openai.DeleteAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Get("/v1/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Get("/assistants/:assistant_id/files/:file_id", openai.GetAssistantFileEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	// files
 	app.Post("/v1/files", openai.UploadFilesEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Post("/files", openai.UploadFilesEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Get("/v1/files", openai.ListFilesEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Get("/files", openai.ListFilesEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Get("/v1/files/:file_id", openai.GetFilesEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Get("/files/:file_id", openai.GetFilesEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Delete("/v1/files/:file_id", openai.DeleteFilesEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Delete("/files/:file_id", openai.DeleteFilesEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	// embeddings
 	embeddingChain := []fiber.Handler{
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_EMBEDDINGS)),
--- a/core/http/routes/ui_gallery.go
+++ b/core/http/routes/ui_gallery.go
@@ -180,7 +180,6 @@ func registerGalleryRoutes(app *fiber.App, cl *config.BackendConfigLoader, appCo
 			ID:                 uid,
 			GalleryElementName: galleryID,
 			Galleries:          appConfig.Galleries,
 			BackendGalleries:   appConfig.BackendGalleries,
 		}
 		go func() {
 			galleryService.ModelGalleryChannel <- op
@@ -220,7 +219,6 @@ func registerGalleryRoutes(app *fiber.App, cl *config.BackendConfigLoader, appCo
 			Delete:             true,
 			GalleryElementName: galleryName,
 			Galleries:          appConfig.Galleries,
 			BackendGalleries:   appConfig.BackendGalleries,
 		}
 		go func() {
 			galleryService.ModelGalleryChannel <- op
--- a/core/http/views/backends.html
+++ b/core/http/views/backends.html
@@ -90,14 +90,6 @@
                        hx-indicator=".htmx-indicator">
                        <i class="fas fa-headphones mr-2"></i>Whisper
                    </button>
                    <button hx-post="browse/search/backends" 
                        class="inline-flex items-center rounded-full px-4 py-2 text-sm font-medium bg-red-900/60 text-red-200 border border-red-700/50 hover:bg-red-800 transition duration-200 ease-in-out"
                        hx-target="#search-results" 
                        hx-vals='{"search": "object-detection"}'
                        onclick="hidePagination()"
                        hx-indicator=".htmx-indicator">
                        <i class="fas fa-eye mr-2"></i>Object detection
                    </button>
                </div>
            </div>
        </div>
--- a/core/http/views/models.html
+++ b/core/http/views/models.html
@@ -115,14 +115,6 @@
                        hx-indicator=".htmx-indicator">
                        <i class="fas fa-headphones mr-2"></i>Audio transcription
                    </button>
                    <button hx-post="browse/search/models"
                        class="inline-flex items-center rounded-full px-4 py-2 text-sm font-medium bg-red-900/60 text-red-200 border border-red-700/50 hover:bg-red-800 transition duration-200 ease-in-out"
                        hx-target="#search-results" 
                        hx-vals='{"search": "object-detection"}'
                        onclick="hidePagination()"
                        hx-indicator=".htmx-indicator">
                        <i class="fas fa-eye mr-2"></i>Object detection
                    </button>
                </div>
            </div>
--- a/core/p2p/p2p.go
+++ b/core/p2p/p2p.go
@@ -278,7 +278,6 @@ func ensureService(ctx context.Context, n *node.Node, nd *NodeData, sserv string
 			port, err := freeport.GetFreePort()
 			if err != nil {
 				zlog.Error().Err(err).Msgf("Could not allocate a free port for %s", nd.ID)
 				cancel()
 				return
 			}
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -120,20 +120,3 @@ type SystemInformationResponse struct {
 	Backends []string       `json:"backends"`
 	Models   []SysInfoModel `json:"loaded_models"`
 }
 type DetectionRequest struct {
 	BasicModelRequest
 	Image string `json:"image"`
 }
 type DetectionResponse struct {
 	Detections []Detection `json:"detections"`
 }
 type Detection struct {
 	X         float32 `json:"x"`
 	Y         float32 `json:"y"`
 	Width     float32 `json:"width"`
 	Height    float32 `json:"height"`
 	ClassName string  `json:"class_name"`
 }
--- a/core/schema/openai.go
+++ b/core/schema/openai.go
@@ -2,6 +2,7 @@ package schema
 import (
 	"context"
 	"time"
 	functions "github.com/mudler/LocalAI/pkg/functions"
 )
@@ -114,6 +115,37 @@ type OpenAIModel struct {
 	Object string `json:"object"`
 }
 // DeleteAssistantResponse carries an object ID, its object type and a deleted flag.
 // Presumably the body returned after deleting an assistant — confirm against the handler.
 type DeleteAssistantResponse struct {
 	ID      string `json:"id"`
 	Object  string `json:"object"`
 	Deleted bool   `json:"deleted"`
 }
 // File represents the structure of a file object from the OpenAI API.
 // NOTE(review): CreatedAt is a time.Time, which encoding/json serializes as an
 // RFC 3339 string; confirm clients do not expect a Unix timestamp here.
 type File struct {
 	ID        string    `json:"id"`         // Unique identifier for the file
 	Object    string    `json:"object"`     // Type of the object (e.g., "file")
 	Bytes     int       `json:"bytes"`      // Size of the file in bytes
 	CreatedAt time.Time `json:"created_at"` // The time at which the file was created
 	Filename  string    `json:"filename"`   // The name of the file
 	Purpose   string    `json:"purpose"`    // The purpose of the file (e.g., "fine-tune", "classifications", etc.)
 }
 // ListFiles is a collection of File entries together with an object type string.
 // NOTE(review): the fields have no json tags, so encoding/json emits the keys as
 // "Data" and "Object" (decoding is case-insensitive); confirm whether lowercase
 // keys were intended for API compatibility.
 type ListFiles struct {
 	Data   []File // The listed files
 	Object string // Type of the object
 }
 // AssistantFileRequest is a request body that references a file by its ID.
 // Presumably used when attaching a file to an assistant — confirm against the handler.
 type AssistantFileRequest struct {
 	FileID string `json:"file_id"`
 }
 // DeleteAssistantFileResponse carries an object ID, its object type and a deleted flag.
 // Presumably the body returned after detaching a file from an assistant — confirm against the handler.
 type DeleteAssistantFileResponse struct {
 	ID      string `json:"id"`
 	Object  string `json:"object"`
 	Deleted bool   `json:"deleted"`
 }
 type ImageGenerationResponseFormat string
 type ChatCompletionResponseFormatType string
--- a/core/services/backends.go
+++ b/core/services/backends.go
@@ -2,7 +2,7 @@ package services
 import (
 	"github.com/mudler/LocalAI/core/gallery"
-	"github.com/mudler/LocalAI/pkg/system"
+	"github.com/mudler/LocalAI/core/system"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
@@ -24,7 +24,6 @@ func (g *GalleryService) backendHandler(op *GalleryOp[gallery.GalleryBackend], s
 		g.modelLoader.DeleteExternalBackend(op.GalleryElementName)
 	} else {
 		log.Warn().Msgf("installing backend %s", op.GalleryElementName)
 		log.Debug().Msgf("backend galleries: %v", g.appConfig.BackendGalleries)
 		err = gallery.InstallBackendFromGallery(g.appConfig.BackendGalleries, systemState, op.GalleryElementName, g.appConfig.BackendsPath, progressCallback, true)
 		if err == nil {
 			err = gallery.RegisterBackends(g.appConfig.BackendsPath, g.modelLoader)
--- a/core/services/gallery.go
+++ b/core/services/gallery.go
@@ -7,8 +7,8 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/system"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/system"
 	"github.com/rs/zerolog/log"
 )
--- a/core/services/models.go
+++ b/core/services/models.go
@@ -7,7 +7,7 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
-	"github.com/mudler/LocalAI/pkg/system"
+	"github.com/mudler/LocalAI/core/system"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"gopkg.in/yaml.v2"
 )
--- a/core/system/capabilities.go
+++ b/core/system/capabilities.go
@@ -25,24 +25,20 @@ func (s *SystemState) Capability(capMap map[string]string) string {
 	// Check if the reported capability is in the map
 	if _, exists := capMap[reportedCapability]; exists {
 		log.Debug().Str("reportedCapability", reportedCapability).Any("capMap", capMap).Msg("Using reported capability")
 		return reportedCapability
 	}
 	log.Debug().Str("reportedCapability", reportedCapability).Any("capMap", capMap).Msg("The requested capability was not found, using default capability")
 	// Otherwise, return the default capability (catch-all)
 	return defaultCapability
 }
 func (s *SystemState) getSystemCapabilities() string {
 	if os.Getenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY") != "" {
 		log.Debug().Str("LOCALAI_FORCE_META_BACKEND_CAPABILITY", os.Getenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY")).Msg("Using forced capability")
 		return os.Getenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY")
 	}
 	capabilityRunFile := "/run/localai/capability"
 	if os.Getenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY_RUN_FILE") != "" {
 		log.Debug().Str("LOCALAI_FORCE_META_BACKEND_CAPABILITY_RUN_FILE", os.Getenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY_RUN_FILE")).Msg("Using forced capability run file")
 		capabilityRunFile = os.Getenv("LOCALAI_FORCE_META_BACKEND_CAPABILITY_RUN_FILE")
 	}
@@ -52,37 +48,31 @@ func (s *SystemState) getSystemCapabilities() string {
 	if _, err := os.Stat(capabilityRunFile); err == nil {
 		capability, err := os.ReadFile(capabilityRunFile)
 		if err == nil {
-			log.Debug().Str("capability", string(capability)).Msg("Using capability from run file")
+			return string(capability)
 			return strings.Trim(strings.TrimSpace(string(capability)), "\n")
 		}
 	}
 	// If we are on mac and arm64, we will return metal
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		log.Debug().Msg("Using metal capability")
 		return metal
 	}
 	// If we are on mac and x86, we will return darwin-x86
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "amd64" {
 		log.Debug().Msg("Using darwin-x86 capability")
 		return darwinX86
 	}
 	// If arm64 on linux and a nvidia gpu is detected, we will return nvidia-l4t
 	if runtime.GOOS == "linux" && runtime.GOARCH == "arm64" {
 		if s.GPUVendor == "nvidia" {
 			log.Debug().Msg("Using nvidia-l4t capability")
 			return nvidiaL4T
 		}
 	}
 	if s.GPUVendor == "" {
 		log.Debug().Msg("Using default capability")
 		return defaultCapability
 	}
 	log.Debug().Str("GPUVendor", s.GPUVendor).Msg("Using GPU vendor capability")
 	return s.GPUVendor
 }
--- a/docs/content/docs/features/GPU-acceleration.md
+++ b/docs/content/docs/features/GPU-acceleration.md
@@ -15,16 +15,6 @@ This section contains instruction on how to use LocalAI with GPU acceleration.
 Acceleration for AMD or Metal HW is still in development; for additional details see the [build]({{%relref "docs/getting-started/build#Acceleration" %}})
 {{% /alert %}}
 ## Automatic Backend Detection
 When you install a model from the gallery (or a YAML file), LocalAI intelligently detects the required backend and your system's capabilities, then downloads the correct version for you. Whether you're running on a standard CPU, an NVIDIA GPU, an AMD GPU, or an Intel GPU, LocalAI handles it automatically.
 For advanced use cases or to override auto-detection, you can use the `LOCALAI_FORCE_META_BACKEND_CAPABILITY` environment variable. Here are the available options:
 - `default`: Forces CPU-only backend. This is the fallback if no specific hardware is detected.
 - `nvidia`: Forces backends compiled with CUDA support for NVIDIA GPUs.
 - `amd`: Forces backends compiled with ROCm support for AMD GPUs.
 - `intel`: Forces backends compiled with SYCL/oneAPI support for Intel GPUs.
 ## Model configuration
@@ -81,8 +71,8 @@ To use CUDA, use the images with the `cublas` tag, for example.
 The image list is on [quay](https://quay.io/repository/go-skynet/local-ai?tab=tags):
- CUDA `11` tags: `master-gpu-nvidia-cuda-11`, `v1.40.0-gpu-nvidia-cuda-11`, ...
+- CUDA `11` tags: `master-gpu-nvidia-cuda11`, `v1.40.0-gpu-nvidia-cuda11`, ...
- CUDA `12` tags: `master-gpu-nvidia-cuda-12`, `v1.40.0-gpu-nvidia-cuda-12`, ...
+- CUDA `12` tags: `master-gpu-nvidia-cuda12`, `v1.40.0-gpu-nvidia-cuda12`, ...
 In addition to the commands to run LocalAI normally, you need to specify `--gpus all` to docker, for example:
--- a/docs/content/docs/features/backends.md
+++ b/docs/content/docs/features/backends.md
@@ -96,8 +96,8 @@ Your backend container should:
 For getting started, see the available backends in LocalAI here: https://github.com/mudler/LocalAI/tree/master/backend . 
 - For Python based backends there is a template that can be used as starting point: https://github.com/mudler/LocalAI/tree/master/backend/python/common/template . 
- For Golang based backends, you can see the `bark-cpp` backend as an example: https://github.com/mudler/LocalAI/tree/master/backend/go/bark-cpp
+- For Golang based backends, you can see the `bark-cpp` backend as an example: https://github.com/mudler/LocalAI/tree/master/backend/go/bark
- For C++ based backends, you can see the `llama-cpp` backend as an example: https://github.com/mudler/LocalAI/tree/master/backend/cpp/llama-cpp
+- For C++ based backends, you can see the `llama-cpp` backend as an example: https://github.com/mudler/LocalAI/tree/master/backend/cpp/llama
 ### Publishing Your Backend
--- a/docs/content/docs/features/object-detection.md
+++ b/docs/content/docs/features/object-detection.md
@@ -1,193 +0,0 @@
 +++
 disableToc = false
 title = "🔍 Object detection"
 weight = 13
 url = "/features/object-detection/"
 +++
 LocalAI supports object detection through various backends. This feature allows you to identify and locate objects within images with high accuracy and real-time performance. Currently, [RF-DETR](https://github.com/roboflow/rf-detr) is available as an implementation.
 ## Overview
 Object detection in LocalAI is implemented through dedicated backends that can identify and locate objects within images. Each backend provides different capabilities and model architectures.
 **Key Features:**
 - Real-time object detection
 - High accuracy detection with bounding boxes
 - Support for multiple hardware accelerators (CPU, NVIDIA GPU, Intel GPU, AMD GPU)
 - Structured detection results with confidence scores
 - Easy integration through the `/v1/detection` endpoint
 ## Usage
 ### Detection Endpoint
 LocalAI provides a dedicated `/v1/detection` endpoint for object detection tasks. This endpoint is specifically designed for object detection and returns structured detection results with bounding boxes and confidence scores.
 ### API Reference
 To perform object detection, send a POST request to the `/v1/detection` endpoint:
 ```bash
 curl -X POST http://localhost:8080/v1/detection \
  -H "Content-Type: application/json" \
  -d '{
    "model": "rfdetr-base",
    "image": "https://media.roboflow.com/dog.jpeg"
  }'
 ```
 ### Request Format
 The request body should contain:
 - `model`: The name of the object detection model (e.g., "rfdetr-base")
 - `image`: The image to analyze, which can be:
  - A URL to an image
  - A base64-encoded image
 ### Response Format
 The API returns a JSON response with detected objects:
 ```json
 {
  "detections": [
    {
      "x": 100.5,
      "y": 150.2,
      "width": 200.0,
      "height": 300.0,
      "confidence": 0.95,
      "class_name": "dog"
    },
    {
      "x": 400.0,
      "y": 200.0,
      "width": 150.0,
      "height": 250.0,
      "confidence": 0.87,
      "class_name": "person"
    }
  ]
 }
 ```
 Each detection includes:
 - `x`, `y`: Coordinates of the bounding box top-left corner
 - `width`, `height`: Dimensions of the bounding box
 - `confidence`: Detection confidence score (0.0 to 1.0)
 - `class_name`: The detected object class
 ## Backends
 ### RF-DETR Backend
 The RF-DETR backend is implemented as a Python-based gRPC service that integrates seamlessly with LocalAI. It provides object detection capabilities using the RF-DETR model architecture and supports multiple hardware configurations:
 - **CPU**: Optimized for CPU inference
 - **NVIDIA GPU**: CUDA acceleration for NVIDIA GPUs
 - **Intel GPU**: Intel oneAPI optimization
 - **AMD GPU**: ROCm acceleration for AMD GPUs
 - **NVIDIA Jetson**: Optimized for ARM64 NVIDIA Jetson devices
 #### Setup
 1. **Using the Model Gallery (Recommended)**
   The easiest way to get started is using the model gallery. The `rfdetr-base` model is available in the official LocalAI gallery:
   ```bash
   # Install and run the rfdetr-base model
   local-ai run rfdetr-base
   ```
   You can also install it through the web interface by navigating to the Models section and searching for "rfdetr-base".
 2. **Manual Configuration**
   Create a model configuration file in your `models` directory:
   ```yaml
   name: rfdetr
   backend: rfdetr
   parameters:
     model: rfdetr-base
   ```
 #### Available Models
 Currently, the following model is available in the [Model Gallery]({{%relref "docs/features/model-gallery" %}}):
 - **rfdetr-base**: Base model with balanced performance and accuracy
 You can browse and install this model through the LocalAI web interface or using the command line.
 ## Examples
 ### Basic Object Detection
 ```bash
 # Detect objects in an image from URL
 curl -X POST http://localhost:8080/v1/detection \
  -H "Content-Type: application/json" \
  -d '{
    "model": "rfdetr-base",
    "image": "https://example.com/image.jpg"
  }'
 ```
 ### Base64 Image Detection
 ```bash
 # Convert image to base64 and send
 base64_image=$(base64 -w 0 image.jpg)
 curl -X POST http://localhost:8080/v1/detection \
  -H "Content-Type: application/json" \
  -d "{
    \"model\": \"rfdetr-base\",
    \"image\": \"data:image/jpeg;base64,$base64_image\"
  }"
 ```
 ## Troubleshooting
 ### Common Issues
 1. **Model Loading Errors**
   - Ensure the model file is properly downloaded
   - Check available disk space
   - Verify model compatibility with your backend version
 2. **Low Detection Accuracy**
   - Ensure good image quality and lighting
   - Check if objects are clearly visible
   - Consider using a larger model for better accuracy
 3. **Slow Performance**
   - Enable GPU acceleration if available
   - Use a smaller model for faster inference
   - Optimize image resolution
 ### Debug Mode
 Enable debug logging for troubleshooting:
 ```bash
 local-ai run --debug rfdetr-base
 ```
 ## Object Detection Category
 LocalAI includes a dedicated **object-detection** category for models and backends that specialize in identifying and locating objects within images. This category currently includes:
 - **RF-DETR**: Real-time transformer-based object detection
 Additional object detection models and backends will be added to this category in the future. You can filter models by the `object-detection` tag in the model gallery to find all available object detection models.
 ## Related Features
 - [🎨 Image generation]({{%relref "docs/features/image-generation" %}}): Generate images with AI
 - [📖 Text generation]({{%relref "docs/features/text-generation" %}}): Generate text with language models
 - [🔍 GPT Vision]({{%relref "docs/features/gpt-vision" %}}): Analyze images with language models
 - [🚀 GPU acceleration]({{%relref "docs/features/GPU-acceleration" %}}): Optimize performance with GPU acceleration 
--- a/docs/content/docs/getting-started/build.md
+++ b/docs/content/docs/getting-started/build.md
@@ -9,11 +9,13 @@ ico = "rocket_launch"
 ### Build
-LocalAI can be built as a container image or as a single, portable binary. Note that some model architectures might require Python libraries, which are not included in the binary.
+LocalAI can be built as a container image or as a single, portable binary. Note that some model architectures might require Python libraries, which are not included in the binary. The binary contains only the core backends written in Go and C++. 
 LocalAI's extensible architecture allows you to add your own backends, which can be written in any language, and as such the container images contains also the Python dependencies to run all the available backends (for example, in order to run backends like __Diffusers__ that allows to generate images and videos from text).
-This section contains instructions on how to build LocalAI from source.
+In some cases you might want to re-build LocalAI from source (for instance to leverage Apple Silicon acceleration), or to build a custom container image with your own backends. This section contains instructions on how to build LocalAI from source.
 #### Build LocalAI locally
@@ -22,6 +24,7 @@ This section contains instructions on how to build LocalAI from source.
 In order to build LocalAI locally, you need the following requirements:
 - Golang >= 1.21
 - Cmake/make
 - GCC
 - GRPC
@@ -33,14 +36,20 @@ To install the dependencies follow the instructions below:
 Install `xcode` from the App Store
 ```bash
-brew install go protobuf protoc-gen-go protoc-gen-go-grpc wget
+brew install abseil cmake go grpc protobuf protoc-gen-go protoc-gen-go-grpc python wget
 ```
 After installing the above dependencies, you need to install grpcio-tools from PyPI. You could do this via a pip --user install or a virtualenv.
 ```bash
 pip install --user grpcio-tools
 ```
 {{% /tab %}}
 {{% tab tabName="Debian" %}}
 ```bash
-apt install golang make protobuf-compiler-grpc
+apt install cmake golang libgrpc-dev make protobuf-compiler-grpc python3-grpc-tools
 ```
 After you have golang installed and working, you can install the required binaries for compiling the golang protobuf components via the following commands
@@ -54,8 +63,10 @@ go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f1
 {{% /tab %}}
 {{% tab tabName="From source" %}}
 Specify `BUILD_GRPC_FOR_BACKEND_LLAMA=true` to build automatically the gRPC dependencies
 ```bash
-make build
+make ... BUILD_GRPC_FOR_BACKEND_LLAMA=true build
 ```
 {{% /tab %}}
@@ -72,6 +83,36 @@ make build
 This should produce the binary `local-ai`
 Here is the list of the variables available that can be used to customize the build:
 | Variable | Default | Description |
 | ---------------------| ------- | ----------- |
 | `BUILD_TYPE`         |   None      | Build type. Available: `cublas`, `openblas`, `clblas`, `metal`,`hipblas`, `sycl_f16`, `sycl_f32` |
 | `GO_TAGS`            |   `tts stablediffusion`      | Go tags. Available: `stablediffusion`, `tts` |
 | `CLBLAST_DIR`        |         | Specify a CLBlast directory |
 | `CUDA_LIBPATH`       |         | Specify a CUDA library path |
 | `BUILD_API_ONLY` | false | Set to true to build only the API (no backends will be built) |
 {{% alert note %}}
 #### CPU flagset compatibility
 LocalAI uses different backends based on ggml and llama.cpp to run models. If your CPU doesn't support common instruction sets, you can disable them during build:
 ```
 CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" make build
 ```
 To have effect on the container image, you need to set `REBUILD=true`:
 ```
 docker run  quay.io/go-skynet/localai
 docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -e REBUILD=true -e CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" -v $PWD/models:/models quay.io/go-skynet/local-ai:latest
 ```
 {{% /alert %}}
 #### Container image
 Requirements:
@@ -112,9 +153,6 @@ wget https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q2_K.gguf -O
 # Use a template from the examples
 cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/phi-2.Q2_K.tmpl
 # Install the llama-cpp backend
 ./local-ai backends install llama-cpp
 # Run LocalAI
 ./local-ai --models-path=./models/ --debug=true
@@ -148,53 +186,131 @@ sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
 ```
 # reinstall build dependencies
-brew reinstall go grpc protobuf wget
+brew reinstall abseil cmake go grpc protobuf wget
 make clean
 make build
 ```
-## Build backends
+**Requirements**: OpenCV, Gomp
-LocalAI have several backends available for installation in the backend gallery. The backends can be also built by source. As backends might vary from language and dependencies that they require, the documentation will provide generic guidance for few of the backends, which can be applied with some slight modifications also to the others.
+Image generation requires `GO_TAGS=stablediffusion` to be set during build:
 ### Manually
 Typically each backend include a Makefile which allow to package the backend.
 In the LocalAI repository, for instance you can build `bark-cpp` by doing:
 ```
-git clone https://github.com/go-skynet/LocalAI.git
+make GO_TAGS=stablediffusion build
 # Build the bark-cpp backend (requires cmake)
 make -C LocalAI/backend/go/bark-cpp build package
 # Build vllm backend (requires python)
 make -C LocalAI/backend/python/vllm
 ```
-### With Docker
+### Build with Text to audio support
-Building with docker is simpler as abstracts away all the requirement, and focuses on building the final OCI images that are available in the gallery. This allows for instance also to build locally a backend and install it with LocalAI. You can refer to [Backends](https://localai.io/backends/) for general guidance on how to install and develop backends.
+**Requirements**: piper-phonemize
-In the LocalAI repository, you can build `bark-cpp` by doing:
+Text to audio support is experimental and requires `GO_TAGS=tts` to be set during build:
 ```
-git clone https://github.com/go-skynet/LocalAI.git
+make GO_TAGS=tts build
 # Build the bark-cpp backend (requires docker)
 make docker-build-bark-cpp
 ```
-Note that `make` is only by convenience, in reality it just runs a simple `docker` command as:
+### Acceleration
 #### OpenBLAS
 Software acceleration.
 Requirements: OpenBLAS
 ```
 make BUILD_TYPE=openblas build
 ```
 #### CuBLAS
 Nvidia Acceleration.
 Requirement: Nvidia CUDA toolkit
 Note: CuBLAS support is experimental and has not been tested on real hardware. Please report any issues you find!
 ```
 make BUILD_TYPE=cublas build
 ```
 More information is available in the upstream PR: https://github.com/ggerganov/llama.cpp/pull/1412
 #### Hipblas (AMD GPU with ROCm on Arch Linux)
 Packages:
 ```
 pacman -S base-devel git rocm-hip-sdk rocm-opencl-sdk opencv clblast grpc
 ```
 Library links:
 ```
 export CGO_CFLAGS="-I/usr/include/opencv4"
 export CGO_CXXFLAGS="-I/usr/include/opencv4"
 export CGO_LDFLAGS="-L/opt/rocm/hip/lib -lamdhip64 -L/opt/rocm/lib -lOpenCL -L/usr/lib -lclblast -lrocblas -lhipblas -lrocrand -lomp -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link"
 ```
 Build:
 ```
 make BUILD_TYPE=hipblas GPU_TARGETS=gfx1030
 ```
 #### ClBLAS
 AMD/Intel GPU acceleration.
 Requirement: OpenCL, CLBlast
 ```
 make BUILD_TYPE=clblas build
 ```
 To specify a clblast dir set: `CLBLAST_DIR`
 #### Intel GPU acceleration
 Intel GPU acceleration is supported via SYCL.
 Requirements: [Intel oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html) (see also [llama.cpp setup installations instructions](https://github.com/ggerganov/llama.cpp/blob/d71ac90985854b0905e1abba778e407e17f9f887/README-sycl.md?plain=1#L56))
 ```
 make BUILD_TYPE=sycl_f16 build # for float16
 make BUILD_TYPE=sycl_f32 build # for float32
 ```
 #### Metal (Apple Silicon)
 ```
 make build
 # correct build type is automatically used on mac (BUILD_TYPE=metal)
 # Set `gpu_layers: 256` (or equal to the number of model layers) to your YAML model config file and `f16: true`
 ```
 ### Windows compatibility
 Make sure to give enough resources to the running container. See https://github.com/go-skynet/LocalAI/issues/2
 ### Examples
 More advanced build options are available, for instance to build only a single backend.
 #### Build only a single backend
 You can control the backends that are built by setting the `GRPC_BACKENDS` environment variable. For instance, to build only the `llama-cpp` backend:
 ```bash
-docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:bark-cpp -f LocalAI/backend/Dockerfile.golang --build-arg BACKEND=bark-cpp .               
+make GRPC_BACKENDS=backend-assets/grpc/llama-cpp build
 ```
-Note:
+By default, all the backends are built.
- BUILD_TYPE can be either: `cublas`, `hipblas`, `sycl_f16`, `sycl_f32`, `metal`.
+#### Specific llama.cpp version
- BASE_IMAGE is tested on `ubuntu:22.04` (and defaults to it)
+
 To build with a specific version of llama.cpp, set `CPPLLAMA_VERSION` to the tag or wanted sha:
 ```
 CPPLLAMA_VERSION=<sha> make build
 ```
--- a/docs/content/docs/getting-started/container-images.md
+++ b/docs/content/docs/getting-started/container-images.md
@@ -163,9 +163,9 @@ Standard container images do not have pre-installed models.
 | Description | Quay | Docker Hub                                                  |
 | --- | --- |-------------------------------------------------------------|
-| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-gpu-nvidia-cuda-11` | `localai/localai:master-gpu-nvidia-cuda-11`                      |
+| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-gpu-nvidia-cuda11` | `localai/localai:master-gpu-nvidia-cuda11`                      |
 | Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-11` | `localai/localai:latest-gpu-nvidia-cuda-11`                      |
-| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-gpu-nvidia-cuda-11` | `localai/localai:{{< version >}}-gpu-nvidia-cuda-11`             |
+| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-gpu-nvidia-cuda11` | `localai/localai:{{< version >}}-gpu-nvidia-cuda11`             |
 {{% /tab %}}
@@ -173,9 +173,9 @@ Standard container images do not have pre-installed models.
 | Description | Quay | Docker Hub                                                  |
 | --- | --- |-------------------------------------------------------------|
-| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-gpu-nvidia-cuda-12` | `localai/localai:master-gpu-nvidia-cuda-12`                      |
+| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-gpu-nvidia-cuda12` | `localai/localai:master-gpu-nvidia-cuda12`                      |
 | Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-12` | `localai/localai:latest-gpu-nvidia-cuda-12`                 |
-| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-gpu-nvidia-cuda-12` | `localai/localai:{{< version >}}-gpu-nvidia-cuda-12`             |
+| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-gpu-nvidia-cuda12` | `localai/localai:{{< version >}}-gpu-nvidia-cuda12`             |
 {{% /tab %}}
--- a/docs/content/docs/getting-started/quickstart.md
+++ b/docs/content/docs/getting-started/quickstart.md
@@ -106,9 +106,6 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```
 {{% alert icon="⚡" %}}
 **Automatic Backend Detection**: When you install models from the gallery or YAML files, LocalAI automatically detects your system's GPU capabilities (NVIDIA, AMD, Intel) and downloads the appropriate backend. For advanced configuration options, see [GPU Acceleration]({{% relref "docs/features/gpu-acceleration#automatic-backend-detection" %}}).
 {{% /alert %}}
 For a full list of options, refer to the [Installer Options]({{% relref "docs/advanced/installer" %}}) documentation.
@@ -157,7 +154,7 @@ For instructions on using AIO images, see [Using container images]({{% relref "d
 LocalAI is part of the Local family stack, along with LocalAGI and LocalRecall.
-[LocalAGI](https://github.com/mudler/LocalAGI) is a powerful, self-hostable AI Agent platform designed for maximum privacy and flexibility which encompassess and uses all the software stack. It provides a complete drop-in replacement for OpenAI's Responses APIs with advanced agentic capabilities, working entirely locally on consumer-grade hardware (CPU and GPU).
+[LocalAGI](https://github.com/mudler/LocalAGI) is a powerful, self-hostable AI Agent platform designed for maximum privacy and flexibility which encompasses and uses all the software stack. It provides a complete drop-in replacement for OpenAI's Responses APIs with advanced agentic capabilities, working entirely locally on consumer-grade hardware (CPU and GPU).
 ### Quick Start
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v3.2.3"
+  "version": "v3.1.1"
 }
--- a/docs/static/install.sh
+++ b/docs/static/install.sh
@@ -660,7 +660,7 @@ install_docker() {
    IMAGE_TAG=
    if [ "$USE_VULKAN" = true ]; then
-        IMAGE_TAG=${LOCALAI_VERSION}-gpu-vulkan
+        IMAGE_TAG=${LOCALAI_VERSION}-vulkan
        info "Starting LocalAI Docker container..."
        $SUDO docker run -v local-ai-data:/models \
@@ -672,7 +672,7 @@ install_docker() {
            -d -p $PORT:8080 --name local-ai localai/localai:$IMAGE_TAG $STARTCOMMAND
    elif [ "$HAS_CUDA" ]; then
        # Default to CUDA 12
-        IMAGE_TAG=${LOCALAI_VERSION}-gpu-nvidia-cuda-12
+        IMAGE_TAG=${LOCALAI_VERSION}-gpu-nvidia-cuda12
        # AIO
        if [ "$USE_AIO" = true ]; then
            IMAGE_TAG=${LOCALAI_VERSION}-aio-gpu-nvidia-cuda-12
@@ -757,7 +757,7 @@ install_binary_darwin() {
    [ "$(uname -s)" = "Darwin" ] || fatal 'This script is intended to run on macOS only.'
    info "Downloading LocalAI ${LOCALAI_VERSION}..."
-    curl --fail --show-error --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${LOCALAI_VERSION}/local-ai-${LOCALAI_VERSION}-darwin-${ARCH}"
+    curl --fail --show-error --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${LOCALAI_VERSION}/local-ai-Darwin-${ARCH}"
    info "Installing to /usr/local/bin/local-ai"
    install -o0 -g0 -m755 $TEMP_DIR/local-ai /usr/local/bin/local-ai
@@ -789,7 +789,7 @@ install_binary() {
    fi
    info "Downloading LocalAI ${LOCALAI_VERSION}..."
-    curl --fail --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${LOCALAI_VERSION}/local-ai-${LOCALAI_VERSION}-linux-${ARCH}"
+    curl --fail --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${LOCALAI_VERSION}/local-ai-Linux-${ARCH}"
    for BINDIR in /usr/local/bin /usr/bin /bin; do
        echo $PATH | grep -q $BINDIR && break || continue
@@ -868,7 +868,7 @@ OS="$(uname -s)"
 ARCH=$(uname -m)
 case "$ARCH" in
-    x86_64) ARCH="amd64" ;;
+    x86_64) ARCH="x86_64" ;;
    aarch64|arm64) ARCH="arm64" ;;
    *) fatal "Unsupported architecture: $ARCH" ;;
 esac
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,26 +1,4 @@
 ---
 - &rfdetr
  name: "rfdetr-base"
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
  icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4
  license: apache-2.0
  description: |
    RF-DETR is a real-time, transformer-based object detection model architecture developed by Roboflow and released under the Apache 2.0 license.
    RF-DETR is the first real-time model to exceed 60 AP on the Microsoft COCO benchmark alongside competitive performance at base sizes. It also achieves state-of-the-art performance on RF100-VL, an object detection benchmark that measures model domain adaptability to real world problems. RF-DETR is fastest and most accurate for its size when compared current real-time objection models.
    RF-DETR is small enough to run on the edge using Inference, making it an ideal model for deployments that need both strong accuracy and real-time performance.
  tags:
    - object-detection
    - rfdetr
    - gpu
    - cpu
  urls:
    - https://github.com/roboflow/rf-detr
  overrides:
    backend: rfdetr
    parameters:
      model: rfdetr-base
    known_usecases:
      - detection
 - name: "dream-org_dream-v0-instruct-7b"
  # chatml
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
--- a/pkg/grpc/backend.go
+++ b/pkg/grpc/backend.go
@@ -9,7 +9,7 @@ import (
 var embeds = map[string]*embedBackend{}
-func Provide(addr string, llm AIModel) {
+func Provide(addr string, llm LLM) {
 	embeds[addr] = &embedBackend{s: &server{llm: llm}}
 }
@@ -42,7 +42,6 @@ type Backend interface {
 	GenerateVideo(ctx context.Context, in *pb.GenerateVideoRequest, opts ...grpc.CallOption) (*pb.Result, error)
 	TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOption) (*pb.Result, error)
 	SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequest, opts ...grpc.CallOption) (*pb.Result, error)
 	Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.CallOption) (*pb.DetectResponse, error)
 	AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*pb.TranscriptResult, error)
 	TokenizeString(ctx context.Context, in *pb.PredictOptions, opts ...grpc.CallOption) (*pb.TokenizationResponse, error)
 	Status(ctx context.Context) (*pb.StatusResponse, error)
--- a/pkg/grpc/base/base.go
+++ b/pkg/grpc/base/base.go
@@ -69,10 +69,6 @@ func (llm *Base) SoundGeneration(*pb.SoundGenerationRequest) error {
 	return fmt.Errorf("unimplemented")
 }
 func (llm *Base) Detect(*pb.DetectOptions) (pb.DetectResponse, error) {
 	return pb.DetectResponse{}, fmt.Errorf("unimplemented")
 }
 func (llm *Base) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
 	return pb.TokenizationResponse{}, fmt.Errorf("unimplemented")
 }
--- a/pkg/grpc/client.go
+++ b/pkg/grpc/client.go
@@ -504,25 +504,3 @@ func (c *Client) VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOp
 	client := pb.NewBackendClient(conn)
 	return client.VAD(ctx, in, opts...)
 }
 func (c *Client) Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.CallOption) (*pb.DetectResponse, error) {
 	if !c.parallel {
 		c.opMutex.Lock()
 		defer c.opMutex.Unlock()
 	}
 	c.setBusy(true)
 	defer c.setBusy(false)
 	c.wdMark()
 	defer c.wdUnMark()
 	conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
 		grpc.WithDefaultCallOptions(
 			grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
 			grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
 		))
 	if err != nil {
 		return nil, err
 	}
 	defer conn.Close()
 	client := pb.NewBackendClient(conn)
 	return client.Detect(ctx, in, opts...)
 }
--- a/pkg/grpc/embed.go
+++ b/pkg/grpc/embed.go
@@ -59,10 +59,6 @@ func (e *embedBackend) SoundGeneration(ctx context.Context, in *pb.SoundGenerati
 	return e.s.SoundGeneration(ctx, in)
 }
 func (e *embedBackend) Detect(ctx context.Context, in *pb.DetectOptions, opts ...grpc.CallOption) (*pb.DetectResponse, error) {
 	return e.s.Detect(ctx, in)
 }
 func (e *embedBackend) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest, opts ...grpc.CallOption) (*pb.TranscriptResult, error) {
 	return e.s.AudioTranscription(ctx, in)
 }
--- a/pkg/grpc/interface.go
+++ b/pkg/grpc/interface.go
@@ -4,7 +4,7 @@ import (
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
-type AIModel interface {
+type LLM interface {
 	Busy() bool
 	Lock()
 	Unlock()
@@ -15,7 +15,6 @@ type AIModel interface {
 	Embeddings(*pb.PredictOptions) ([]float32, error)
 	GenerateImage(*pb.GenerateImageRequest) error
 	GenerateVideo(*pb.GenerateVideoRequest) error
 	Detect(*pb.DetectOptions) (pb.DetectResponse, error)
 	AudioTranscription(*pb.TranscriptRequest) (pb.TranscriptResult, error)
 	TTS(*pb.TTSRequest) error
 	SoundGeneration(*pb.SoundGenerationRequest) error
--- a/pkg/grpc/server.go
+++ b/pkg/grpc/server.go
@@ -22,7 +22,7 @@ import (
 // server is used to implement helloworld.GreeterServer.
 type server struct {
 	pb.UnimplementedBackendServer
-	llm AIModel
+	llm LLM
 }
 func (s *server) Health(ctx context.Context, in *pb.HealthMessage) (*pb.Reply, error) {
@@ -111,18 +111,6 @@ func (s *server) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequ
 	return &pb.Result{Message: "Sound Generation audio generated", Success: true}, nil
 }
 func (s *server) Detect(ctx context.Context, in *pb.DetectOptions) (*pb.DetectResponse, error) {
 	if s.llm.Locking() {
 		s.llm.Lock()
 		defer s.llm.Unlock()
 	}
 	res, err := s.llm.Detect(in)
 	if err != nil {
 		return nil, err
 	}
 	return &res, nil
 }
 func (s *server) AudioTranscription(ctx context.Context, in *pb.TranscriptRequest) (*pb.TranscriptResult, error) {
 	if s.llm.Locking() {
 		s.llm.Lock()
@@ -263,7 +251,7 @@ func (s *server) VAD(ctx context.Context, in *pb.VADRequest) (*pb.VADResponse, e
 	return &res, nil
 }
-func StartServer(address string, model AIModel) error {
+func StartServer(address string, model LLM) error {
 	lis, err := net.Listen("tcp", address)
 	if err != nil {
 		return err
@@ -281,7 +269,7 @@ func StartServer(address string, model AIModel) error {
 	return nil
 }
-func RunServer(address string, model AIModel) (func() error, error) {
+func RunServer(address string, model LLM) (func() error, error) {
 	lis, err := net.Listen("tcp", address)
 	if err != nil {
 		return nil, err
--- a/core/startup/backend_preload.go
+++ b/core/startup/backend_preload.go
@@ -8,8 +8,8 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/system"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/system"
 	"github.com/rs/zerolog/log"
 )
--- a/core/startup/model_preload.go
+++ b/core/startup/model_preload.go
@@ -10,8 +10,8 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/system"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	"github.com/mudler/LocalAI/pkg/system"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
 	"gopkg.in/yaml.v2"
--- a/core/startup/model_preload_test.go
+++ b/core/startup/model_preload_test.go
@@ -6,7 +6,7 @@ import (
 	"path/filepath"
 	"github.com/mudler/LocalAI/core/config"
-	. "github.com/mudler/LocalAI/core/startup"
+	. "github.com/mudler/LocalAI/pkg/startup"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
--- a/core/startup/startup_suite_test.go
+++ b/core/startup/startup_suite_test.go
--- a/core/templates/cache.go
+++ b/core/templates/cache.go
--- a/core/templates/evaluator.go
+++ b/core/templates/evaluator.go
--- a/core/templates/evaluator_test.go
+++ b/core/templates/evaluator_test.go
@@ -3,8 +3,8 @@ package templates_test
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	. "github.com/mudler/LocalAI/core/templates"
 	"github.com/mudler/LocalAI/pkg/functions"
 	. "github.com/mudler/LocalAI/pkg/templates"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
--- a/core/templates/multimodal.go
+++ b/core/templates/multimodal.go
--- a/core/templates/multimodal_test.go
+++ b/core/templates/multimodal_test.go
@@ -1,7 +1,7 @@
 package templates_test
 import (
-	. "github.com/mudler/LocalAI/core/templates" // Update with your module path
+	. "github.com/mudler/LocalAI/pkg/templates" // Update with your module path
 	// Update with your module path
 	. "github.com/onsi/ginkgo/v2"
--- a/core/templates/templates_suite_test.go
+++ b/core/templates/templates_suite_test.go
--- a/pkg/utils/base64.go
+++ b/pkg/utils/base64.go
@@ -20,7 +20,7 @@ var dataURIPattern = regexp.MustCompile(`^data:([^;]+);base64,`)
 // GetContentURIAsBase64 checks if the string is an URL, if it's an URL downloads the content in memory encodes it in base64 and returns the base64 string, otherwise returns the string by stripping base64 data headers
 func GetContentURIAsBase64(s string) (string, error) {
-	if strings.HasPrefix(s, "http") || strings.HasPrefix(s, "https") {
+	if strings.HasPrefix(s, "http") {
 		// download the image
 		resp, err := base64DownloadClient.Get(s)
 		if err != nil {
--- a/pkg/utils/config.go
+++ b/pkg/utils/config.go
@@ -0,0 +1,42 @@
 package utils
 import (
 	"encoding/json"
 	"os"
 	"path/filepath"
 	"github.com/rs/zerolog/log"
 )
 // SaveConfig serializes obj as indented JSON and writes it to fileName inside
 // filePath with 0600 permissions.
 //
 // Failures are logged rather than returned. If obj cannot be marshaled the
 // function returns without touching the file, so an existing configuration is
 // not overwritten with empty content.
 func SaveConfig(filePath, fileName string, obj any) {
 	data, err := json.MarshalIndent(obj, "", " ")
 	if err != nil {
 		log.Error().Err(err).Msg("failed to JSON marshal the configuration")
 		// Nothing valid to write; bail out instead of truncating the file.
 		return
 	}
 	absolutePath := filepath.Join(filePath, fileName)
 	if err := os.WriteFile(absolutePath, data, 0600); err != nil {
 		log.Error().Err(err).Str("filepath", absolutePath).Msg("failed to save configuration file")
 	}
 }
 // LoadConfig reads the JSON file fileName inside filePath and decodes it into
 // obj, which must be a non-nil pointer to the destination value.
 //
 // A missing file is not treated as an error: it is reported at debug level and
 // obj is left untouched. Read and parse failures are logged at error level;
 // nothing is returned to the caller.
 func LoadConfig(filePath, fileName string, obj any) {
 	configPath := filepath.Join(filePath, fileName)
 	data, err := os.ReadFile(configPath)
 	if os.IsNotExist(err) {
 		log.Debug().Msgf("No configuration file found at %s", configPath)
 		return
 	}
 	if err != nil {
 		log.Error().Err(err).Str("filepath", configPath).Msg("failed to read file")
 		return
 	}
 	// Decode into obj itself, not &obj: a pointer to the local interface
 	// variable would let a non-pointer or nil obj be silently decoded into a
 	// throwaway value instead of surfacing an error.
 	if err := json.Unmarshal(data, obj); err != nil {
 		log.Error().Err(err).Str("filepath", configPath).Msg("failed to parse file as JSON")
 	}
 }