chore: ⬆️ Update ggml-org/llama.cpp to daf2dd788066b8b239cb7f68210e090c2124c199 (#5951 )

⬆️ Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
feat(swagger): update swagger (#5950 )
2026-02-03 11:13:31 -05:00 · 2025-08-01 08:25:36 +02:00 · 2025-07-31 21:04:23 +00:00 · 2025-07-31 19:44:46 +02:00 · 2025-07-31 18:32:06 +02:00 · 2025-07-31 17:23:48 +02:00
274 changed files with 8915 additions and 6463 deletions
--- a/.devcontainer-scripts/poststart.sh
+++ b/.devcontainer-scripts/poststart.sh
@@ -2,9 +2,6 @@

 cd /workspace

-# Grab the pre-stashed backend assets to avoid build issues
-cp -r /build/backend-assets /workspace/backend-assets
-
 # Ensures generated source files are present upon load
 make prepare

--- a/.devcontainer/docker-compose-devcontainer.yml
+++ b/.devcontainer/docker-compose-devcontainer.yml
@@ -4,9 +4,6 @@ services:
      context: ..
      dockerfile: Dockerfile
      target: devcontainer
-      args:
-      - FFMPEG=true
-      - GO_TAGS=p2p tts
    env_file:
      - ../.env
    ports:
--- a/.dockerignore
+++ b/.dockerignore
@@ -3,7 +3,9 @@
 .vscode
 .devcontainer
 models
+backends
 examples/chatbot-ui/models
+backend/go/image/stablediffusion-ggml/build/
 examples/rwkv/models
 examples/**/models
 Dockerfile*
@@ -14,4 +16,4 @@ __pycache__

 # backend virtual environments
 **/venv
-backend/python/**/source
+backend/python/**/source
--- a/.env
+++ b/.env
@@ -41,13 +41,6 @@
 ## Uncomment and set to true to enable rebuilding from source
 # REBUILD=true

-## Enable go tags, available: p2p, tts
-## p2p: enable distributed inferencing
-## tts: enables text-to-speech with go-piper 
-## (requires REBUILD=true)
-#
-# GO_TAGS=p2p
-
 ## Path where to store generated images
 # LOCALAI_IMAGE_PATH=/tmp/generated/images

--- a/.github/bump_deps.sh
+++ b/.github/bump_deps.sh
@@ -3,15 +3,20 @@ set -xe
 REPO=$1
 BRANCH=$2
 VAR=$3
+FILE=$4
+
+if [ -z "$FILE" ]; then
+    FILE="Makefile"
+fi

 LAST_COMMIT=$(curl -s -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")

 # Read $VAR from Makefile (only first match)
 set +e
-CURRENT_COMMIT="$(grep -m1 "^$VAR?=" Makefile | cut -d'=' -f2)"
+CURRENT_COMMIT="$(grep -m1 "^$VAR?=" $FILE | cut -d'=' -f2)"
 set -e

-sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
+sed -i $FILE -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"

 if [ -z "$CURRENT_COMMIT" ]; then
    echo "Could not find $VAR in Makefile."
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
--- a/.github/workflows/backend_build.yml
+++ b/.github/workflows/backend_build.yml
@@ -28,10 +28,6 @@ on:
        description: 'Tag latest'
        default: ''
        type: string
-      latest-image:
-        description: 'Tag latest'
-        default: ''
-        type: string
      tag-suffix:
        description: 'Tag suffix'
        default: ''
@@ -53,6 +49,10 @@ on:
        description: 'Build Dockerfile'
        required: true
        type: string
+      skip-drivers:
+        description: 'Skip drivers'
+        default: 'false'
+        type: string
    secrets:
      dockerUsername:
        required: true
@@ -64,7 +64,7 @@ on:
        required: true

 jobs:
-  reusable_python_backend-build:
+  backend-build:
    runs-on: ${{ inputs.runs-on }}
    steps:

@@ -153,7 +153,7 @@ jobs:
            type=sha
          flavor: |
            latest=${{ inputs.tag-latest }}
-            suffix=${{ inputs.tag-suffix }}
+            suffix=${{ inputs.tag-suffix }},onlatest=true

      - name: Docker meta for PR
        id: meta_pull_request
@@ -168,7 +168,7 @@ jobs:
            type=sha,suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
          flavor: |
            latest=${{ inputs.tag-latest }}
-            suffix=${{ inputs.tag-suffix }}
+            suffix=${{ inputs.tag-suffix }},onlatest=true
 ## End testing image
      - name: Set up QEMU
        uses: docker/setup-qemu-action@master
@@ -201,16 +201,16 @@ jobs:
          builder: ${{ steps.buildx.outputs.name }}
          build-args: |
            BUILD_TYPE=${{ inputs.build-type }}
+            SKIP_DRIVERS=${{ inputs.skip-drivers }}
            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
            BASE_IMAGE=${{ inputs.base-image }}
            BACKEND=${{ inputs.backend }}
-          context: ./backend
-          file: ./backend/Dockerfile.python
+          context: ${{ inputs.context }}
+          file: ${{ inputs.dockerfile }}
          cache-from: type=gha
          platforms: ${{ inputs.platforms }}
          push: ${{ github.event_name != 'pull_request' }}
-          load: ${{ github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}

@@ -221,30 +221,20 @@ jobs:
          builder: ${{ steps.buildx.outputs.name }}
          build-args: |
            BUILD_TYPE=${{ inputs.build-type }}
+            SKIP_DRIVERS=${{ inputs.skip-drivers }}
            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
            BASE_IMAGE=${{ inputs.base-image }}
            BACKEND=${{ inputs.backend }}
-          context: ./backend
-          file: ./backend/Dockerfile.python
+          context: ${{ inputs.context }}
+          file: ${{ inputs.dockerfile }}
          cache-from: type=gha
          platforms: ${{ inputs.platforms }}
          push: true
          tags: ${{ steps.meta_pull_request.outputs.tags }}
          labels: ${{ steps.meta_pull_request.outputs.labels }}

-      - name: Cleanup
-        run: |
-          docker builder prune -f
-          docker system prune --force --volumes --all

-      - name: Latest tag
-        if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
-        run: |
-          docker tag localai/localai-backends:${{ steps.meta.outputs.version }} localai/localai-backends:${{ inputs.latest-image }}
-          docker push localai/localai-backends:${{ inputs.latest-image }}
-          docker tag quay.io/go-skynet/local-ai-backends:${{ steps.meta.outputs.version }} quay.io/go-skynet/local-ai-backends:${{ inputs.latest-image }}
-          docker push quay.io/go-skynet/local-ai-backends:${{ inputs.latest-image }}

      - name: job summary
        run: |
--- a/.github/workflows/build-test.yaml
+++ b/.github/workflows/build-test.yaml
@@ -0,0 +1,23 @@
+name: Build test
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+
+jobs:
+  build-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: 1.23
+      - name: Run GoReleaser
+        run: |
+          make dev-dist
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -10,30 +10,32 @@ jobs:
      matrix:
        include:
          - repository: "ggml-org/llama.cpp"
-            variable: "CPPLLAMA_VERSION"
+            variable: "LLAMA_VERSION"
            branch: "master"
+            file: "backend/cpp/llama-cpp/Makefile"
          - repository: "ggml-org/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
+            file: "backend/go/whisper/Makefile"
          - repository: "PABannier/bark.cpp"
            variable: "BARKCPP_VERSION"
            branch: "main"
+            file: "Makefile"
          - repository: "leejet/stable-diffusion.cpp"
            variable: "STABLEDIFFUSION_GGML_VERSION"
            branch: "master"
-          - repository: "mudler/go-stable-diffusion"
-            variable: "STABLEDIFFUSION_VERSION"
-            branch: "master"
+            file: "backend/go/stablediffusion-ggml/Makefile"
          - repository: "mudler/go-piper"
            variable: "PIPER_VERSION"
            branch: "master"
+            file: "backend/go/piper/Makefile"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Bump dependencies 🔧
        id: bump
        run: |
-          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
+          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }} ${{ matrix.file }}
          {
            echo 'message<<EOF'
            cat "${{ matrix.variable }}_message.txt"
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -20,7 +20,6 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install -y pip wget
-          sudo pip install --upgrade pip
          pip install huggingface_hub
      - name: 'Setup yq'
        uses: dcarbone/install-yq-action@v1.3.1
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -31,7 +31,7 @@ jobs:
          make protogen-go
      - name: Build api
        run: |
-          CGO_ENABLED=0 make build-api
+          CGO_ENABLED=0 make build
      - name: rm
        uses: appleboy/ssh-action@v1.2.2
        with:
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -15,7 +15,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2025.2.0-0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -14,7 +14,6 @@ jobs:
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -40,8 +39,7 @@ jobs:
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-gpu-nvidia-cuda12-ffmpeg'
-            ffmpeg: 'true'
+            tag-suffix: '-gpu-nvidia-cuda-12'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
@@ -49,25 +47,22 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'sycl_f16'
+          - build-type: 'sycl'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: 'sycl-f16-ffmpeg'
-            ffmpeg: 'true'
+            tag-suffix: 'sycl'
            runs-on: 'ubuntu-latest'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-vulkan-ffmpeg-core'
-            ffmpeg: 'true'
+            tag-suffix: '-vulkan-core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -18,7 +18,6 @@ jobs:
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -38,9 +37,8 @@ jobs:
        include:
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas'
-            ffmpeg: 'true'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-hipblas'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
@@ -52,7 +50,6 @@ jobs:
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -76,7 +73,6 @@ jobs:
            platforms: 'linux/amd64,linux/arm64'
            tag-latest: 'auto'
            tag-suffix: ''
-            ffmpeg: 'true'
            base-image: "ubuntu:22.04"
            runs-on: 'ubuntu-latest'
            aio: "-aio-cpu"
@@ -86,9 +82,8 @@ jobs:
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-gpu-nvidia-cuda11'
-            ffmpeg: 'true'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-11'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
@@ -98,9 +93,8 @@ jobs:
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-gpu-nvidia-cuda12'
-            ffmpeg: 'true'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-12'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
@@ -108,41 +102,28 @@ jobs:
            aio: "-aio-gpu-nvidia-cuda-12"
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-vulkan'
-            ffmpeg: 'true'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-vulkan'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
            aio: "-aio-gpu-vulkan"
-          - build-type: 'sycl_f16'
+          - build-type: 'sycl'
            platforms: 'linux/amd64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-gpu-intel-f16'
-            ffmpeg: 'true'
+            tag-suffix: '-gpu-intel'
            runs-on: 'ubuntu-latest'
            makeflags: "--jobs=3 --output-sync=target"
-            aio: "-aio-gpu-intel-f16"
-          - build-type: 'sycl_f32'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
-            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-gpu-intel-f32'
-            ffmpeg: 'true'
-            runs-on: 'ubuntu-latest'
-            makeflags: "--jobs=3 --output-sync=target"
-            aio: "-aio-gpu-intel-f32"
+            aio: "-aio-gpu-intel"

  gh-runner:
    uses: ./.github/workflows/image_build.yml
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -165,9 +146,8 @@ jobs:
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/arm64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
            tag-suffix: '-nvidia-l4t-arm64'
-            ffmpeg: 'true'
            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
            runs-on: 'ubuntu-24.04-arm'
            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -37,10 +37,6 @@ on:
        description: 'Tag suffix'
        default: ''
        type: string
-      ffmpeg:
-        description: 'FFMPEG'
-        default: ''
-        type: string
      skip-drivers:
        description: 'Skip drivers by default'
        default: 'false'
@@ -236,7 +232,6 @@ jobs:
            BUILD_TYPE=${{ inputs.build-type }}
            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
-            FFMPEG=${{ inputs.ffmpeg }}
            BASE_IMAGE=${{ inputs.base-image }}
            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
@@ -264,7 +259,6 @@ jobs:
            BUILD_TYPE=${{ inputs.build-type }}
            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
-            FFMPEG=${{ inputs.ffmpeg }}
            BASE_IMAGE=${{ inputs.base-image }}
            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -96,7 +96,7 @@ jobs:
    - name: Start LocalAI
      run: |
        echo "Starting LocalAI..."
-        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
+        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master run --debug $MODEL_NAME
        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
      # Check the PR diff using the current branch and the base branch of the PR
    - uses: GrantBirki/git-diff-action@v2.8.1
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -1,375 +1,26 @@
-name: Build and Release
+name: goreleaser

 on:
  push:
-    branches:
-      - master
    tags:
      - 'v*'
-  pull_request:
-
-env:
-  GRPC_VERSION: v1.65.0
-
-permissions:
-  contents: write
-
-concurrency:
-  group: ci-releases-${{ github.head_ref || github.ref }}-${{ github.repository }}
-  cancel-in-progress: true

 jobs:
-
-  build-linux-arm:
+  goreleaser:
    runs-on: ubuntu-latest
    steps:
-      - name: Clone
+      - name: Checkout
        uses: actions/checkout@v4
        with:
-          submodules: true
-      - uses: actions/setup-go@v5
+          fetch-depth: 0
+      - name: Set up Go
+        uses: actions/setup-go@v5
        with:
-          go-version: '1.21.x'
-          cache: false
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
-          sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
-          make install-go-tools
-      - name: Install CUDA Dependencies
-        run: |
-          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
-          sudo dpkg -i cuda-keyring_1.1-1_all.deb
-          sudo apt-get update
-          sudo apt-get install -y cuda-cross-aarch64 cuda-nvcc-cross-aarch64-${CUDA_VERSION} libcublas-cross-aarch64-${CUDA_VERSION}
+          go-version: 1.23
+      - name: Run GoReleaser
+        uses: goreleaser/goreleaser-action@v6
+        with:
+          version: v2.11.0
+          args: release --clean
        env:
-          CUDA_VERSION: 12-4
-      - name: Cache grpc
-        id: cache-grpc
-        uses: actions/cache@v4
-        with:
-          path: grpc
-          key: ${{ runner.os }}-arm-grpc-${{ env.GRPC_VERSION }}
-      - name: Build grpc
-        if: steps.cache-grpc.outputs.cache-hit != 'true'
-        run: |
-
-          git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-          cd grpc && sed -i "216i\  TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && \
-          cd cmake/build && cmake -DgRPC_INSTALL=ON \
-            -DgRPC_BUILD_TESTS=OFF \
-            ../.. && sudo make --jobs 5 --output-sync=target
-      - name: Install gRPC
-        run: |
-          GNU_HOST=aarch64-linux-gnu
-          C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
-          CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
-
-          CROSS_TOOLCHAIN=/usr/$GNU_HOST
-          CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
-          CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
-
-          # https://cmake.org/cmake/help/v3.13/manual/cmake-toolchains.7.html#cross-compiling-for-linux
-          echo "set(CMAKE_SYSTEM_NAME Linux)" >> $CMAKE_CROSS_TOOLCHAIN && \
-            echo "set(CMAKE_SYSTEM_PROCESSOR arm)" >> $CMAKE_CROSS_TOOLCHAIN && \
-            echo "set(CMAKE_STAGING_PREFIX $CROSS_STAGING_PREFIX)" >> $CMAKE_CROSS_TOOLCHAIN && \
-            echo "set(CMAKE_SYSROOT ${CROSS_TOOLCHAIN}/sysroot)" >> $CMAKE_CROSS_TOOLCHAIN && \
-            echo "set(CMAKE_C_COMPILER /usr/bin/$C_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
-            echo "set(CMAKE_CXX_COMPILER /usr/bin/$CXX_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
-            echo "set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)" >> $CMAKE_CROSS_TOOLCHAIN && \
-            echo "set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
-            echo "set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
-            echo "set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN
-          GRPC_DIR=$PWD/grpc
-          cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install && \
-          GRPC_CROSS_BUILD_DIR=$GRPC_DIR/cmake/cross_build && \
-          mkdir -p $GRPC_CROSS_BUILD_DIR && \
-          cd $GRPC_CROSS_BUILD_DIR && \
-          cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN \
-            -DCMAKE_BUILD_TYPE=Release \
-            -DCMAKE_INSTALL_PREFIX=$CROSS_TOOLCHAIN/grpc_install \
-            ../.. && \
-          sudo make -j`nproc` install
-      - name: Build
-        id: build
-        run: |
-          GNU_HOST=aarch64-linux-gnu
-          C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
-          CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
-
-          CROSS_TOOLCHAIN=/usr/$GNU_HOST
-          CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
-          CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-          export PATH=$PATH:$GOPATH/bin
-          export PATH=/usr/local/cuda/bin:$PATH
-          sudo rm -rf /usr/aarch64-linux-gnu/lib/libstdc++.so.6
-          sudo cp -rf /usr/aarch64-linux-gnu/lib/libstdc++.so* /usr/aarch64-linux-gnu/lib/libstdc++.so.6
-          sudo cp /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 ld.so
-          BACKEND_LIBS="./grpc/cmake/cross_build/third_party/re2/libre2.a ./grpc/cmake/cross_build/libgrpc.a ./grpc/cmake/cross_build/libgrpc++.a ./grpc/cmake/cross_build/third_party/protobuf/libprotobuf.a /usr/aarch64-linux-gnu/lib/libc.so.6 /usr/aarch64-linux-gnu/lib/libstdc++.so.6 /usr/aarch64-linux-gnu/lib/libgomp.so.1 /usr/aarch64-linux-gnu/lib/libm.so.6 /usr/aarch64-linux-gnu/lib/libgcc_s.so.1 /usr/aarch64-linux-gnu/lib/libdl.so.2 /usr/aarch64-linux-gnu/lib/libpthread.so.0 ./ld.so" \
-          GOOS=linux \
-          GOARCH=arm64 \
-          CMAKE_ARGS="-DProtobuf_INCLUDE_DIRS=$CROSS_STAGING_PREFIX/include -DProtobuf_DIR=$CROSS_STAGING_PREFIX/lib/cmake/protobuf -DgRPC_DIR=$CROSS_STAGING_PREFIX/lib/cmake/grpc -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++" make dist-cross-linux-arm64
-      - uses: actions/upload-artifact@v4
-        with:
-          name: LocalAI-linux-arm64
-          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v2
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*
-      - name: Setup tmate session if tests fail
-        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
-        with:
-          detached: true
-          connect-timeout-seconds: 180
-          limit-access-to-actor: true
-  build-linux:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: true
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
-      - name: Release space from worker
-        run: |
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          df -h
-          echo
-          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
-          sudo apt-get remove --auto-remove android-sdk-platform-tools snapd || true
-          sudo apt-get purge --auto-remove android-sdk-platform-tools snapd || true
-          sudo rm -rf /usr/local/lib/android
-          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
-          sudo rm -rf /usr/share/dotnet
-          sudo apt-get remove -y '^mono-.*' || true
-          sudo apt-get remove -y '^ghc-.*' || true
-          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
-          sudo apt-get remove -y 'php.*' || true
-          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
-          sudo apt-get remove -y '^google-.*' || true
-          sudo apt-get remove -y azure-cli || true
-          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
-          sudo apt-get remove -y '^gfortran-.*' || true
-          sudo apt-get remove -y microsoft-edge-stable || true
-          sudo apt-get remove -y firefox || true
-          sudo apt-get remove -y powershell || true
-          sudo apt-get remove -y r-base-core || true
-          sudo apt-get autoremove -y
-          sudo apt-get clean
-          echo
-          echo "Listing top largest packages"
-          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
-          head -n 30 <<< "${pkgs}"
-          echo
-          sudo rm -rfv build || true
-          sudo rm -rf /usr/share/dotnet || true
-          sudo rm -rf /opt/ghc || true
-          sudo rm -rf "/usr/local/share/boost" || true
-          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
-          df -h
-
-      - name: Force Install GIT latest
-        run: |
-          sudo apt-get update \
-          && sudo apt-get install -y software-properties-common \
-          && sudo apt-get update \
-          && sudo add-apt-repository -y ppa:git-core/ppa \
-          && sudo apt-get update \
-          && sudo apt-get install -y git
-      - name: Clone
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21.x'
-          cache: false
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
-          make install-go-tools
-      - name: Intel Dependencies
-        run: |
-          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
-          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
-          sudo apt update
-          sudo apt install -y intel-basekit
-      - name: Install CUDA Dependencies
-        run: |
-          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-          sudo dpkg -i cuda-keyring_1.1-1_all.deb
-          sudo apt-get update
-          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
-        env:
-          CUDA_VERSION: 12-5
-      - name: "Install Hipblas"
-        env:
-          ROCM_VERSION: "6.1"
-          AMDGPU_VERSION: "6.1"
-        run: |
-            set -ex
-
-            sudo apt-get update
-            sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg
-
-            sudo apt update
-            wget https://repo.radeon.com/amdgpu-install/6.4.1/ubuntu/noble/amdgpu-install_6.4.60401-1_all.deb
-            sudo apt install ./amdgpu-install_6.4.60401-1_all.deb
-            sudo apt update
-
-            sudo amdgpu-install --usecase=rocm
-
-            sudo apt-get clean
-            sudo rm -rf /var/lib/apt/lists/*
-            sudo ldconfig
-      - name: Cache grpc
-        id: cache-grpc
-        uses: actions/cache@v4
-        with:
-          path: grpc
-          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
-      - name: Build grpc
-        if: steps.cache-grpc.outputs.cache-hit != 'true'
-        run: |
-          git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-          cd grpc && sed -i "216i\  TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && \
-          cd cmake/build && cmake -DgRPC_INSTALL=ON \
-            -DgRPC_BUILD_TESTS=OFF \
-            ../.. && sudo make --jobs 5 --output-sync=target
-      - name: Install gRPC
-        run: |
-          cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
-      # BACKEND_LIBS needed for gpu-workload: /opt/intel/oneapi/*/lib/libiomp5.so /opt/intel/oneapi/*/lib/libmkl_core.so /opt/intel/oneapi/*/lib/libmkl_core.so.2 /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so.2 /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so.4 /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so.2 /opt/intel/oneapi/*/lib/libsycl.so /opt/intel/oneapi/*/lib/libsycl.so.7 /opt/intel/oneapi/*/lib/libsycl.so.7.1.0 /opt/rocm-*/lib/libamdhip64.so /opt/rocm-*/lib/libamdhip64.so.5 /opt/rocm-*/lib/libamdhip64.so.6 /opt/rocm-*/lib/libamdhip64.so.6.1.60100 /opt/rocm-*/lib/libhipblas.so /opt/rocm-*/lib/libhipblas.so.2 /opt/rocm-*/lib/libhipblas.so.2.1.60100 /opt/rocm-*/lib/librocblas.so /opt/rocm-*/lib/librocblas.so.4 /opt/rocm-*/lib/librocblas.so.4.1.60100 /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 /usr/lib/x86_64-linux-gnu/libc.so.6 /usr/lib/x86_64-linux-gnu/librt.so.1 /usr/local/cuda-*/targets/x86_64-linux/lib/libcublas.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcublasLt.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcudart.so /usr/local/cuda-*/targets/x86_64-linux/lib/stubs/libcuda.so
-      - name: Build
-        id: build
-        run: |
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-          export PATH=$PATH:$GOPATH/bin
-          export PATH=/usr/local/cuda/bin:$PATH
-          export PATH=/opt/rocm/bin:$PATH
-          source /opt/intel/oneapi/setvars.sh
-          sudo cp /lib64/ld-linux-x86-64.so.2 ld.so
-          BACKEND_LIBS="./ld.so ./sources/go-piper/piper/build/fi/lib/libfmt.a ./sources/go-piper/piper-phonemize/pi/lib/libonnxruntime.so.1.14.1 ./sources/go-piper/piper-phonemize/pi/src/libespeak-ng/libespeak-ng.so /usr/lib/x86_64-linux-gnu/libdl.so.2 /usr/lib/x86_64-linux-gnu/librt.so.1 /usr/lib/x86_64-linux-gnu/libpthread.so.0 ./sources/go-piper/piper-phonemize/pi/lib/libpiper_phonemize.so.1 ./sources/go-piper/piper/build/si/lib/libspdlog.a ./sources/go-piper/espeak/ei/lib/libucd.so" \
-          make -j4 dist
-      - uses: actions/upload-artifact@v4
-        with:
-          name: LocalAI-linux
-          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v2
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*
-      - name: Setup tmate session if tests fail
-        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
-        with:
-          detached: true
-          connect-timeout-seconds: 180
-          limit-access-to-actor: true
-
-
-  build-macOS-x86_64:
-    runs-on: macos-13
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21.x'
-          cache: false
-      - name: Dependencies
-        run: |
-          brew install protobuf grpc
-          make install-go-tools
-      - name: Build
-        id: build
-        run: |
-          export C_INCLUDE_PATH=/usr/local/include
-          export CPLUS_INCLUDE_PATH=/usr/local/include
-          export PATH=$PATH:$GOPATH/bin
-          export SKIP_GRPC_BACKEND=backend-assets/grpc/whisper
-          make dist
-      - uses: actions/upload-artifact@v4
-        with:
-          name: LocalAI-MacOS-x86_64
-          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v2
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*
-      - name: Setup tmate session if tests fail
-        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
-        with:
-          detached: true
-          connect-timeout-seconds: 180
-          limit-access-to-actor: true
-
-  build-macOS-arm64:
-    runs-on: macos-14
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21.x'
-          cache: false
-      - name: Dependencies
-        run: |
-          brew install protobuf grpc libomp llvm
-          make install-go-tools
-      - name: Build
-        id: build
-        run: |
-          export C_INCLUDE_PATH=/usr/local/include
-          export CPLUS_INCLUDE_PATH=/usr/local/include
-          export PATH=$PATH:$GOPATH/bin
-          export CC=/opt/homebrew/opt/llvm/bin/clang
-          make dist
-      - uses: actions/upload-artifact@v4
-        with:
-          name: LocalAI-MacOS-arm64
-          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v2
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*
-      - name: Setup tmate session if tests fail
-        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
-        with:
-          detached: true
-          connect-timeout-seconds: 180
-          limit-access-to-actor: true
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.5
+        uses: securego/gosec@v2.22.7
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -67,18 +67,20 @@ jobs:
      # You can test your matrix by printing the current Go version
      - name: Display Go version
        run: go version
+      - name: Proto Dependencies
+        run: |
+          # Install protoc
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
          sudo apt-get install -y libgmock-dev clang
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
          # Install UV
          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
@@ -94,38 +96,15 @@ jobs:
          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
          export CUDACXX=/usr/local/cuda/bin/nvcc

-          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-          go install github.com/GeertJohan/go.rice/rice@latest

          # The python3-grpc-tools package in 22.04 is too old
          pip install --user grpcio-tools==1.71.0 grpcio==1.71.0

          make -C backend/python/transformers

-          # Pre-build piper before we start tests in order to have shared libraries in place
-          make sources/go-piper && \
-          GO_TAGS="tts" make -C sources/go-piper piper.o && \
-          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
+          make backends/huggingface backends/llama-cpp backends/local-store backends/silero-vad backends/piper backends/whisper backends/stablediffusion-ggml
        env:
          CUDA_VERSION: 12-4
-      - name: Cache grpc
-        id: cache-grpc
-        uses: actions/cache@v4
-        with:
-          path: grpc
-          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
-      - name: Build grpc
-        if: steps.cache-grpc.outputs.cache-hit != 'true'
-        run: |
-          git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --jobs 5 --shallow-submodules https://github.com/grpc/grpc && \
-          cd grpc && sed -i "216i\  TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && cd cmake/build && \
-          cmake -DgRPC_INSTALL=ON \
-            -DgRPC_BUILD_TESTS=OFF \
-            ../.. && sudo make --jobs 5
-      - name: Install gRPC
-        run: |
-          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
          PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
@@ -184,16 +163,10 @@ jobs:
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-          go install github.com/GeertJohan/go.rice/rice@latest
          PATH="$PATH:$HOME/go/bin" make protogen-go
-      - name: Build images
-        run: |
-          docker build --build-arg FFMPEG=true --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
-          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
      - name: Test
        run: |
-            PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
-            make run-e2e-aio
+            PATH="$PATH:$HOME/go/bin" make backends/local-store backends/silero-vad backends/llama-cpp backends/whisper backends/piper backends/stablediffusion-ggml docker-build-aio e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.22
@@ -224,7 +197,14 @@ jobs:
        run: |
          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
          pip install --user --no-cache-dir grpcio-tools==1.71.0 grpcio==1.71.0
-          go install github.com/GeertJohan/go.rice/rice@latest
+      - name: Build llama-cpp-darwin
+        run: |
+          make protogen-go
+          make build
+          bash scripts/build-llama-cpp-darwin.sh
+          ls -la build/darwin.tar
+          mv build/darwin.tar build/llama-cpp.tar
+          ./local-ai backends install "ocifile://$PWD/build/llama-cpp.tar"
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
@@ -232,7 +212,8 @@ jobs:
          export CC=/opt/homebrew/opt/llvm/bin/clang
          # Used to run the newer GNUMake version from brew that supports --output-sync
          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
-          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
+          PATH="$PATH:$HOME/go/bin" make protogen-go
+          PATH="$PATH:$HOME/go/bin" BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.22
--- a/.gitignore
+++ b/.gitignore
@@ -5,9 +5,14 @@ __pycache__/
 *.o
 get-sources
 prepare-sources
-/backend/cpp/llama/grpc-server
-/backend/cpp/llama/llama.cpp
+/backend/cpp/llama-cpp/grpc-server
+/backend/cpp/llama-cpp/llama.cpp
 /backend/cpp/llama-*
+!backend/cpp/llama-cpp
+/backends
+/backend-images
+/result.yaml
+protoc

 *.log

@@ -56,4 +61,4 @@ docs/static/gallery.html
 **/venv

 # per-developer customization files for the development container
-.devcontainer/customization/*
+.devcontainer/customization/*
--- a/.goreleaser.yaml
+++ b/.goreleaser.yaml
@@ -0,0 +1,33 @@
+version: 2
+before:
+  hooks:
+    - make protogen-go
+    - go mod tidy
+dist: release
+source:
+  enabled: true
+  name_template: '{{ .ProjectName }}-{{ .Tag }}-source'
+builds:
+  -
+    env:
+      - CGO_ENABLED=0
+    ldflags:
+      - -s -w
+      - -X "github.com/mudler/LocalAI/internal.Version={{ .Tag }}"
+      - -X "github.com/mudler/LocalAI/internal.Commit={{ .FullCommit }}"
+    goos:
+      - linux
+      - darwin
+      #- windows
+    goarch:
+      - amd64
+      - arm64
+archives:
+  - formats: [ 'binary' ] # this removes the tar of the archives, leaving the binaries alone
+    name_template: local-ai-{{ .Tag }}-{{ .Os }}-{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}
+checksum:
+  name_template: '{{ .ProjectName }}-{{ .Tag }}-checksums.txt'
+snapshot:
+  version_template: "{{ .Tag }}-next"
+changelog:
+  use: github-native
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -26,7 +26,7 @@
                "LOCALAI_P2P": "true",
                "LOCALAI_FEDERATED": "true"
            },
-            "buildFlags": ["-tags", "p2p tts", "-v"],
+            "buildFlags": ["-tags", "", "-v"],
            "envFile": "${workspaceFolder}/.env",
            "cwd": "${workspaceRoot}"
        }
--- a/107
+++ b/107
@@ -24,6 +24,9 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 ENV BUILD_TYPE=${BUILD_TYPE}

+RUN mkdir -p /run/localai
+RUN echo "default" > /run/localai/capability
+
 # Vulkan requirements
 RUN <<EOT bash
    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
@@ -36,7 +39,8 @@ RUN <<EOT bash
        apt-get install -y \
            vulkan-sdk && \
        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
+        rm -rf /var/lib/apt/lists/* && \
+        echo "vulkan" > /run/localai/capability
    fi
 EOT

@@ -63,7 +67,14 @@ RUN <<EOT bash
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
+        rm -rf /var/lib/apt/lists/* && \
+        echo "nvidia" > /run/localai/capability
+    fi
+EOT
+
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
+        echo "nvidia-l4t" > /run/localai/capability
    fi
 EOT

@@ -83,11 +94,17 @@ RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
            rocblas-dev && \
        apt-get clean && \
        rm -rf /var/lib/apt/lists/* && \
+        echo "amd" > /run/localai/capability && \
        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
        ldconfig \
    ; fi

+RUN expr "${BUILD_TYPE}" : sycl && \
+      echo "intel" > /run/localai/capability || \
+      echo "Not Intel"
+
+
 # Cuda
 ENV PATH=/usr/local/cuda/bin:${PATH}

@@ -121,7 +138,7 @@ RUN apt-get update && \

 # Install CMake (the version in 22.04 is too old)
 RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
+    if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
    else
        apt-get update && \
@@ -136,10 +153,9 @@ EOT
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin

-# Install grpc compilers and rice
+# Install grpc compilers
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
-    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \
-    go install github.com/GeertJohan/go.rice/rice@latest
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -175,57 +191,12 @@ FROM ${INTEL_BASE_IMAGE} AS intel
 RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
 gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
 RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" > /etc/apt/sources.list.d/intel-graphics.list
-
-###################################
-###################################
-
-# The grpc target does one thing, it builds and installs GRPC.  This is in it's own layer so that it can be effectively cached by CI.
-# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
-FROM ${GRPC_BASE_IMAGE} AS grpc
-
-# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
-ARG GRPC_MAKEFLAGS="-j4 -Otarget"
-ARG GRPC_VERSION=v1.65.0
-ARG CMAKE_FROM_SOURCE=false
-ARG CMAKE_VERSION=3.26.4
-
-ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
-
-WORKDIR /build
-
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        ca-certificates \
-        build-essential curl libssl-dev \
-        git && \
+        intel-oneapi-runtime-libs && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
-# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
-# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
-# and running make install in the target container
-RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-    mkdir -p /build/grpc/cmake/build && \
-    cd /build/grpc/cmake/build && \
-    sed -i "216i\  TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
-    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
-    make && \
-    make install && \
-    rm -rf /build
-
 ###################################
 ###################################

@@ -233,7 +204,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall

 FROM build-requirements AS builder-base

-ARG GO_TAGS="tts p2p"
+ARG GO_TAGS=""
 ARG GRPC_BACKENDS
 ARG MAKEFLAGS
 ARG LD_FLAGS="-s -w"
@@ -252,9 +223,7 @@ RUN echo "GO_TAGS: $GO_TAGS" && echo "TARGETARCH: $TARGETARCH"
 WORKDIR /build


-# We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
-# but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
-# here so that we can generate the grpc code for the stablediffusion build
+# We need protoc installed, and the version in 22.04 is too old.
 RUN <<EOT bash
    if [ "amd64" = "$TARGETARCH" ]; then
        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
@@ -276,8 +245,6 @@ FROM builder-base AS builder-backends
 ARG TARGETARCH
 ARG TARGETVARIANT

-COPY --from=grpc /opt/grpc /usr/local
-
 WORKDIR /build

 COPY ./Makefile .
@@ -292,13 +259,7 @@ COPY ./pkg/utils ./pkg/utils
 COPY ./pkg/langchain ./pkg/langchain

 RUN ls -l ./
-RUN make backend-assets
-RUN make prepare
-RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make grpcs; \
-    else \
-        make grpcs; \
-    fi
+RUN make protogen-go

 # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
 # Adjustments to the build process should likely be made here.
@@ -311,16 +272,7 @@ COPY . .
 ## Build the binary
 ## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
 ## Otherwise just run the normal build
-RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
-    else \
-        make build; \
-    fi
-
-RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
-        mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
-        touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
-    ; fi
+RUN make build

 ###################################
 ###################################
@@ -330,8 +282,6 @@ RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \

 FROM builder-base AS devcontainer

-COPY --from=grpc /opt/grpc /usr/local
-
 COPY .devcontainer-scripts /.devcontainer-scripts

 RUN apt-get update && \
@@ -364,9 +314,6 @@ COPY ./entrypoint.sh .
 # Copy the binary
 COPY --from=builder /build/local-ai ./

-# Copy shared libraries for piper
-COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
-
 # Make sure the models directory exists
 RUN mkdir -p /models /backends

--- a/5
+++ b/5
@@ -1,5 +0,0 @@
-VERSION 0.7
-
-build:
-    FROM DOCKERFILE -f Dockerfile .
-    SAVE ARTIFACT /usr/bin/local-ai AS LOCAL local-ai
--- a/867
+++ b/867
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 <h1 align="center">
  <br>
-  <img height="300" src="./core/http/static/logo.png"> <br>
+  <img width="300" src="./core/http/static/logo.png"> <br>
 <br>
 </h1>

@@ -140,11 +140,7 @@ docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri
 ### Intel GPU Images (oneAPI):

 ```bash
-# Intel GPU with FP16 support
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f16
-
-# Intel GPU with FP32 support
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-intel-f32
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/dri/card1 --device=/dev/dri/renderD128 localai/localai:latest-gpu-intel
 ```

 ### Vulkan GPU Images:
@@ -166,7 +162,7 @@ docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-ai
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-11

 # Intel GPU version
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel-f16
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel

 # AMD GPU version
 docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-aio-gpu-hipblas
@@ -189,10 +185,14 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```

+> ⚡ **Automatic Backend Detection**: When you install models from the gallery or YAML files, LocalAI automatically detects your system's GPU capabilities (NVIDIA, AMD, Intel) and downloads the appropriate backend. For advanced configuration options, see [GPU Acceleration](https://localai.io/features/gpu-acceleration/#automatic-backend-detection).
+
 For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)

 ## 📰 Latest project news

+- July/August 2025: 🔍 [Object Detection](https://localai.io/features/object-detection/) added to the API featuring [rf-detr](https://github.com/roboflow/rf-detr)
+- July 2025: All backends migrated outside of the main binary. LocalAI is now more lightweight, small, and automatically downloads the required backend to run the model. [Read the release notes](https://github.com/mudler/LocalAI/releases/tag/v3.2.0)
 - June 2025: [Backend management](https://github.com/mudler/LocalAI/pull/5607) has been added. Attention: extras images are going to be deprecated from the next release! Read [the backend management PR](https://github.com/mudler/LocalAI/pull/5607).
 - May 2025: [Audio input](https://github.com/mudler/LocalAI/pull/5466) and [Reranking](https://github.com/mudler/LocalAI/pull/5396) in llama.cpp backend, [Realtime API](https://github.com/mudler/LocalAI/pull/5392),  Support to Gemma, SmollVLM, and more multimodal models (available in the gallery).
 - May 2025: Important: image name changes [See release](https://github.com/mudler/LocalAI/releases/tag/v2.29.0)
@@ -215,6 +215,7 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A

 ## 🚀 [Features](https://localai.io/features/)

+- 🧩 [Backend Gallery](https://localai.io/backends/): Install/remove backends on the fly, powered by OCI images — fully customizable and API-driven.
 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
 - 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
 - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
@@ -224,6 +225,7 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
 - 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
+- 🔍 [Object Detection](https://localai.io/features/object-detection/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
 - [Agentic capabilities](https://github.com/mudler/LocalAGI)
--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -1,5 +1,6 @@
 embeddings: true
 name: text-embedding-ada-002
+backend: llama-cpp
 parameters:
  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

--- a/aio/cpu/rerank.yaml
+++ b/aio/cpu/rerank.yaml
@@ -3,7 +3,7 @@ reranking: true
 f16: true
 parameters:
  model: jina-reranker-v1-tiny-en.f16.gguf
-
+backend: llama-cpp
 download_files:
  - filename: jina-reranker-v1-tiny-en.f16.gguf
    sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
--- a/aio/cpu/text-to-speech.yaml
+++ b/aio/cpu/text-to-speech.yaml
@@ -2,7 +2,7 @@ name: tts-1
 download_files:
  - filename: voice-en-us-amy-low.tar.gz
    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
-
+backend: piper
 parameters:
  model: en-us-amy-low.onnx

--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -1,5 +1,6 @@
 context_size: 8192
 f16: true
+backend: llama-cpp
 function:
  grammar:
    no_mixed_free_string: true
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -1,5 +1,6 @@
 context_size: 4096
 f16: true
+backend: llama-cpp
 mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
--- a/aio/gpu-8g/embeddings.yaml
+++ b/aio/gpu-8g/embeddings.yaml
@@ -1,5 +1,6 @@
 embeddings: true
 name: text-embedding-ada-002
+backend: llama-cpp
 parameters:
  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

--- a/aio/gpu-8g/rerank.yaml
+++ b/aio/gpu-8g/rerank.yaml
@@ -3,7 +3,7 @@ reranking: true
 f16: true
 parameters:
  model: jina-reranker-v1-tiny-en.f16.gguf
-
+backend: llama-cpp
 download_files:
  - filename: jina-reranker-v1-tiny-en.f16.gguf
    sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
--- a/aio/gpu-8g/text-to-speech.yaml
+++ b/aio/gpu-8g/text-to-speech.yaml
@@ -2,7 +2,7 @@ name: tts-1
 download_files:
  - filename: voice-en-us-amy-low.tar.gz
    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
-
+backend: piper
 parameters:
  model: en-us-amy-low.onnx

--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -1,5 +1,6 @@
 context_size: 4096
 f16: true
+backend: llama-cpp
 function:
  capture_llm_results:
  - (?s)<Thought>(.*?)</Thought>
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -1,4 +1,5 @@
 context_size: 4096
+backend: llama-cpp
 f16: true
 mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
--- a/aio/intel/embeddings.yaml
+++ b/aio/intel/embeddings.yaml
@@ -1,5 +1,6 @@
 embeddings: true
 name: text-embedding-ada-002
+backend: llama-cpp
 parameters:
  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

--- a/aio/intel/rerank.yaml
+++ b/aio/intel/rerank.yaml
@@ -3,7 +3,7 @@ reranking: true
 f16: true
 parameters:
  model: jina-reranker-v1-tiny-en.f16.gguf
-
+backend: llama-cpp
 download_files:
  - filename: jina-reranker-v1-tiny-en.f16.gguf
    sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
--- a/aio/intel/text-to-speech.yaml
+++ b/aio/intel/text-to-speech.yaml
@@ -2,7 +2,7 @@ name: tts-1
 download_files:
  - filename: voice-en-us-amy-low.tar.gz
    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
-
+backend: piper
 parameters:
  model: en-us-amy-low.onnx

--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -1,5 +1,6 @@
 context_size: 4096
 f16: true
+backend: llama-cpp
 function:
  capture_llm_results:
  - (?s)<Thought>(.*?)</Thought>
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -1,4 +1,5 @@
 context_size: 4096
+backend: llama-cpp
 f16: true
 mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
--- a/assets.go
+++ b/assets.go
@@ -1,15 +0,0 @@
-package main
-
-import (
-	rice "github.com/GeertJohan/go.rice"
-)
-
-var backendAssets *rice.Box
-
-func init() {
-	var err error
-	backendAssets, err = rice.FindBox("backend-assets")
-	if err != nil {
-		panic(err)
-	}
-}
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -17,9 +17,9 @@ ARG GO_VERSION=1.22.6
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
-        ccache \
+        git ccache \
        ca-certificates \
-        make \
+        make cmake \
        curl unzip \
        libssl-dev && \
    apt-get clean && \
@@ -123,9 +123,9 @@ EOT

 COPY . /LocalAI

-RUN cd /LocalAI && make backend-assets/grpc/bark-cpp
+RUN cd /LocalAI && make protogen-go && make -C /LocalAI/backend/go/${BACKEND} build

 FROM scratch
+ARG BACKEND=rerankers

-COPY --from=builder /LocalAI/backend-assets/grpc/bark-cpp ./
-COPY --from=builder /LocalAI/backend/go/bark/run.sh ./
+COPY --from=builder /LocalAI/backend/go/${BACKEND}/package/. ./
--- a/backend/Dockerfile.llama-cpp
+++ b/backend/Dockerfile.llama-cpp
@@ -0,0 +1,207 @@
+ARG BASE_IMAGE=ubuntu:22.04
+ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
+
+
+# The grpc target does one thing, it builds and installs GRPC.  This is in it's own layer so that it can be effectively cached by CI.
+# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
+FROM ${GRPC_BASE_IMAGE} AS grpc
+
+# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
+ARG GRPC_MAKEFLAGS="-j4 -Otarget"
+ARG GRPC_VERSION=v1.65.0
+ARG CMAKE_FROM_SOURCE=false
+ARG CMAKE_VERSION=3.26.4
+
+ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
+
+WORKDIR /build
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates \
+        build-essential curl libssl-dev \
+        git && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install CMake (the version in 22.04 is too old)
+RUN <<EOT bash
+    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
+        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
+    else
+        apt-get update && \
+        apt-get install -y \
+            cmake && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
+# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
+# and running make install in the target container
+RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    mkdir -p /build/grpc/cmake/build && \
+    cd /build/grpc/cmake/build && \
+    sed -i "216i\  TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
+    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
+    make && \
+    make install && \
+    rm -rf /build
+
+FROM ${BASE_IMAGE} AS builder
+ARG BACKEND=rerankers
+ARG BUILD_TYPE
+ENV BUILD_TYPE=${BUILD_TYPE}
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
+ARG SKIP_DRIVERS=false
+ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
+ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
+ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETARCH
+ARG TARGETVARIANT
+ARG GO_VERSION=1.22.6
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ccache git \
+        ca-certificates \
+        make \
+        curl unzip \
+        libssl-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Cuda
+ENV PATH=/usr/local/cuda/bin:${PATH}
+
+# HipBLAS requirements
+ENV PATH=/opt/rocm/bin:${PATH}
+
+# Vulkan requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# CuBLAS requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils
+        if [ "amd64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        fi
+        if [ "arm64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
+        fi
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+        rm -f cuda-keyring_1.1-1_all.deb && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            libclblast-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
+    ; fi
+
+RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            hipblas-dev \
+            rocblas-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
+        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
+        ldconfig \
+    ; fi
+
+RUN echo "TARGETARCH: $TARGETARCH"
+
+# We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
+# but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
+# here so that we can generate the grpc code for the stablediffusion build
+RUN <<EOT bash
+    if [ "amd64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+    if [ "arm64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+EOT
+
+# Install CMake (the version in 22.04 is too old)
+RUN <<EOT bash
+    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
+        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
+    else
+        apt-get update && \
+        apt-get install -y \
+            cmake && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+COPY --from=grpc /opt/grpc /usr/local
+
+
+COPY . /LocalAI
+
+## Otherwise just run the normal build
+RUN <<EOT bash
+if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
+        cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-fallback && \
+        make llama-cpp-grpc && make llama-cpp-rpc-server; \
+    else \
+        cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-avx && \
+        make llama-cpp-avx2 && \
+        make llama-cpp-avx512 && \
+        make llama-cpp-fallback && \
+        make llama-cpp-grpc && \
+        make llama-cpp-rpc-server; \
+    fi  
+EOT
+
+
+# Copy libraries using a script to handle architecture differences
+RUN make -C /LocalAI/backend/cpp/llama-cpp package
+
+
+FROM scratch
+
+
+# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
+COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -20,6 +20,7 @@ service Backend {
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
+  rpc Detect(DetectOptions) returns (DetectResponse) {}

  rpc StoresSet(StoresSetOptions) returns (Result) {}
  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
@@ -185,7 +186,6 @@ message ModelOptions {
  string MainGPU = 13;
  string TensorSplit = 14;
  int32 Threads = 15;
-  string LibrarySearchPath = 16;
  float RopeFreqBase = 17;
  float RopeFreqScale = 18;
  float RMSNormEps = 19;
@@ -258,6 +258,8 @@ message ModelOptions {
  repeated GrammarTrigger GrammarTriggers = 65;

  bool Reranking = 71;
+
+  repeated string Overrides = 72;
 }

 message Result {
@@ -303,6 +305,9 @@ message GenerateImageRequest {
  // Diffusers
  string EnableParameters = 10;
  int32 CLIPSkip = 11;
+  
+  // Reference images for models that support them (e.g., Flux Kontext)
+  repeated string ref_images = 12;
 }

 message GenerateVideoRequest {
@@ -375,3 +380,20 @@ message Message {
  string role = 1;
  string content = 2;
 }
+
+message DetectOptions {
+  string src = 1;
+}
+
+message Detection {
+  float x = 1;
+  float y = 2;
+  float width = 3;
+  float height = 4;
+  float confidence = 5;
+  string class_name = 6;
+}
+
+message DetectResponse {
+  repeated Detection Detections = 1;
+}
--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -0,0 +1,168 @@
+
+LLAMA_VERSION?=daf2dd788066b8b239cb7f68210e090c2124c199
+LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
+
+CMAKE_ARGS?=
+BUILD_TYPE?=
+NATIVE?=false
+ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
+TARGET?=--target grpc-server
+JOBS?=$(shell nproc)
+
+# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+
+CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DGGML_CUDA=ON
+# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# to CMAKE_ARGS automatically
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblas)
+	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
+else ifeq ($(BUILD_TYPE),hipblas)
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+#	GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
+#	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DGGML_HIP=ON
+#	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DGGML_VULKAN=1
+else ifeq ($(OS),Darwin)
+	ifeq ($(BUILD_TYPE),)
+		BUILD_TYPE=metal
+	endif
+	ifneq ($(BUILD_TYPE),metal)
+		CMAKE_ARGS+=-DGGML_METAL=OFF
+	else
+		CMAKE_ARGS+=-DGGML_METAL=ON
+		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+		CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
+		CMAKE_ARGS+=-DGGML_OPENMP=OFF
+	endif
+	TARGET+=--target ggml-metal
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f16)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DCMAKE_CXX_FLAGS="-fsycl" \
+		-DGGML_SYCL_F16=ON
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f32)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DCMAKE_CXX_FLAGS="-fsycl"
+endif
+
+INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
+INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
+ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
+				 -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
+				 -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
+				 -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
+				 -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
+build-llama-cpp-grpc-server:
+# Conditionally build grpc for the llama backend to use if needed
+ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
+	$(MAKE) -C ../../grpc build
+	_PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \
+	_GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
+	PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
+	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
+	LLAMA_VERSION=$(LLAMA_VERSION) \
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
+else
+	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
+	LLAMA_VERSION=$(LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
+endif
+
+llama-cpp-avx2: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
+	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
+
+llama-cpp-avx512: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
+	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
+
+llama-cpp-avx: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
+	$(info ${GREEN}I llama-cpp build info:avx${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
+
+llama-cpp-fallback: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
+	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
+
+llama-cpp-grpc: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
+	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
+
+llama-cpp-rpc-server: llama-cpp-grpc
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
+
+llama.cpp:
+	mkdir -p llama.cpp
+	cd llama.cpp && \
+	git init && \
+	git remote add origin $(LLAMA_REPO)  && \
+	git fetch origin && \
+	git checkout -b build $(LLAMA_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+llama.cpp/tools/grpc-server: llama.cpp
+	mkdir -p llama.cpp/tools/grpc-server
+	bash prepare.sh
+
+rebuild:
+	bash prepare.sh
+	rm -rf grpc-server
+	$(MAKE) grpc-server
+
+package:
+	bash package.sh
+
+purge:
+	rm -rf llama.cpp/build
+	rm -rf llama.cpp/tools/grpc-server
+	rm -rf grpc-server
+
+clean: purge
+	rm -rf llama.cpp
+
+grpc-server: llama.cpp llama.cpp/tools/grpc-server
+	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+	+bash -c "source $(ONEAPI_VARS); \
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)"
+else
+	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)
+endif
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -12,6 +12,7 @@

 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
+#include "common.h"
 #include <getopt.h>
 #include <grpcpp/ext/proto_server_reflection_plugin.h>
 #include <grpcpp/grpcpp.h>
@@ -260,6 +261,13 @@ static void params_parse(const backend::ModelOptions* request,
        }
    }

+    // Add kv_overrides
+    if (request->overrides_size() > 0) {
+        for (int i = 0; i < request->overrides_size(); i++) {
+            string_parse_kv_override(request->overrides(i).c_str(), params.kv_overrides);
+        }
+    }
+
    // TODO: Add yarn

    if (!request->tensorsplit().empty()) {
--- a/backend/cpp/llama-cpp/package.sh
+++ b/backend/cpp/llama-cpp/package.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+# This script is used in the final stage of the Dockerfile
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+# Create lib directory
+mkdir -p $CURDIR/package/lib
+
+cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
+cp -rfv $CURDIR/run.sh $CURDIR/package/
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+echo "Packaging completed successfully" 
+ls -liah $CURDIR/package/
+ls -liah $CURDIR/package/lib/
--- a/backend/cpp/llama-cpp/patches/01-llava.patch
+++ b/backend/cpp/llama-cpp/patches/01-llava.patch
--- a/backend/cpp/llama-cpp/prepare.sh
+++ b/backend/cpp/llama-cpp/prepare.sh
--- a/backend/cpp/llama-cpp/run.sh
+++ b/backend/cpp/llama-cpp/run.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -ex
+
+# Get the absolute current dir where the script is located
+CURDIR=$(dirname "$(realpath $0)")
+
+cd /
+
+echo "CPU info:"
+grep -e "model\sname" /proc/cpuinfo | head -1
+grep -e "flags" /proc/cpuinfo | head -1
+
+BINARY=llama-cpp-fallback
+
+if grep -q -e "\savx\s" /proc/cpuinfo ; then
+	echo "CPU:    AVX    found OK"
+	if [ -e $CURDIR/llama-cpp-avx ]; then
+		BINARY=llama-cpp-avx
+	fi
+fi
+
+if grep -q -e "\savx2\s" /proc/cpuinfo ; then
+	echo "CPU:    AVX2   found OK"
+	if [ -e $CURDIR/llama-cpp-avx2 ]; then
+		BINARY=llama-cpp-avx2
+	fi
+fi
+
+# Check avx 512
+if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
+	echo "CPU:    AVX512F found OK"
+	if [ -e $CURDIR/llama-cpp-avx512 ]; then
+		BINARY=llama-cpp-avx512
+	fi
+fi
+
+if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
+	if [ -e $CURDIR/llama-cpp-grpc ]; then
+		BINARY=llama-cpp-grpc
+	fi
+fi
+ 
+# Extend ld library path with the dir where this script is located/lib
+if [ "$(uname)" == "Darwin" ]; then
+	DYLD_FALLBACK_LIBRARY_PATH=$CURDIR/lib:$DYLD_FALLBACK_LIBRARY_PATH
+else
+	export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+fi
+
+# If there is a lib/ld.so, use it
+if [ -f $CURDIR/lib/ld.so ]; then
+	echo "Using lib/ld.so"
+	echo "Using binary: $BINARY"
+	exec $CURDIR/lib/ld.so $CURDIR/$BINARY "$@"
+fi
+
+echo "Using binary: $BINARY"
+exec $CURDIR/$BINARY "$@"
+
+# In case we fail execing, just run fallback
+exec $CURDIR/llama-cpp-fallback "$@"
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -1,87 +0,0 @@
-
-LLAMA_VERSION?=
-LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
-
-CMAKE_ARGS?=
-BUILD_TYPE?=
-ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
-TARGET?=--target grpc-server
-
-# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
-
-# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
-ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DGGML_CUDA=ON
-# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-# to CMAKE_ARGS automatically
-else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-else ifeq ($(BUILD_TYPE),clblas)
-	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
-else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIP=ON
-# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
-# But if it's OSX without metal, disable it here
-else ifeq ($(OS),Darwin)
-	ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DGGML_METAL=OFF
-	else
-		CMAKE_ARGS+=-DGGML_METAL=ON
-		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
-		TARGET+=--target ggml-metal
-	endif
-endif
-
-ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DCMAKE_CXX_FLAGS="-fsycl" \
-		-DGGML_SYCL_F16=ON
-endif
-
-ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
-		-DCMAKE_C_COMPILER=icx \
-		-DCMAKE_CXX_COMPILER=icpx \
-		-DCMAKE_CXX_FLAGS="-fsycl"
-endif
-
-llama.cpp:
-	mkdir -p llama.cpp
-	cd llama.cpp && \
-	git init && \
-	git remote add origin $(LLAMA_REPO)  && \
-	git fetch origin && \
-	git checkout -b build $(LLAMA_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-llama.cpp/tools/grpc-server: llama.cpp
-	mkdir -p llama.cpp/tools/grpc-server
-	bash prepare.sh
-
-rebuild:
-	bash prepare.sh
-	rm -rf grpc-server
-	$(MAKE) grpc-server
-
-purge:
-	rm -rf llama.cpp/build
-	rm -rf llama.cpp/tools/grpc-server
-	rm -rf grpc-server
-
-clean: purge
-	rm -rf llama.cpp
-
-grpc-server: llama.cpp llama.cpp/tools/grpc-server
-	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
-ifneq (,$(findstring sycl,$(BUILD_TYPE)))
-	+bash -c "source $(ONEAPI_VARS); \
-	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
-else
-	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
-endif
-	cp llama.cpp/build/bin/grpc-server .
--- a/backend/go/bark-cpp/Makefile
+++ b/backend/go/bark-cpp/Makefile
@@ -0,0 +1,51 @@
+INCLUDE_PATH := $(abspath ./)
+LIBRARY_PATH := $(abspath ./)
+
+AR?=ar
+
+CMAKE_ARGS?=-DGGML_NATIVE=OFF
+BUILD_TYPE?=
+GOCMD=go
+# keep standard at C11 and C++11
+CXXFLAGS = -I. -I$(INCLUDE_PATH)/sources/bark.cpp/examples -I$(INCLUDE_PATH)/sources/bark.cpp/encodec.cpp/ggml/include -I$(INCLUDE_PATH)/sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
+LDFLAGS  = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/sources/bark.cpp/build/examples -lbark -lstdc++ -lm
+
+# bark.cpp
+BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
+BARKCPP_VERSION?=5d5be84f089ab9ea53b7a793f088d3fbf7247495
+
+# warnings
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+
+## bark.cpp
+sources/bark.cpp:
+	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
+	cd sources/bark.cpp && \
+	git checkout $(BARKCPP_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+sources/bark.cpp/build/libbark.a: sources/bark.cpp
+	cd sources/bark.cpp && \
+	mkdir -p build && \
+	cd build && \
+	cmake $(CMAKE_ARGS) .. && \
+	cmake --build . --config Release
+
+gobark.o:
+	$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
+
+libbark.a: sources/bark.cpp/build/libbark.a gobark.o
+	cp $(INCLUDE_PATH)/sources/bark.cpp/build/libbark.a ./
+	$(AR) rcs libbark.a gobark.o
+
+bark-cpp: libbark.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH="$(CURDIR)" LIBRARY_PATH=$(CURDIR) \
+	$(GOCMD) build -v -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o bark-cpp ./
+
+package:
+	bash package.sh
+
+build: bark-cpp package
+
+clean:
+	rm -f gobark.o libbark.a
--- a/backend/go/bark-cpp/gobark.cpp
+++ b/backend/go/bark-cpp/gobark.cpp
--- a/backend/go/bark-cpp/gobark.go
+++ b/backend/go/bark-cpp/gobark.go
@@ -1,7 +1,7 @@
 package main

-// #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
-// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
+// #cgo CXXFLAGS: -I${SRCDIR}/sources/bark.cpp/ -I${SRCDIR}/sources/bark.cpp/encodec.cpp -I${SRCDIR}/sources/bark.cpp/encodec.cpp/ggml/include -I${SRCDIR}/sources/bark.cpp/examples -I${SRCDIR}/sources/bark.cpp/spm-headers
+// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/sources/bark.cpp/build/examples -L${SRCDIR}/sources/bark.cpp/build/encodec.cpp/ggml/src/ -L${SRCDIR}/sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon -lggml -lgomp
 // #include <gobark.h>
 // #include <stdlib.h>
 import "C"
--- a/backend/go/bark-cpp/gobark.h
+++ b/backend/go/bark-cpp/gobark.h
--- a/backend/go/bark-cpp/main.go
+++ b/backend/go/bark-cpp/main.go
--- a/backend/go/bark-cpp/package.sh
+++ b/backend/go/bark-cpp/package.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+# This script is used in the final stage of the Dockerfile
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+# Create lib directory
+mkdir -p $CURDIR/package/lib
+cp -avrf $CURDIR/bark-cpp $CURDIR/package/
+cp -rfv $CURDIR/run.sh $CURDIR/package/
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+echo "Packaging completed successfully" 
+ls -liah $CURDIR/package/
+ls -liah $CURDIR/package/lib/
--- a/backend/go/bark-cpp/run.sh
+++ b/backend/go/bark-cpp/run.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+set -ex
+
+CURDIR=$(dirname "$(realpath $0)")
+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+
+# If there is a lib/ld.so, use it
+if [ -f $CURDIR/lib/ld.so ]; then
+	echo "Using lib/ld.so"
+	exec $CURDIR/lib/ld.so $CURDIR/bark-cpp "$@"
+fi
+
+exec $CURDIR/bark-cpp "$@"
--- a/backend/go/bark/Makefile
+++ b/backend/go/bark/Makefile
@@ -1,25 +0,0 @@
-INCLUDE_PATH := $(abspath ./)
-LIBRARY_PATH := $(abspath ./)
-
-AR?=ar
-
-BUILD_TYPE?=
-# keep standard at C11 and C++11
-CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
-LDFLAGS  = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
-
-# warnings
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
-
-gobark.o:
-	$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
-
-libbark.a: gobark.o
-	cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
-	$(AR) rcs libbark.a gobark.o
-	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
-	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
-	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
-
-clean:
-	rm -f gobark.o libbark.a
--- a/backend/go/bark/run.sh
+++ b/backend/go/bark/run.sh
@@ -1,3 +0,0 @@
-#!/bin/bash
-set -ex
-exec ./bark-cpp
--- a/backend/go/huggingface/Makefile
+++ b/backend/go/huggingface/Makefile
@@ -0,0 +1,9 @@
+GOCMD=go
+
+huggingface:
+	CGO_ENABLED=0 $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o huggingface ./
+
+package:
+	bash package.sh
+
+build: huggingface package
--- a/backend/go/llm/langchain/langchain.go
+++ b/backend/go/llm/langchain/langchain.go
--- a/backend/go/llm/langchain/main.go
+++ b/backend/go/llm/langchain/main.go
--- a/backend/go/huggingface/package.sh
+++ b/backend/go/huggingface/package.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+# This script is used in the final stage of the Dockerfile
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+mkdir -p $CURDIR/package
+cp -avrf $CURDIR/huggingface $CURDIR/package/
+cp -rfv $CURDIR/run.sh $CURDIR/package/
--- a/backend/go/huggingface/run.sh
+++ b/backend/go/huggingface/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -ex
+
+CURDIR=$(dirname "$(realpath $0)")
+
+exec $CURDIR/huggingface "$@"
--- a/backend/go/image/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/image/stablediffusion-ggml/gosd.cpp
@@ -1,231 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <time.h>
-#include <iostream>
-#include <random>
-#include <string>
-#include <vector>
-#include "gosd.h"
-
-// #include "preprocessing.hpp"
-#include "flux.hpp"
-#include "stable-diffusion.h"
-
-#define STB_IMAGE_IMPLEMENTATION
-#define STB_IMAGE_STATIC
-#include "stb_image.h"
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#define STB_IMAGE_WRITE_STATIC
-#include "stb_image_write.h"
-
-#define STB_IMAGE_RESIZE_IMPLEMENTATION
-#define STB_IMAGE_RESIZE_STATIC
-#include "stb_image_resize.h"
-
-// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
-const char* sample_method_str[] = {
-    "euler_a",
-    "euler",
-    "heun",
-    "dpm2",
-    "dpm++2s_a",
-    "dpm++2m",
-    "dpm++2mv2",
-    "ipndm",
-    "ipndm_v",
-    "lcm",
-    "ddim_trailing",
-    "tcd",
-};
-
-// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
-const char* schedule_str[] = {
-    "default",
-    "discrete",
-    "karras",
-    "exponential",
-    "ays",
-    "gits",
-};
-
-sd_ctx_t* sd_c;
-
-sample_method_t sample_method;
-
-int load_model(char *model, char* options[], int threads, int diff) {
-    fprintf (stderr, "Loading model!\n");
-
-    char *stableDiffusionModel = "";
-    if (diff == 1 ) {
-        stableDiffusionModel = model;
-        model = "";
-    }
-
-    // decode options. Options are in form optname:optvale, or if booleans only optname.
-    char *clip_l_path  = "";
-    char *clip_g_path  = "";
-    char *t5xxl_path  = "";
-    char *vae_path  = "";
-    char *scheduler = "";
-    char *sampler = "";
-
-    // If options is not NULL, parse options
-    for (int i = 0; options[i] != NULL; i++) {
-        char *optname = strtok(options[i], ":");
-        char *optval = strtok(NULL, ":");
-        if (optval == NULL) {
-            optval = "true";
-        }
-
-        if (!strcmp(optname, "clip_l_path")) {
-            clip_l_path = optval;
-        }
-        if (!strcmp(optname, "clip_g_path")) {
-            clip_g_path = optval;
-        }
-        if (!strcmp(optname, "t5xxl_path")) {
-            t5xxl_path = optval;
-        }
-        if (!strcmp(optname, "vae_path")) {
-            vae_path = optval;
-        }
-        if (!strcmp(optname, "scheduler")) {
-            scheduler = optval;
-        }
-        if (!strcmp(optname, "sampler")) {
-            sampler = optval;
-        }
-    }
-
-    int sample_method_found = -1;
-    for (int m = 0; m < N_SAMPLE_METHODS; m++) {
-        if (!strcmp(sampler, sample_method_str[m])) {
-            sample_method_found = m;
-        }
-    }
-    if (sample_method_found == -1) {
-        fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
-        sample_method_found = EULER_A;
-    }
-    sample_method = (sample_method_t)sample_method_found;
-
-    int schedule_found            = -1;
-    for (int d = 0; d < N_SCHEDULES; d++) {
-        if (!strcmp(scheduler, schedule_str[d])) {
-            schedule_found = d;
-                fprintf (stderr, "Found scheduler: %s\n", scheduler);
-
-        }
-    }
-
-    if (schedule_found == -1) {
-        fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
-        schedule_found = DEFAULT;
-    }
-
-    schedule_t schedule = (schedule_t)schedule_found;
-    
-    fprintf (stderr, "Creating context\n");
-    sd_ctx_t* sd_ctx = new_sd_ctx(model,
-                                  clip_l_path,
-                                  clip_g_path,
-                                  t5xxl_path,
-                                  stableDiffusionModel,
-                                  vae_path,
-                                  "",
-                                  "",
-                                  "",
-                                  "",
-                                  "",
-                                  false,
-                                  false,
-                                  false,
-                                  threads,
-                                  SD_TYPE_COUNT,
-                                  STD_DEFAULT_RNG,
-                                  schedule,
-                                  false,
-                                  false,
-                                  false,
-                                  false);
-
-    if (sd_ctx == NULL) {
-        fprintf (stderr, "failed loading model (generic error)\n");
-        return 1;
-    }
-    fprintf (stderr, "Created context: OK\n");
-
-    sd_c = sd_ctx;
-
-    return 0;
-}
-
-int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) {
-
-    sd_image_t* results;
-
-    std::vector<int> skip_layers = {7, 8, 9};
-
-    fprintf (stderr, "Generating image\n");
-
-    results = txt2img(sd_c,
-                            text,
-                            negativeText,
-                            -1, //clip_skip
-                            cfg_scale, // sfg_scale
-                            3.5f,
-			    0, // eta
-                            width,
-                            height,
-                            sample_method, 
-                            steps,
-                            seed,
-                            1,
-                            NULL,
-                            0.9f,
-                            20.f,
-                            false,
-                            "",
-                            skip_layers.data(),
-                            skip_layers.size(),
-                            0,
-                            0.01,
-                            0.2);
-
-    if (results == NULL) {
-        fprintf (stderr, "NO results\n");
-        return 1;
-    }
-
-    if (results[0].data == NULL) {
-        fprintf (stderr, "Results with no data\n");
-        return 1;
-    }
-
-    fprintf (stderr, "Writing PNG\n");
-
-    fprintf (stderr, "DST: %s\n", dst);
-    fprintf (stderr, "Width: %d\n", results[0].width);
-    fprintf (stderr, "Height: %d\n", results[0].height);
-    fprintf (stderr, "Channel: %d\n", results[0].channel);
-    fprintf (stderr, "Data: %p\n", results[0].data);
-
-    stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
-                       results[0].data, 0, NULL);
-    fprintf (stderr, "Saved resulting image to '%s'\n", dst);
-
-    // TODO: free results. Why does it crash?
-
-    free(results[0].data);
-    results[0].data = NULL;
-    free(results);
-    fprintf (stderr, "gen_image is done", dst);
-
-    return 0;
-}
-
-int unload() {
-    free_sd_ctx(sd_c);
-}
-
--- a/backend/go/image/stablediffusion-ggml/gosd.go
+++ b/backend/go/image/stablediffusion-ggml/gosd.go
@@ -1,96 +0,0 @@
-package main
-
-// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
-// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
-// #include <gosd.h>
-// #include <stdlib.h>
-import "C"
-
-import (
-	"fmt"
-	"os"
-	"path/filepath"
-	"strings"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/utils"
-)
-
-type SDGGML struct {
-	base.SingleThread
-	threads      int
-	sampleMethod string
-	cfgScale     float32
-}
-
-func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
-
-	sd.threads = int(opts.Threads)
-
-	modelFile := C.CString(opts.ModelFile)
-	defer C.free(unsafe.Pointer(modelFile))
-
-	var options **C.char
-	// prepare the options array to pass to C
-
-	size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
-	length := C.size_t(len(opts.Options))
-	options = (**C.char)(C.malloc(length * size))
-	view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0:len(opts.Options):len(opts.Options)]
-
-	var diffusionModel int
-
-	var oo []string
-	for _, op := range opts.Options {
-		if op == "diffusion_model" {
-			diffusionModel = 1
-			continue
-		}
-
-		// If it's an option path, we resolve absolute path from the model path
-		if strings.Contains(op, ":") && strings.Contains(op, "path") {
-			data := strings.Split(op, ":")
-			data[1] = filepath.Join(opts.ModelPath, data[1])
-			if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
-				oo = append(oo, strings.Join(data, ":"))
-			}
-		} else {
-			oo = append(oo, op)
-		}
-	}
-
-	fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
-
-	for i, x := range oo {
-		view[i] = C.CString(x)
-	}
-
-	sd.cfgScale = opts.CFGScale
-
-	ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel))
-	if ret != 0 {
-		return fmt.Errorf("could not load model")
-	}
-
-	return nil
-}
-
-func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
-	t := C.CString(opts.PositivePrompt)
-	defer C.free(unsafe.Pointer(t))
-
-	dst := C.CString(opts.Dst)
-	defer C.free(unsafe.Pointer(dst))
-
-	negative := C.CString(opts.NegativePrompt)
-	defer C.free(unsafe.Pointer(negative))
-
-	ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale))
-	if ret != 0 {
-		return fmt.Errorf("inference failed")
-	}
-
-	return nil
-}
--- a/backend/go/image/stablediffusion-ggml/gosd.h
+++ b/backend/go/image/stablediffusion-ggml/gosd.h
@@ -1,8 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-int load_model(char *model, char* options[], int threads, int diffusionModel);
-int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale);
-#ifdef __cplusplus
-}
-#endif
--- a/backend/go/local-store/Makefile
+++ b/backend/go/local-store/Makefile
@@ -0,0 +1,9 @@
+GOCMD=go
+
+local-store:
+	CGO_ENABLED=0 $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o local-store ./
+
+package:
+	bash package.sh
+
+build: local-store package
--- a/backend/go/local-store/debug.go
+++ b/backend/go/local-store/debug.go
--- a/backend/go/local-store/main.go
+++ b/backend/go/local-store/main.go
--- a/backend/go/local-store/package.sh
+++ b/backend/go/local-store/package.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+# This script is used in the final stage of the Dockerfile
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+mkdir -p $CURDIR/package
+cp -avrf $CURDIR/local-store $CURDIR/package/
+cp -rfv $CURDIR/run.sh $CURDIR/package/
--- a/backend/go/local-store/production.go
+++ b/backend/go/local-store/production.go
--- a/backend/go/local-store/run.sh
+++ b/backend/go/local-store/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -ex
+
+CURDIR=$(dirname "$(realpath $0)")
+
+exec $CURDIR/local-store "$@"
--- a/backend/go/local-store/store.go
+++ b/backend/go/local-store/store.go
@@ -4,6 +4,7 @@ package main
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"container/heap"
+	"errors"
 	"fmt"
 	"math"
 	"slices"
@@ -99,6 +100,9 @@ func sortIntoKeySlicese(keys []*pb.StoresKey) [][]float32 {
 }

 func (s *Store) Load(opts *pb.ModelOptions) error {
+	if opts.Model != "" {
+		return errors.New("not implemented")
+	}
 	return nil
 }

@@ -315,7 +319,7 @@ func isNormalized(k []float32) bool {

 	for _, v := range k {
 		v64 := float64(v)
-		sum += v64*v64
+		sum += v64 * v64
 	}

 	s := math.Sqrt(sum)
--- a/backend/go/piper/Makefile
+++ b/backend/go/piper/Makefile
@@ -0,0 +1,37 @@
+
+# go-piper version
+PIPER_REPO?=https://github.com/mudler/go-piper
+PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
+
+CURRENT_DIR=$(abspath ./)
+GOCMD=go
+
+PIPER_CGO_CXXFLAGS+=-I$(CURRENT_DIR)/sources/go-piper/piper/src/cpp -I$(CURRENT_DIR)/sources/go-piper/piper/build/fi/include -I$(CURRENT_DIR)/sources/go-piper/piper/build/pi/include -I$(CURRENT_DIR)/sources/go-piper/piper/build/si/include
+PIPER_CGO_LDFLAGS+=-L$(CURRENT_DIR)/sources/go-piper/piper/build/fi/lib -L$(CURRENT_DIR)/sources/go-piper/piper/build/pi/lib -L$(CURRENT_DIR)/sources/go-piper/piper/build/si/lib -lfmt -lspdlog -lucd
+
+## go-piper
+sources/go-piper:
+	mkdir -p sources/go-piper
+	cd sources/go-piper && \
+	git init && \
+	git remote add origin $(PIPER_REPO) && \
+	git fetch origin && \
+	git checkout $(PIPER_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+sources/go-piper/libpiper_binding.a: sources/go-piper
+	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
+
+espeak-ng-data: sources/go-piper sources/go-piper/libpiper_binding.a
+	mkdir -p espeak-ng-data
+	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. espeak-ng-data
+
+piper: sources/go-piper sources/go-piper/libpiper_binding.a espeak-ng-data
+	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURRENT_DIR)/sources/go-piper
+	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURRENT_DIR)/sources/go-piper \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o piper ./
+
+package:
+	bash package.sh
+
+build: piper package
--- a/backend/go/piper/main.go
+++ b/backend/go/piper/main.go
--- a/backend/go/piper/package.sh
+++ b/backend/go/piper/package.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+# This script is used in the final stage of the Dockerfile
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+# Create lib directory
+mkdir -p $CURDIR/package/lib
+
+cp -avrf $CURDIR/piper $CURDIR/package/
+cp -avrf $CURDIR/espeak-ng-data $CURDIR/package/
+cp -rfv $CURDIR/run.sh $CURDIR/package/
+cp -rfLv $CURDIR/sources/go-piper/piper-phonemize/pi/lib/* $CURDIR/package/lib/
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+echo "Packaging completed successfully" 
+ls -liah $CURDIR/package/
+ls -liah $CURDIR/package/lib/
--- a/backend/go/piper/piper.go
+++ b/backend/go/piper/piper.go
@@ -23,7 +23,7 @@ func (sd *Piper) Load(opts *pb.ModelOptions) error {
 	}
 	var err error
 	// Note: the Model here is a path to a directory containing the model files
-	sd.piper, err = New(opts.LibrarySearchPath)
+	sd.piper, err = New(os.Getenv("ESPEAK_NG_DATA"))
 	return err
 }

--- a/backend/go/piper/run.sh
+++ b/backend/go/piper/run.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -ex
+
+CURDIR=$(dirname "$(realpath $0)")
+
+export ESPEAK_NG_DATA=$CURDIR/espeak-ng-data
+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+
+# If there is a lib/ld.so, use it
+if [ -f $CURDIR/lib/ld.so ]; then
+	echo "Using lib/ld.so"
+	exec $CURDIR/lib/ld.so $CURDIR/piper "$@"
+fi
+
+exec $CURDIR/piper "$@"
--- a/backend/go/silero-vad/Makefile
+++ b/backend/go/silero-vad/Makefile
@@ -0,0 +1,47 @@
+
+CURRENT_DIR=$(abspath ./)
+GOCMD=go
+
+ONNX_VERSION?=1.20.0
+ONNX_ARCH?=x64
+ONNX_OS?=linux
+
+# Detect if we are running on arm64
+ifneq (,$(findstring aarch64,$(shell uname -m)))
+	ONNX_ARCH=aarch64
+endif
+
+ifeq ($(OS),Darwin)
+	ONNX_OS=osx
+	ifneq (,$(findstring aarch64,$(shell uname -m)))
+		ONNX_ARCH=arm64
+	else ifneq (,$(findstring arm64,$(shell uname -m)))
+		ONNX_ARCH=arm64
+	else
+		ONNX_ARCH=x86_64
+	endif
+endif
+
+sources/onnxruntime:
+	mkdir -p sources/onnxruntime
+	curl -L https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
+	cd sources/onnxruntime && tar -xvf onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz && rm onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
+	cd sources/onnxruntime && mv onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION)/* ./
+
+backend-assets/lib/libonnxruntime.so.1: sources/onnxruntime
+	mkdir -p backend-assets/lib
+	cp -rfLv sources/onnxruntime/lib/* backend-assets/lib/
+ifeq ($(OS),Darwin)
+	mv backend-assets/lib/libonnxruntime.$(ONNX_VERSION).dylib backend-assets/lib/libonnxruntime.dylib
+else
+	mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1
+endif
+
+silero-vad: backend-assets/lib/libonnxruntime.so.1
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURRENT_DIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURRENT_DIR)/backend-assets/lib \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o silero-vad ./
+
+package:
+	bash package.sh
+
+build: silero-vad package
--- a/backend/go/silero-vad/main.go
+++ b/backend/go/silero-vad/main.go
--- a/backend/go/silero-vad/package.sh
+++ b/backend/go/silero-vad/package.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+# This script is used in the final stage of the Dockerfile
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+# Create lib directory
+mkdir -p $CURDIR/package/lib
+
+cp -avrf $CURDIR/silero-vad $CURDIR/package/
+cp -avrf $CURDIR/run.sh $CURDIR/package/
+cp -rfLv $CURDIR/backend-assets/lib/* $CURDIR/package/lib/
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+echo "Packaging completed successfully" 
+ls -liah $CURDIR/package/
+ls -liah $CURDIR/package/lib/
--- a/backend/go/silero-vad/run.sh
+++ b/backend/go/silero-vad/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -ex
+
+CURDIR=$(dirname "$(realpath $0)")
+
+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+
+# If there is a lib/ld.so, use it
+if [ -f $CURDIR/lib/ld.so ]; then
+	echo "Using lib/ld.so"
+	exec $CURDIR/lib/ld.so $CURDIR/silero-vad "$@"
+fi
+
+exec $CURDIR/silero-vad "$@"
--- a/backend/go/silero-vad/vad.go
+++ b/backend/go/silero-vad/vad.go
--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -4,9 +4,11 @@ LIBRARY_PATH := $(abspath ./)
 AR?=ar
 CMAKE_ARGS?=
 BUILD_TYPE?=
+NATIVE?=false
+CUDA_LIBPATH?=/usr/local/cuda/lib64/
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++11
-CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
+CXXFLAGS = -I. -I$(INCLUDE_PATH)/sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC

 GOCMD?=go
 CGO_LDFLAGS?=
@@ -15,12 +17,21 @@ CGO_LDFLAGS_SYCL=
 GO_TAGS?=
 LD_FLAGS?=

+# stablediffusion.cpp (ggml)
+STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_VERSION?=f6b9aa1a4373e322ff12c15b8a0749e6dd6f0253
+
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DSD_CUDA=ON
+	CMAKE_ARGS+=-DSD_CUDA=ON -DGGML_CUDA=ON
+	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
 # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
@@ -30,14 +41,17 @@ else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DSD_HIPBLAS=ON
+	CMAKE_ARGS+=-DSD_HIPBLAS=ON -DGGML_HIPBLAS=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DSD_VULKAN=ON -DGGML_VULKAN=ON
+	CGO_LDFLAGS+=-lvulkan
 else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DSD_METAL=OFF
+		CMAKE_ARGS+=-DSD_METAL=OFF -DGGML_METAL=OFF
 	else
-		CMAKE_ARGS+=-DSD_METAL=ON
+		CMAKE_ARGS+=-DSD_METAL=ON -DGGML_METAL=ON
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		TARGET+=--target ggml-metal
 	endif
@@ -49,8 +63,8 @@ ifeq ($(BUILD_TYPE),sycl_f16)
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DSD_SYCL=ON \
 		-DGGML_SYCL_F16=ON
-	CC=icx
-	CXX=icpx
+	export CC=icx
+	export CXX=icpx
 	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
 	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
 	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
@@ -62,8 +76,8 @@ ifeq ($(BUILD_TYPE),sycl_f32)
 		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DSD_SYCL=ON
-	CC=icx
-	CXX=icpx
+	export CC=icx
+	export CXX=icpx
 	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
 	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
 	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
@@ -77,23 +91,18 @@ endif
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
 GGML_ARCHIVE_DIR := build/ggml/src/
 ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
+ALL_OBJS := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.o')

 # Name of the single merged library
 COMBINED_LIB := libggmlall.a

-# Rule to merge all the .a files into one
+# Instead of using the archives generated by GGML, use the object files directly to avoid overwriting objects with the same base name
 $(COMBINED_LIB): $(ALL_ARCHIVES)
-	@echo "Merging all .a into $(COMBINED_LIB)"
+	@echo "Merging all .o into $(COMBINED_LIB): $(ALL_OBJS)"
 	rm -f $@
-	mkdir -p merge-tmp
-	for a in $(ALL_ARCHIVES); do \
-		( cd merge-tmp && ar x ../$$a ); \
-	done
-	( cd merge-tmp && ar rcs ../$@ *.o )
+	ar -qc $@ $(ALL_OBJS)
 	# Ensure we have a proper index
 	ranlib $@
-	# Clean up
-	rm -rf merge-tmp

 build/libstable-diffusion.a:
 	@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
@@ -101,12 +110,12 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
 	mkdir -p build && \
 	cd build && \
-	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
+	cmake $(CMAKE_ARGS) ../sources/stablediffusion-ggml.cpp && \
 	cmake --build . --config Release"
 else
 	mkdir -p build && \
 	cd build && \
-	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
+	cmake $(CMAKE_ARGS) ../sources/stablediffusion-ggml.cpp && \
 	cmake --build . --config Release
 endif
 	$(MAKE) $(COMBINED_LIB)
@@ -119,17 +128,26 @@ else
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
 endif

-libsd.a: gosd.o
+## stablediffusion (ggml)
+sources/stablediffusion-ggml.cpp:
+	git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
+	cd sources/stablediffusion-ggml.cpp && \
+	git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+libsd.a: sources/stablediffusion-ggml.cpp build/libstable-diffusion.a gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o

-stablediffusion-ggml:
+stablediffusion-ggml: libsd.a
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
 	CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
-ifneq ($(UPX),)
-	$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
-endif
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o stablediffusion-ggml ./
+
+package:
+	bash package.sh
+
+build: stablediffusion-ggml package

 clean:
 	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
--- a/backend/go/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/stablediffusion-ggml/gosd.cpp
@@ -0,0 +1,463 @@
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <iostream>
+#include <random>
+#include <string>
+#include <vector>
+#include <filesystem>
+#include "gosd.h"
+
+// #include "preprocessing.hpp"
+#include "flux.hpp"
+#include "stable-diffusion.h"
+
+#define STB_IMAGE_IMPLEMENTATION
+#define STB_IMAGE_STATIC
+#include "stb_image.h"
+
+#define STB_IMAGE_WRITE_IMPLEMENTATION
+#define STB_IMAGE_WRITE_STATIC
+#include "stb_image_write.h"
+
+#define STB_IMAGE_RESIZE_IMPLEMENTATION
+#define STB_IMAGE_RESIZE_STATIC
+#include "stb_image_resize.h"
+
+// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
+const char* sample_method_str[] = {
+    "euler_a",
+    "euler",
+    "heun",
+    "dpm2",
+    "dpm++2s_a",
+    "dpm++2m",
+    "dpm++2mv2",
+    "ipndm",
+    "ipndm_v",
+    "lcm",
+    "ddim_trailing",
+    "tcd",
+};
+
+// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
+const char* schedule_str[] = {
+    "default",
+    "discrete",
+    "karras",
+    "exponential",
+    "ays",
+    "gits",
+};
+
+sd_ctx_t* sd_c;
+
+sample_method_t sample_method;
+
+// Copied from the upstream CLI
+void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) {
+    //SDParams* params = (SDParams*)data;
+    const char* level_str;
+
+    if (!log /*|| (!params->verbose && level <= SD_LOG_DEBUG)*/) {
+        return;
+    }
+
+    switch (level) {
+        case SD_LOG_DEBUG:
+            level_str = "DEBUG";
+            break;
+        case SD_LOG_INFO:
+            level_str = "INFO";
+            break;
+        case SD_LOG_WARN:
+            level_str = "WARN";
+            break;
+        case SD_LOG_ERROR:
+            level_str = "ERROR";
+            break;
+        default: /* Potential future-proofing */
+            level_str = "?????";
+            break;
+    }
+
+    fprintf(stderr, "[%-5s] ", level_str);
+    fputs(log, stderr);
+    fflush(stderr);
+}
+
+int load_model(char *model, char *model_path, char* options[], int threads, int diff) {
+    fprintf (stderr, "Loading model!\n");
+
+    sd_set_log_callback(sd_log_cb, NULL);
+
+    char *stableDiffusionModel = "";
+    if (diff == 1 ) {
+        stableDiffusionModel = model;
+        model = "";
+    }
+
+    // decode options. Options are in form optname:optvale, or if booleans only optname.
+    char *clip_l_path  = "";
+    char *clip_g_path  = "";
+    char *t5xxl_path  = "";
+    char *vae_path  = "";
+    char *scheduler = "";
+    char *sampler = "";
+    char *lora_dir = model_path;
+    bool lora_dir_allocated = false;
+
+    fprintf(stderr, "parsing options\n");
+
+    // If options is not NULL, parse options
+    for (int i = 0; options[i] != NULL; i++) {
+        char *optname = strtok(options[i], ":");
+        char *optval = strtok(NULL, ":");
+        if (optval == NULL) {
+            optval = "true";
+        }
+
+        if (!strcmp(optname, "clip_l_path")) {
+            clip_l_path = optval;
+        }
+        if (!strcmp(optname, "clip_g_path")) {
+            clip_g_path = optval;
+        }
+        if (!strcmp(optname, "t5xxl_path")) {
+            t5xxl_path = optval;
+        }
+        if (!strcmp(optname, "vae_path")) {
+            vae_path = optval;
+        }
+        if (!strcmp(optname, "scheduler")) {
+            scheduler = optval;
+        }
+        if (!strcmp(optname, "sampler")) {
+            sampler = optval;
+        }
+        if (!strcmp(optname, "lora_dir")) {
+            // Path join with model dir
+            if (model_path && strlen(model_path) > 0) {
+                std::filesystem::path model_path_str(model_path);
+                std::filesystem::path lora_path(optval);
+                std::filesystem::path full_lora_path = model_path_str / lora_path;
+                lora_dir = strdup(full_lora_path.string().c_str());
+                lora_dir_allocated = true;
+                fprintf(stderr, "Lora dir resolved to: %s\n", lora_dir);
+            } else {
+                lora_dir = optval;
+                fprintf(stderr, "No model path provided, using lora dir as-is: %s\n", lora_dir);
+            }
+        }
+    }
+
+    fprintf(stderr, "parsed options\n");
+
+    int sample_method_found = -1;
+    for (int m = 0; m < SAMPLE_METHOD_COUNT; m++) {
+        if (!strcmp(sampler, sample_method_str[m])) {
+            sample_method_found = m;
+            fprintf(stderr, "Found sampler: %s\n", sampler);
+        }
+    }
+    if (sample_method_found == -1) {
+        fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
+        sample_method_found = EULER_A;
+    }
+    sample_method = (sample_method_t)sample_method_found;
+
+    int schedule_found            = -1;
+    for (int d = 0; d < SCHEDULE_COUNT; d++) {
+        if (!strcmp(scheduler, schedule_str[d])) {
+            schedule_found = d;
+                fprintf (stderr, "Found scheduler: %s\n", scheduler);
+
+        }
+    }
+
+    if (schedule_found == -1) {
+        fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
+        schedule_found = DEFAULT;
+    }
+
+    schedule_t schedule = (schedule_t)schedule_found;
+
+    fprintf (stderr, "Creating context\n");
+    sd_ctx_params_t ctx_params;
+    sd_ctx_params_init(&ctx_params);
+    ctx_params.model_path = model;
+    ctx_params.clip_l_path = clip_l_path;
+    ctx_params.clip_g_path = clip_g_path;
+    ctx_params.t5xxl_path = t5xxl_path;
+    ctx_params.diffusion_model_path = stableDiffusionModel;
+    ctx_params.vae_path = vae_path;
+    ctx_params.taesd_path = "";
+    ctx_params.control_net_path = "";
+    ctx_params.lora_model_dir = lora_dir;
+    ctx_params.embedding_dir = "";
+    ctx_params.stacked_id_embed_dir = "";
+    ctx_params.vae_decode_only = false;
+    ctx_params.vae_tiling = false;
+    ctx_params.free_params_immediately = false;
+    ctx_params.n_threads = threads;
+    ctx_params.rng_type = STD_DEFAULT_RNG;
+    ctx_params.schedule = schedule;
+    sd_ctx_t* sd_ctx = new_sd_ctx(&ctx_params);
+
+    if (sd_ctx == NULL) {
+        fprintf (stderr, "failed loading model (generic error)\n");
+        // Clean up allocated memory
+        if (lora_dir_allocated && lora_dir) {
+            free(lora_dir);
+        }
+        return 1;
+    }
+    fprintf (stderr, "Created context: OK\n");
+
+    sd_c = sd_ctx;
+
+    // Clean up allocated memory
+    if (lora_dir_allocated && lora_dir) {
+        free(lora_dir);
+    }
+
+    return 0;
+}
+
+int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count) {
+
+    sd_image_t* results;
+
+    std::vector<int> skip_layers = {7, 8, 9};
+
+    fprintf (stderr, "Generating image\n");
+
+    sd_img_gen_params_t p;
+    sd_img_gen_params_init(&p);
+
+    p.prompt = text;
+    p.negative_prompt = negativeText;
+    p.guidance.txt_cfg = cfg_scale;
+    p.guidance.slg.layers = skip_layers.data();
+    p.guidance.slg.layer_count = skip_layers.size();
+    p.width = width;
+    p.height = height;
+    p.sample_method = sample_method;
+    p.sample_steps = steps;
+    p.seed = seed;
+    p.input_id_images_path = "";
+
+    // Handle input image for img2img
+    bool has_input_image = (src_image != NULL && strlen(src_image) > 0);
+    bool has_mask_image = (mask_image != NULL && strlen(mask_image) > 0);
+    
+    uint8_t* input_image_buffer = NULL;
+    uint8_t* mask_image_buffer = NULL;
+    std::vector<uint8_t> default_mask_image_vec;
+    
+    if (has_input_image) {
+        fprintf(stderr, "Loading input image: %s\n", src_image);
+        
+        int c = 0;
+        int img_width = 0;
+        int img_height = 0;
+        input_image_buffer = stbi_load(src_image, &img_width, &img_height, &c, 3);
+        if (input_image_buffer == NULL) {
+            fprintf(stderr, "Failed to load input image from '%s'\n", src_image);
+            return 1;
+        }
+        if (c < 3) {
+            fprintf(stderr, "Input image must have at least 3 channels, got %d\n", c);
+            free(input_image_buffer);
+            return 1;
+        }
+        
+        // Resize input image if dimensions don't match
+        if (img_width != width || img_height != height) {
+            fprintf(stderr, "Resizing input image from %dx%d to %dx%d\n", img_width, img_height, width, height);
+            
+            uint8_t* resized_image_buffer = (uint8_t*)malloc(height * width * 3);
+            if (resized_image_buffer == NULL) {
+                fprintf(stderr, "Failed to allocate memory for resized image\n");
+                free(input_image_buffer);
+                return 1;
+            }
+            
+            stbir_resize(input_image_buffer, img_width, img_height, 0,
+                         resized_image_buffer, width, height, 0, STBIR_TYPE_UINT8,
+                         3, STBIR_ALPHA_CHANNEL_NONE, 0,
+                         STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
+                         STBIR_FILTER_BOX, STBIR_FILTER_BOX,
+                         STBIR_COLORSPACE_SRGB, nullptr);
+            
+            free(input_image_buffer);
+            input_image_buffer = resized_image_buffer;
+        }
+        
+        p.init_image = {(uint32_t)width, (uint32_t)height, 3, input_image_buffer};
+        p.strength = strength;
+        fprintf(stderr, "Using img2img with strength: %.2f\n", strength);
+    } else {
+        // No input image, use empty image for text-to-image
+        p.init_image = {(uint32_t)width, (uint32_t)height, 3, NULL};
+        p.strength = 0.0f;
+    }
+    
+    // Handle mask image for inpainting
+    if (has_mask_image) {
+        fprintf(stderr, "Loading mask image: %s\n", mask_image);
+        
+        int c = 0;
+        int mask_width = 0;
+        int mask_height = 0;
+        mask_image_buffer = stbi_load(mask_image, &mask_width, &mask_height, &c, 1);
+        if (mask_image_buffer == NULL) {
+            fprintf(stderr, "Failed to load mask image from '%s'\n", mask_image);
+            if (input_image_buffer) free(input_image_buffer);
+            return 1;
+        }
+        
+        // Resize mask if dimensions don't match
+        if (mask_width != width || mask_height != height) {
+            fprintf(stderr, "Resizing mask image from %dx%d to %dx%d\n", mask_width, mask_height, width, height);
+            
+            uint8_t* resized_mask_buffer = (uint8_t*)malloc(height * width);
+            if (resized_mask_buffer == NULL) {
+                fprintf(stderr, "Failed to allocate memory for resized mask\n");
+                free(mask_image_buffer);
+                if (input_image_buffer) free(input_image_buffer);
+                return 1;
+            }
+            
+            stbir_resize(mask_image_buffer, mask_width, mask_height, 0,
+                         resized_mask_buffer, width, height, 0, STBIR_TYPE_UINT8,
+                         1, STBIR_ALPHA_CHANNEL_NONE, 0,
+                         STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
+                         STBIR_FILTER_BOX, STBIR_FILTER_BOX,
+                         STBIR_COLORSPACE_SRGB, nullptr);
+            
+            free(mask_image_buffer);
+            mask_image_buffer = resized_mask_buffer;
+        }
+        
+        p.mask_image = {(uint32_t)width, (uint32_t)height, 1, mask_image_buffer};
+        fprintf(stderr, "Using inpainting with mask\n");
+    } else {
+        // No mask image, create default full mask
+        default_mask_image_vec.resize(width * height, 255);
+        p.mask_image = {(uint32_t)width, (uint32_t)height, 1, default_mask_image_vec.data()};
+    }
+
+    // Handle reference images
+    std::vector<sd_image_t> ref_images_vec;
+    std::vector<uint8_t*> ref_image_buffers;
+    
+    if (ref_images_count > 0 && ref_images != NULL) {
+        fprintf(stderr, "Loading %d reference images\n", ref_images_count);
+        
+        for (int i = 0; i < ref_images_count; i++) {
+            if (ref_images[i] == NULL || strlen(ref_images[i]) == 0) {
+                continue;
+            }
+            
+            fprintf(stderr, "Loading reference image %d: %s\n", i + 1, ref_images[i]);
+            
+            int c = 0;
+            int ref_width = 0;
+            int ref_height = 0;
+            uint8_t* ref_image_buffer = stbi_load(ref_images[i], &ref_width, &ref_height, &c, 3);
+            if (ref_image_buffer == NULL) {
+                fprintf(stderr, "Failed to load reference image from '%s'\n", ref_images[i]);
+                continue;
+            }
+            if (c < 3) {
+                fprintf(stderr, "Reference image must have at least 3 channels, got %d\n", c);
+                free(ref_image_buffer);
+                continue;
+            }
+            
+            // Resize reference image if dimensions don't match
+            if (ref_width != width || ref_height != height) {
+                fprintf(stderr, "Resizing reference image from %dx%d to %dx%d\n", ref_width, ref_height, width, height);
+                
+                uint8_t* resized_ref_buffer = (uint8_t*)malloc(height * width * 3);
+                if (resized_ref_buffer == NULL) {
+                    fprintf(stderr, "Failed to allocate memory for resized reference image\n");
+                    free(ref_image_buffer);
+                    continue;
+                }
+                
+                stbir_resize(ref_image_buffer, ref_width, ref_height, 0,
+                             resized_ref_buffer, width, height, 0, STBIR_TYPE_UINT8,
+                             3, STBIR_ALPHA_CHANNEL_NONE, 0,
+                             STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP,
+                             STBIR_FILTER_BOX, STBIR_FILTER_BOX,
+                             STBIR_COLORSPACE_SRGB, nullptr);
+                
+                free(ref_image_buffer);
+                ref_image_buffer = resized_ref_buffer;
+            }
+            
+            ref_image_buffers.push_back(ref_image_buffer);
+            ref_images_vec.push_back({(uint32_t)width, (uint32_t)height, 3, ref_image_buffer});
+        }
+        
+        if (!ref_images_vec.empty()) {
+            p.ref_images = ref_images_vec.data();
+            p.ref_images_count = ref_images_vec.size();
+            fprintf(stderr, "Using %zu reference images\n", ref_images_vec.size());
+        }
+    }
+
+    results = generate_image(sd_c, &p);
+
+    if (results == NULL) {
+        fprintf (stderr, "NO results\n");
+        if (input_image_buffer) free(input_image_buffer);
+        if (mask_image_buffer) free(mask_image_buffer);
+        for (auto buffer : ref_image_buffers) {
+            if (buffer) free(buffer);
+        }
+        return 1;
+    }
+
+    if (results[0].data == NULL) {
+        fprintf (stderr, "Results with no data\n");
+        if (input_image_buffer) free(input_image_buffer);
+        if (mask_image_buffer) free(mask_image_buffer);
+        for (auto buffer : ref_image_buffers) {
+            if (buffer) free(buffer);
+        }
+        return 1;
+    }
+
+    fprintf (stderr, "Writing PNG\n");
+
+    fprintf (stderr, "DST: %s\n", dst);
+    fprintf (stderr, "Width: %d\n", results[0].width);
+    fprintf (stderr, "Height: %d\n", results[0].height);
+    fprintf (stderr, "Channel: %d\n", results[0].channel);
+    fprintf (stderr, "Data: %p\n", results[0].data);
+
+    stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
+                       results[0].data, 0, NULL);
+    fprintf (stderr, "Saved resulting image to '%s'\n", dst);
+
+    // Clean up
+    free(results[0].data);
+    results[0].data = NULL;
+    free(results);
+    if (input_image_buffer) free(input_image_buffer);
+    if (mask_image_buffer) free(mask_image_buffer);
+    for (auto buffer : ref_image_buffers) {
+        if (buffer) free(buffer);
+    }
+    fprintf (stderr, "gen_image is done", dst);
+
+    return 0;
+}
+
+int unload() {
+    free_sd_ctx(sd_c);
+}
+
--- a/backend/go/stablediffusion-ggml/gosd.go
+++ b/backend/go/stablediffusion-ggml/gosd.go
@@ -0,0 +1,151 @@
+package main
+
+// #cgo CXXFLAGS: -I${SRCDIR}/sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/sources/stablediffusion-ggml.cpp -I${SRCDIR}/sources/stablediffusion-ggml.cpp/ggml/include
+// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
+// #include <gosd.h>
+// #include <stdlib.h>
+import "C"
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/utils"
+)
+
+type SDGGML struct {
+	base.SingleThread
+	threads      int
+	sampleMethod string
+	cfgScale     float32
+}
+
+func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
+
+	sd.threads = int(opts.Threads)
+
+	modelPath := opts.ModelPath
+
+	modelFile := C.CString(opts.ModelFile)
+	defer C.free(unsafe.Pointer(modelFile))
+
+	modelPathC := C.CString(modelPath)
+	defer C.free(unsafe.Pointer(modelPathC))
+
+	var options **C.char
+	// prepare the options array to pass to C
+
+	size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
+	length := C.size_t(len(opts.Options))
+	options = (**C.char)(C.malloc((length + 1) * size))
+	view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0 : len(opts.Options)+1 : len(opts.Options)+1]
+
+	var diffusionModel int
+
+	var oo []string
+	for _, op := range opts.Options {
+		if op == "diffusion_model" {
+			diffusionModel = 1
+			continue
+		}
+
+		// If it's an option path, we resolve absolute path from the model path
+		if strings.Contains(op, ":") && strings.Contains(op, "path") {
+			data := strings.Split(op, ":")
+			data[1] = filepath.Join(opts.ModelPath, data[1])
+			if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
+				oo = append(oo, strings.Join(data, ":"))
+			}
+		} else {
+			oo = append(oo, op)
+		}
+	}
+
+	fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
+
+	for i, x := range oo {
+		view[i] = C.CString(x)
+	}
+	view[len(oo)] = nil
+
+	sd.cfgScale = opts.CFGScale
+
+	ret := C.load_model(modelFile, modelPathC, options, C.int(opts.Threads), C.int(diffusionModel))
+	if ret != 0 {
+		return fmt.Errorf("could not load model")
+	}
+
+	return nil
+}
+
+func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
+	t := C.CString(opts.PositivePrompt)
+	defer C.free(unsafe.Pointer(t))
+
+	dst := C.CString(opts.Dst)
+	defer C.free(unsafe.Pointer(dst))
+
+	negative := C.CString(opts.NegativePrompt)
+	defer C.free(unsafe.Pointer(negative))
+
+	// Handle source image path
+	var srcImage *C.char
+	if opts.Src != "" {
+		srcImage = C.CString(opts.Src)
+		defer C.free(unsafe.Pointer(srcImage))
+	}
+
+	// Handle mask image path
+	var maskImage *C.char
+	if opts.EnableParameters != "" {
+		// Parse EnableParameters for mask path if provided
+		// This is a simple approach - in a real implementation you might want to parse JSON
+		if strings.Contains(opts.EnableParameters, "mask:") {
+			parts := strings.Split(opts.EnableParameters, "mask:")
+			if len(parts) > 1 {
+				maskPath := strings.TrimSpace(parts[1])
+				if maskPath != "" {
+					maskImage = C.CString(maskPath)
+					defer C.free(unsafe.Pointer(maskImage))
+				}
+			}
+		}
+	}
+
+	// Handle reference images
+	var refImages **C.char
+	var refImagesCount C.int
+	if len(opts.RefImages) > 0 {
+		refImagesCount = C.int(len(opts.RefImages))
+		// Allocate array of C strings
+		size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
+		refImages = (**C.char)(C.malloc((C.size_t(len(opts.RefImages)) + 1) * size))
+		view := (*[1 << 30]*C.char)(unsafe.Pointer(refImages))[0 : len(opts.RefImages)+1 : len(opts.RefImages)+1]
+
+		for i, refImagePath := range opts.RefImages {
+			view[i] = C.CString(refImagePath)
+			defer C.free(unsafe.Pointer(view[i]))
+		}
+		view[len(opts.RefImages)] = nil
+	}
+
+	// Default strength for img2img (0.75 is a good default)
+	strength := C.float(0.75)
+	if opts.Src != "" {
+		// If we have a source image, use img2img mode
+		// You could also parse strength from EnableParameters if needed
+		strength = C.float(0.75)
+	}
+
+	ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale), srcImage, strength, maskImage, refImages, refImagesCount)
+	if ret != 0 {
+		return fmt.Errorf("inference failed")
+	}
+
+	return nil
+}
--- a/backend/go/stablediffusion-ggml/gosd.h
+++ b/backend/go/stablediffusion-ggml/gosd.h
@@ -0,0 +1,8 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+int load_model(char *model, char *model_path, char* options[], int threads, int diffusionModel);
+int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale, char *src_image, float strength, char *mask_image, char **ref_images, int ref_images_count);
+#ifdef __cplusplus
+}
+#endif
--- a/backend/go/image/stablediffusion-ggml/main.go
+++ b/backend/go/image/stablediffusion-ggml/main.go
--- a/backend/go/stablediffusion-ggml/package.sh
+++ b/backend/go/stablediffusion-ggml/package.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+# This script is used in the final stage of the Dockerfile
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+# Create lib directory
+mkdir -p $CURDIR/package/lib
+
+cp -avrf $CURDIR/stablediffusion-ggml $CURDIR/package/
+cp -rfv $CURDIR/run.sh $CURDIR/package/
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+echo "Packaging completed successfully" 
+ls -liah $CURDIR/package/
+ls -liah $CURDIR/package/lib/
--- a/backend/go/stablediffusion-ggml/run.sh
+++ b/backend/go/stablediffusion-ggml/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -ex
+
+CURDIR=$(dirname "$(realpath $0)")
+
+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+
+# If there is a lib/ld.so, use it
+if [ -f $CURDIR/lib/ld.so ]; then
+	echo "Using lib/ld.so"
+	exec $CURDIR/lib/ld.so $CURDIR/stablediffusion-ggml "$@"
+fi
+
+exec $CURDIR/stablediffusion-ggml "$@"
--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -0,0 +1,131 @@
+GOCMD=go
+NATIVE?=false
+
+BUILD_TYPE?=
+CMAKE_ARGS?=
+
+# whisper.cpp version
+WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
+WHISPER_CPP_VERSION?=f7502dca872866a310fe69d30b163fa87d256319
+
+export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
+export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
+export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
+export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src
+
+CGO_LDFLAGS_WHISPER?=
+CGO_LDFLAGS_WHISPER+=-lggml
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+CUDA_LIBPATH?=/usr/local/cuda/lib64/
+
+ONEAPI_VERSION?=2025.2
+
+# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+	WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
+ifeq ($(BUILD_TYPE),cublas)
+	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
+	CMAKE_ARGS+=-DGGML_CUDA=ON
+	CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
+	export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
+# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# to CMAKE_ARGS automatically
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblas)
+	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
+else ifeq ($(BUILD_TYPE),hipblas)
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
+	export STABLE_BUILD_TYPE=
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+#	GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
+#	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DGGML_HIP=ON
+	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib -L$(CURRENT_MAKEFILE_DIR)/sources/whisper.cpp/build/ggml/src/ggml-hip/ -lggml-hip
+#	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DGGML_VULKAN=1
+	CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
+	export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
+else ifeq ($(OS),Darwin)
+	ifeq ($(BUILD_TYPE),)
+		BUILD_TYPE=metal
+	endif
+	ifneq ($(BUILD_TYPE),metal)
+		CMAKE_ARGS+=-DGGML_METAL=OFF
+		CGO_LDFLAGS_WHISPER+=-lggml-blas
+		export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
+	else
+		CMAKE_ARGS+=-DGGML_METAL=ON
+		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+		CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
+		CMAKE_ARGS+=-DGGML_OPENMP=OFF
+		CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
+		CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
+		CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
+		CGO_LDFLAGS += -framework Accelerate
+		CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
+		export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
+	endif
+	TARGET+=--target ggml-metal
+endif
+
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+	export CC=icx
+	export CXX=icpx
+	CGO_LDFLAGS_WHISPER += -fsycl -L${DNNLROOT}/lib -rpath ${ONEAPI_ROOT}/${ONEAPI_VERSION}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL -lggml-sycl
+	CGO_LDFLAGS_WHISPER += $(shell pkg-config --libs mkl-static-lp64-gomp)
+	CGO_CXXFLAGS_WHISPER += -fiopenmp -fopenmp-targets=spir64
+	CGO_CXXFLAGS_WHISPER += $(shell pkg-config --cflags mkl-static-lp64-gomp )
+	export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-sycl/
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DCMAKE_CXX_FLAGS="-fsycl"
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f16)
+	CMAKE_ARGS+=-DGGML_SYCL_F16=ON
+endif
+
+ifneq ($(OS),Darwin)
+	CGO_LDFLAGS_WHISPER+=-lgomp
+endif
+
+## whisper
+sources/whisper.cpp:
+	mkdir -p sources/whisper.cpp
+	cd sources/whisper.cpp && \
+	git init && \
+	git remote add origin $(WHISPER_REPO) && \
+	git fetch origin && \
+	git checkout $(WHISPER_CPP_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
+	cd sources/whisper.cpp && cmake $(CMAKE_ARGS) $(WHISPER_CMAKE_ARGS) . -B ./build
+	cd sources/whisper.cpp/build && cmake --build . --config Release
+
+whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a
+	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
+	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
+	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
+	CGO_CXXFLAGS="$(CGO_CXXFLAGS_WHISPER)" \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o whisper ./
+
+package:
+	bash package.sh
+
+build: whisper package
--- a/backend/go/transcribe/whisper/main.go
+++ b/backend/go/transcribe/whisper/main.go
--- a/backend/go/whisper/package.sh
+++ b/backend/go/whisper/package.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+# This script is used in the final stage of the Dockerfile
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+# Create lib directory
+mkdir -p $CURDIR/package/lib
+
+cp -avrf $CURDIR/whisper $CURDIR/package/
+cp -rfv $CURDIR/run.sh $CURDIR/package/
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 $CURDIR/package/lib/libdl.so.2
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 $CURDIR/package/lib/librt.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 $CURDIR/package/lib/libpthread.so.0
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+echo "Packaging completed successfully" 
+ls -liah $CURDIR/package/
+ls -liah $CURDIR/package/lib/
--- a/backend/go/whisper/run.sh
+++ b/backend/go/whisper/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -ex
+
+CURDIR=$(dirname "$(realpath $0)")
+
+export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
+
+# If there is a lib/ld.so, use it
+if [ -f $CURDIR/lib/ld.so ]; then
+	echo "Using lib/ld.so"
+	exec $CURDIR/lib/ld.so $CURDIR/whisper "$@"
+fi
+
+exec $CURDIR/whisper "$@"
--- a/backend/go/transcribe/whisper/whisper.go
+++ b/backend/go/transcribe/whisper/whisper.go
--- a/backend/index.yaml
+++ b/backend/index.yaml
--- a/Show More
+++ b/Show More