fix(go-grpc-server): always close resultChan

Because the channel was never closed, a client call to a server that does not implement PredictStream would hang indefinitely, as it would wait for resultChan to be consumed. Now, when the prediction stream returns, we close the channel and wait for the goroutine to finish. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-24 00:26:34 -04:00 · 2024-10-05 00:07:58 +02:00
667 changed files with 19874 additions and 265277 deletions
--- a/.devcontainer/docker-compose-devcontainer.yml
+++ b/.devcontainer/docker-compose-devcontainer.yml
@@ -7,7 +7,7 @@ services:
      args:
      - FFMPEG=true
      - IMAGE_TYPE=extras
-      - GO_TAGS=p2p tts
+      - GO_TAGS=stablediffusion p2p tts
    env_file:
      - ../.env
    ports:
--- a/.env
+++ b/.env
@@ -29,9 +29,6 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true
 # Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
 # LOCALAI_FORCE_BACKEND_SHUTDOWN=true
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -41,12 +38,12 @@
 ## Uncomment and set to true to enable rebuilding from source
 # REBUILD=true
-## Enable go tags, available: p2p, tts
+## Enable go tags, available: stablediffusion, tts
-## p2p: enable distributed inferencing
+## stablediffusion: image generation with stablediffusion
 ## tts: enables text-to-speech with go-piper 
 ## (requires REBUILD=true)
 #
-# GO_TAGS=p2p
+# GO_TAGS=stablediffusion
 ## Path where to store generated images
 # LOCALAI_IMAGE_PATH=/tmp/generated/images
@@ -76,7 +73,7 @@
 ### Define a list of GRPC Servers for llama-cpp workers to distribute the load
 # https://github.com/ggerganov/llama.cpp/pull/6829
-# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
 # LLAMACPP_GRPC_SERVERS=""
 ### Enable to run parallel requests
@@ -85,15 +82,6 @@
 # Enable to allow p2p mode
 # LOCALAI_P2P=true
 # Enable to use federated mode
 # LOCALAI_FEDERATED=true
 # Enable to start federation server
 # FEDERATED_SERVER=true
 # Define to use federation token
 # TOKEN=""
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1 @@
 *.sh text eol=lf
 backend/cpp/llama/*.hpp linguist-vendored
--- a/.github/ci/modelslist.go
+++ b/.github/ci/modelslist.go
@@ -6,7 +6,6 @@ import (
 	"io/ioutil"
 	"os"
 	"github.com/microcosm-cc/bluemonday"
 	"gopkg.in/yaml.v3"
 )
@@ -280,12 +279,6 @@ func main() {
 		return
 	}
 	// Ensure that all arbitrary text content is sanitized before display
 	for i, m := range models {
 		models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
 		models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
 	}
 	// render the template
 	data := struct {
 		Models          []*GalleryModel
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -9,8 +9,6 @@ updates:
    directory: "/"
    schedule:
      interval: "weekly"
    ignore:
    - dependency-name: "github.com/mudler/LocalAI/pkg/grpc/proto"
  - package-ecosystem: "github-actions"
    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
    directory: "/"
@@ -29,6 +27,10 @@ updates:
    schedule:
      # Check for updates to GitHub Actions every weekday
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/autogptq"
    schedule:
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/bark"
    schedule:
@@ -77,6 +79,14 @@ updates:
    directory: "/backend/python/transformers"
    schedule:
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/transformers-musicgen"
    schedule:
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/vall-e-x"
    schedule:
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/vllm"
    schedule:
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,15 +1,6 @@
-enhancement:
+enhancements:
 - head-branch: ['^feature', 'feature']
 dependencies:
 - any:
  - changed-files:
    - any-glob-to-any-file: 'Makefile'
  - changed-files:
    - any-glob-to-any-file: '*.mod'
  - changed-files:
    - any-glob-to-any-file: '*.sum'
 kind/documentation:
 - any:
  - changed-files:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -9,17 +9,26 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - repository: "ggml-org/llama.cpp"
+          - repository: "ggerganov/llama.cpp"
            variable: "CPPLLAMA_VERSION"
            branch: "master"
-          - repository: "ggml-org/whisper.cpp"
+          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
          - repository: "donomii/go-rwkv.cpp"
            variable: "RWKV_VERSION"
            branch: "main"
          - repository: "ggerganov/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
-          - repository: "PABannier/bark.cpp"
+          - repository: "go-skynet/go-bert.cpp"
-            variable: "BARKCPP_VERSION"
+            variable: "BERT_VERSION"
            branch: "master"
          - repository: "go-skynet/bloomz.cpp"
            variable: "BLOOMZ_VERSION"
            branch: "main"
-          - repository: "leejet/stable-diffusion.cpp"
+          - repository: "mudler/go-ggllm.cpp"
-            variable: "STABLEDIFFUSION_GGML_VERSION"
+            variable: "GOGGLLM_VERSION"
            branch: "master"
          - repository: "mudler/go-stable-diffusion"
            variable: "STABLEDIFFUSION_VERSION"
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -23,7 +23,7 @@ jobs:
          sudo pip install --upgrade pip
          pip install huggingface_hub
      - name: 'Setup yq'
-        uses: dcarbone/install-yq-action@v1.3.1
+        uses: dcarbone/install-yq-action@v1.1.1
        with:
          version: 'v4.44.2'
          download-compressed: true
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -14,7 +14,7 @@ jobs:
    steps:
      - name: Dependabot metadata
        id: metadata
-        uses: dependabot/fetch-metadata@v2.4.0
+        uses: dependabot/fetch-metadata@v2.2.0
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          skip-commit-verification: true
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -33,7 +33,7 @@ jobs:
        run: |
          CGO_ENABLED=0 make build-api
      - name: rm
-        uses: appleboy/ssh-action@v1.2.2
+        uses: appleboy/ssh-action@v1.0.3
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -42,7 +42,7 @@ jobs:
            script: |
                sudo rm -rf local-ai/ || true
      - name: copy file via ssh
-        uses: appleboy/scp-action@v1.0.0
+        uses: appleboy/scp-action@v0.1.7
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
            rm: true
            target: ./local-ai
      - name: restarting
-        uses: appleboy/ssh-action@v1.2.2
+        uses: appleboy/ssh-action@v1.0.3
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -2,10 +2,9 @@ name: 'generate and publish GRPC docker caches'
 on:
  workflow_dispatch:
-
+  push:
-  schedule:
+    branches:
-    # daily at midnight
+      - master
    - cron: '0 0 * * *'
 concurrency:
  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -17,7 +16,7 @@ jobs:
      matrix:
        include:
          - grpc-base-image: ubuntu:22.04
-            runs-on: 'arc-runner-set'
+            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64,linux/arm64'
    runs-on: ${{matrix.runs-on}}
    steps:
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -15,7 +15,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -33,7 +33,6 @@ jobs:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
      max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
      fail-fast: false
      matrix:
        include:
          # This is basically covered by the AIO test
@@ -57,35 +56,26 @@ jobs:
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
+          # - build-type: 'hipblas'
-            platforms: 'linux/amd64'
+          #   platforms: 'linux/amd64'
-            tag-latest: 'false'
+          #   tag-latest: 'false'
-            tag-suffix: '-hipblas'
+          #   tag-suffix: '-hipblas'
-            ffmpeg: 'false'
+          #   ffmpeg: 'false'
-            image-type: 'extras'
+          #   image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
+          #   base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
+          #   grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
+          #   runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
+          #   makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'sycl_f16'
+          # - build-type: 'sycl_f16'
-            platforms: 'linux/amd64'
+          #   platforms: 'linux/amd64'
-            tag-latest: 'false'
+          #   tag-latest: 'false'
-            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+          #   base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
-            grpc-base-image: "ubuntu:22.04"
+          #   grpc-base-image: "ubuntu:22.04"
-            tag-suffix: 'sycl-f16-ffmpeg'
+          #   tag-suffix: 'sycl-f16-ffmpeg'
-            ffmpeg: 'true'
+          #   ffmpeg: 'true'
-            image-type: 'extras'
+          #   image-type: 'extras'
-            runs-on: 'arc-runner-set'
+          #   runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
+          #   makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-vulkan-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
  # core-image-build:
  #   uses: ./.github/workflows/image_build.yml
  #   with:
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -45,13 +45,13 @@ jobs:
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-hipblas-extras'
+            tag-suffix: '-hipblas-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
-            latest-image: 'latest-gpu-hipblas-extras'
+            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
@@ -59,13 +59,32 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas'
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-            latest-image: 'latest-gpu-hipblas'
+          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-core'
            ffmpeg: 'false'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -95,18 +114,60 @@ jobs:
      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
      matrix:
        include:
          # Extra images
          - build-type: ''
            #platforms: 'linux/amd64,linux/arm64'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: ''
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda11'
            ffmpeg: ''
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12'
            ffmpeg: ''
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-cublas-cuda11-extras'
+            tag-suffix: '-cublas-cuda11-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            aio: "-aio-gpu-nvidia-cuda-11"
-            latest-image: 'latest-gpu-nvidia-cuda-11-extras'
+            latest-image: 'latest-gpu-nvidia-cuda-11'
            latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
@@ -114,26 +175,36 @@ jobs:
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-cublas-cuda12-extras'
+            tag-suffix: '-cublas-cuda12-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            aio: "-aio-gpu-nvidia-cuda-12"
-            latest-image: 'latest-gpu-nvidia-cuda-12-extras'
+            latest-image: 'latest-gpu-nvidia-cuda-12'
            latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: ''
            #platforms: 'linux/amd64,linux/arm64'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
            image-type: 'extras'
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f16-extras'
+            tag-suffix: '-sycl-f16-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            aio: "-aio-gpu-intel-f16"
-            latest-image: 'latest-gpu-intel-f16-extras'
+            latest-image: 'latest-gpu-intel-f16'
            latest-image-aio: 'latest-aio-gpu-intel-f16'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f32'
@@ -141,12 +212,12 @@ jobs:
            tag-latest: 'auto'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f32-extras'
+            tag-suffix: '-sycl-f32-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            aio: "-aio-gpu-intel-f32"
-            latest-image: 'latest-gpu-intel-f32-extras'
+            latest-image: 'latest-gpu-intel-f32'
            latest-image-aio: 'latest-aio-gpu-intel-f32'
            makeflags: "--jobs=3 --output-sync=target"
          # Core images
@@ -155,23 +226,41 @@ jobs:
            tag-latest: 'false'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f16'
+            tag-suffix: '-sycl-f16-core'
-            ffmpeg: 'true'
+            ffmpeg: 'false'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
            latest-image: 'latest-gpu-intel-f16'
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f32'
+            tag-suffix: '-sycl-f32-core'
            ffmpeg: 'false'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
            latest-image: 'latest-gpu-intel-f32'
  core-image-build:
    uses: ./.github/workflows/image_build.yml
@@ -191,7 +280,6 @@ jobs:
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
      skip-drivers: ${{ matrix.skip-drivers }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -204,7 +292,7 @@ jobs:
          - build-type: ''
            platforms: 'linux/amd64,linux/arm64'
            tag-latest: 'auto'
-            tag-suffix: ''
+            tag-suffix: '-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "ubuntu:22.04"
@@ -213,81 +301,57 @@ jobs:
            latest-image: 'latest-cpu'
            latest-image-aio: 'latest-aio-cpu'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11'
+            tag-suffix: '-cublas-cuda11-core'
-            ffmpeg: 'true'
+            ffmpeg: ''
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
            latest-image: 'latest-gpu-nvidia-cuda-12'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12'
+            tag-suffix: '-cublas-cuda12-core'
            ffmpeg: ''
            image-type: 'core'
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda11-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
            latest-image: 'latest-gpu-nvidia-cuda-12'
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-vulkan'
+            tag-suffix: '-vulkan-ffmpeg-core'
            latest-image: 'latest-vulkan-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
            latest-image: 'latest-gpu-vulkan'
  gh-runner:
    uses: ./.github/workflows/image_build.yml
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
      ffmpeg: ${{ matrix.ffmpeg }}
      image-type: ${{ matrix.image-type }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      aio: ${{ matrix.aio }}
      base-image: ${{ matrix.base-image }}
      grpc-base-image: ${{ matrix.grpc-base-image }}
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
      skip-drivers: ${{ matrix.skip-drivers }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
    strategy:
      matrix:
        include:
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/arm64'
            tag-latest: 'false'
            tag-suffix: '-nvidia-l4t-arm64'
            latest-image: 'latest-nvidia-l4t-arm64'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
            runs-on: 'ubuntu-24.04-arm'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'true'
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -49,10 +49,6 @@ on:
        description: 'FFMPEG'
        default: ''
        type: string
      skip-drivers:
        description: 'Skip drivers by default'
        default: 'false'
        type: string
      image-type:
        description: 'Image type'
        default: ''
@@ -238,7 +234,6 @@ jobs:
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.65.0
            MAKEFLAGS=${{ inputs.makeflags }}
            SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
@@ -267,7 +262,6 @@ jobs:
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.65.0
            MAKEFLAGS=${{ inputs.makeflags }}
            SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
@@ -310,11 +304,6 @@ jobs:
          tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
          labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
      - name: Cleanup
        run: |
          docker builder prune -f
          docker system prune --force --volumes --all
      - name: Latest tag
        # run this on branches, when it is a tag and there is a latest-image defined
        if: github.event_name != 'pull_request' && inputs.latest-image != ''  && github.ref_type == 'tag'
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -8,7 +8,7 @@ jobs:
  notify-discord:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: gemma-3-12b-it
+        MODEL_NAME: hermes-2-theta-llama-3-8b
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
@@ -16,9 +16,9 @@ jobs:
        fetch-depth: 0 # needed to checkout all branches for this Action to work
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
        # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.8.0
+    - uses: GrantBirki/git-diff-action@v2.7.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
@@ -79,7 +79,7 @@ jobs:
        args: ${{ steps.summarize.outputs.message }}
    - name: Setup tmate session if fails
      if: ${{ failure() }}
-      uses: mxschmitt/action-tmate@v3.22
+      uses: mxschmitt/action-tmate@v3.18
      with:
        detached: true
        connect-timeout-seconds: 180
@@ -87,7 +87,7 @@ jobs:
  notify-twitter:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: gemma-3-12b-it
+        MODEL_NAME: hermes-2-theta-llama-3-8b
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
@@ -99,7 +99,7 @@ jobs:
        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
      # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.8.0
+    - uses: GrantBirki/git-diff-action@v2.7.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
@@ -161,7 +161,7 @@ jobs:
        TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
    - name: Setup tmate session if fails
      if: ${{ failure() }}
-      uses: mxschmitt/action-tmate@v3.22
+      uses: mxschmitt/action-tmate@v3.18
      with:
        detached: true
        connect-timeout-seconds: 180
--- a/.github/workflows/notify-releases.yaml
+++ b/.github/workflows/notify-releases.yaml
@@ -14,7 +14,7 @@ jobs:
    steps:
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
    - name: Summarize
      id: summarize
      run: |
@@ -60,4 +60,4 @@ jobs:
        DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
      uses: Ilshidur/action-discord@master
      with:
-        args: ${{ steps.summarize.outputs.message }}
+        args: ${{ steps.summarize.outputs.message }}
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -36,7 +36,6 @@ jobs:
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
          sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
          make install-go-tools
      - name: Install CUDA Dependencies
        run: |
          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
@@ -124,7 +123,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -152,7 +151,6 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
          make install-go-tools
      - name: Intel Dependencies
        run: |
          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
@@ -234,12 +232,45 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
-
+  build-stablediffusion:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build stablediffusion
        run: |
          export PATH=$PATH:$GOPATH/bin
          make backend-assets/grpc/stablediffusion
          mkdir -p release && cp backend-assets/grpc/stablediffusion release
        env:
          GO_TAGS: stablediffusion
      - uses: actions/upload-artifact@v4
        with:
          name: stablediffusion
          path: release/
      - name: Release
        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
            release/*
  build-macOS-x86_64:
    runs-on: macos-13
@@ -255,7 +286,8 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc
-          make install-go-tools
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
      - name: Build
        id: build
        run: |
@@ -276,7 +308,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -296,7 +328,8 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc libomp llvm
-          make install-go-tools
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build
        id: build
        run: |
@@ -317,7 +350,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.4
+        uses: securego/gosec@v2.21.4
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -35,6 +35,30 @@ jobs:
        run: |
           make --jobs=5 --output-sync=target -C backend/python/transformers
           make --jobs=5 --output-sync=target -C backend/python/transformers test
  tests-sentencetransformers:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          # Install UV
          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test sentencetransformers
        run: |
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
  tests-rerankers:
    runs-on: ubuntu-latest
    steps:
@@ -78,47 +102,71 @@ jobs:
          make --jobs=5 --output-sync=target -C backend/python/diffusers
          make --jobs=5 --output-sync=target -C backend/python/diffusers test
-  #tests-vllm:
+  tests-parler-tts:
-  #  runs-on: ubuntu-latest
+    runs-on: ubuntu-latest
-  #  steps:
+    steps:
-  #    - name: Clone
+      - name: Clone
-  #      uses: actions/checkout@v4
+        uses: actions/checkout@v4
-  #      with:
+        with:
-  #        submodules: true
+          submodules: true
-  #    - name: Dependencies
+      - name: Dependencies
-  #      run: |
+        run: |
-  #        sudo apt-get update
+          sudo apt-get update
-  #        sudo apt-get install -y build-essential ffmpeg
+          sudo apt-get install build-essential ffmpeg
-  #        sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          # Install UV
-  #        sudo apt-get install -y libopencv-dev
+          curl -LsSf https://astral.sh/uv/install.sh | sh
-  #        # Install UV
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
-  #        curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y libopencv-dev
-  #        pip install --user --no-cache-dir grpcio-tools==1.64.1
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
  #    - name: Test vllm backend
  #      run: |
  #        make --jobs=5 --output-sync=target -C backend/python/vllm
  #        make --jobs=5 --output-sync=target -C backend/python/vllm test
  # tests-transformers-musicgen:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
  #       uses: actions/checkout@v4
  #       with:
  #         submodules: true
  #     - name: Dependencies
  #       run: |
  #         sudo apt-get update
  #         sudo apt-get install build-essential ffmpeg
  #         # Install UV
  #         curl -LsSf https://astral.sh/uv/install.sh | sh
  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
  #         pip install --user --no-cache-dir grpcio-tools==1.64.1
-  #     - name: Test transformers-musicgen
+      - name: Test parler-tts
-  #       run: |
+        run: |
-  #          make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts
-  #          make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts test
  tests-openvoice:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          # Install UV
          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test openvoice
        run: |
           make --jobs=5 --output-sync=target -C backend/python/openvoice
           make --jobs=5 --output-sync=target -C backend/python/openvoice test
  tests-transformers-musicgen:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          # Install UV
          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test transformers-musicgen
        run: |
           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
  # tests-bark:
  #   runs-on: ubuntu-latest
@@ -205,6 +253,26 @@ jobs:
  #       run: |
  #          make --jobs=5 --output-sync=target -C backend/python/vllm
  #          make --jobs=5 --output-sync=target -C backend/python/vllm test
  tests-vallex:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
          # Install UV
          curl -LsSf https://astral.sh/uv/install.sh | sh
          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
          sudo apt-get install -y libopencv-dev
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test vall-e-x
        run: |
           make --jobs=5 --output-sync=target -C backend/python/vall-e-x
           make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
  tests-coqui:
    runs-on: ubuntu-latest
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -71,7 +71,7 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
-          sudo apt-get install -y libgmock-dev clang
+          sudo apt-get install -y libgmock-dev
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
@@ -96,17 +96,19 @@ jobs:
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install github.com/GeertJohan/go.rice/rice@latest
          # The python3-grpc-tools package in 22.04 is too old
          pip install --user grpcio-tools
-          make -C backend/python/transformers
+          sudo rm -rfv /usr/bin/conda || true
          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
          # Pre-build piper before we start tests in order to have shared libraries in place
          make sources/go-piper && \
          GO_TAGS="tts" make -C sources/go-piper piper.o && \
-          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
+          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
        env:
          CUDA_VERSION: 12-4
      - name: Cache grpc
@@ -128,10 +130,10 @@ jobs:
          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
-          PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -184,7 +186,6 @@ jobs:
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install github.com/GeertJohan/go.rice/rice@latest
          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Build images
        run: |
@@ -196,7 +197,7 @@ jobs:
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -223,8 +224,7 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
-          pip install --user --no-cache-dir grpcio-tools
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
          go install github.com/GeertJohan/go.rice/rice@latest
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
@@ -235,7 +235,7 @@ jobs:
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
 /sources/
 __pycache__/
 *.a
 *.o
 get-sources
 prepare-sources
 /backend/cpp/llama/grpc-server
@@ -13,6 +12,7 @@ prepare-sources
 go-ggml-transformers
 go-gpt2
 go-rwkv
 whisper.cpp
 /bloomz
 go-bert
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -26,7 +26,7 @@
                "LOCALAI_P2P": "true",
                "LOCALAI_FEDERATED": "true"
            },
-            "buildFlags": ["-tags", "p2p tts", "-v"],
+            "buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
            "envFile": "${workspaceFolder}/.env",
            "cwd": "${workspaceRoot}"
        }
--- a/131
+++ b/131
@@ -9,47 +9,32 @@ FROM ${BASE_IMAGE} AS requirements-core
 USER root
 ARG GO_VERSION=1.22.6
 ARG CMAKE_VERSION=3.26.4
 ARG CMAKE_FROM_SOURCE=false
 ARG TARGETARCH
 ARG TARGETVARIANT
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ccache \
        ca-certificates \
-        curl libssl-dev \
+        cmake \
        curl \
        git \
        git-lfs \
        unzip upx-ucl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 # Install CMake (the version in 22.04 is too old)
 RUN <<EOT bash
    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
    else
        apt-get update && \
        apt-get install -y \
            cmake && \
        apt-get clean && \
        rm -rf /var/lib/apt/lists/*
    fi
 EOT
 # Install Go
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
-# Install grpc compilers and rice
+# Install grpc compilers
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
-    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
    go install github.com/GeertJohan/go.rice/rice@latest
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -70,10 +55,14 @@ ENV PATH=/opt/rocm/bin:${PATH}
 # OpenBLAS requirements and stable diffusion
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        libopenblas-dev && \
+        libopenblas-dev \
        libopencv-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 # Set up OpenCV
 RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
 WORKDIR /build
 ###################################
@@ -82,8 +71,7 @@ WORKDIR /build
 # The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
 FROM requirements-core AS requirements-extras
-# Install uv as a system package
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
 RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
 ENV PATH="/root/.cargo/bin:${PATH}"
 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
@@ -112,13 +100,12 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=12
 ARG CUDA_MINOR_VERSION=0
 ARG SKIP_DRIVERS=false
 ENV BUILD_TYPE=${BUILD_TYPE}
 # Vulkan requirements
 RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+    if [ "${BUILD_TYPE}" = "vulkan" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils wget gpg-agent && \
@@ -134,7 +121,7 @@ EOT
 # CuBLAS requirements
 RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+    if [ "${BUILD_TYPE}" = "cublas" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils
@@ -160,7 +147,7 @@ RUN <<EOT bash
 EOT
 # If we are building with clblas support, we need the libraries for the builds
-RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            libclblast-dev && \
@@ -168,7 +155,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
        rm -rf /var/lib/apt/lists/* \
    ; fi
-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            hipblas-dev \
@@ -201,8 +188,6 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
 # This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
 ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG GRPC_VERSION=v1.65.0
 ARG CMAKE_FROM_SOURCE=false
 ARG CMAKE_VERSION=3.26.4
 ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
@@ -211,24 +196,12 @@ WORKDIR /build
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
-        build-essential curl libssl-dev \
+        build-essential \
        cmake \
        git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
 # Install CMake (the version in 22.04 is too old)
 RUN <<EOT bash
    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
    else
        apt-get update && \
        apt-get install -y \
            cmake && \
        apt-get clean && \
        rm -rf /var/lib/apt/lists/*
    fi
 EOT
 # We install GRPC to a different prefix here so that we can copy in only the build artifacts later
 # saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
 # and running make install in the target container
@@ -248,7 +221,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
 FROM requirements-drivers AS builder-base
-ARG GO_TAGS="tts p2p"
+ARG GO_TAGS="stablediffusion tts p2p"
 ARG GRPC_BACKENDS
 ARG MAKEFLAGS
 ARG LD_FLAGS="-s -w"
@@ -282,12 +255,35 @@ RUN <<EOT bash
    fi
 EOT
 ###################################
 ###################################
 # This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
 # In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
 FROM builder-base AS builder-sd
 # stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
 COPY Makefile .
 COPY go.mod .
 COPY go.sum .
 COPY backend/backend.proto ./backend/backend.proto
 COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
 COPY pkg/grpc ./pkg/grpc
 COPY pkg/stablediffusion ./pkg/stablediffusion
 RUN git init
 RUN make sources/go-stable-diffusion
 RUN touch prepare-sources
 # Actually build the backend
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
 ###################################
 ###################################
 # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
 # Adjustments to the build process should likely be made here.
-FROM builder-base AS builder
+FROM builder-sd AS builder
 # Install the pre-built GRPC
 COPY --from=grpc /opt/grpc /usr/local
@@ -301,10 +297,11 @@ COPY .git .
 RUN make prepare
 ## Build the binary
-## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
+## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
-## Otherwise just run the normal build
+## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
-RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
+## (both will use CUDA or hipblas for the actual computation)
-        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
+RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
    else \
        make build; \
    fi
@@ -326,6 +323,8 @@ ARG FFMPEG
 COPY --from=grpc /opt/grpc /usr/local
 COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
 COPY .devcontainer-scripts /.devcontainer-scripts
 # Add FFmpeg
@@ -398,28 +397,36 @@ COPY --from=builder /build/local-ai ./
 # Copy shared libraries for piper
 COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
 # do not let stablediffusion rebuild (requires an older version of absl)
 COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
 # Change the shell to bash so we can use [[ tests below
 SHELL ["/bin/bash", "-c"]
 # We try to strike a balance between individual layer size (as that affects total push time) and total image size
 # Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
 # Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
 RUN if [[ ( "${IMAGE_TYPE}" == "extras ")]]; then \
        apt-get -qq -y install espeak-ng \
    ; fi
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/coqui \
    ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "faster-whisper" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/faster-whisper \
+        make -C backend/python/parler-tts \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/diffusers \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/transformers-musicgen \
    ; fi
-RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/kokoro \
+        make -C backend/python/vall-e-x \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/openvoice \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/sentencetransformers \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/exllama2 \
@@ -431,11 +438,17 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/vllm \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/autogptq \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/bark \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/rerankers \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/mamba \
    ; fi
 # Make sure the models directory exists
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)
+Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/424
+++ b/424
@@ -6,39 +6,38 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-CPPLLAMA_VERSION?=9a390c4829cd3058d26a2e2c09d16e3fd12bf1b1
+GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
 CPPLLAMA_VERSION?=d5ed2b929d85bbd7dbeecb690880f07d9d7a6077
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 # whisper.cpp version
-WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
+WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=2e310b841e0b4e7cf00890b53411dd9f8578f243
+WHISPER_CPP_VERSION?=ccc2547210e09e3a1785817383ab770389bb442b
 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
 BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
 # go-piper version
 PIPER_REPO?=https://github.com/mudler/go-piper
-PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
+PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
-# bark.cpp
+# stablediffusion version
-BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
+STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
-BARKCPP_VERSION?=v1.0.0
+STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
-# stablediffusion.cpp (ggml)
+# tinydream version
-STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
+TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream
-STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
+TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
 # ONEAPI variables for SYCL
 export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
 ONNX_OS?=linux
 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
-export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
+export CMAKE_ARGS?=
 export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
 export BACKEND_LIBS?=
 export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
 export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
 export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src
 CGO_LDFLAGS?=
 CGO_LDFLAGS_WHISPER?=
@@ -46,7 +45,6 @@ CGO_LDFLAGS_WHISPER+=-lggml
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=
 NATIVE?=false
 TEST_DIR=/tmp/test
@@ -85,26 +83,7 @@ ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
 # IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 	WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # Detect if we are running on arm64
 ifneq (,$(findstring aarch64,$(shell uname -m)))
 	ONNX_ARCH=aarch64
 endif
 ifeq ($(OS),Darwin)
 	ONNX_OS=osx
 	ifneq (,$(findstring aarch64,$(shell uname -m)))
 		ONNX_ARCH=arm64
 	else ifneq (,$(findstring arm64,$(shell uname -m)))
 		ONNX_ARCH=arm64
 	else
 		ONNX_ARCH=x86_64
 	endif
 	ifeq ($(OSX_SIGNING_IDENTITY),)
 		OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
@@ -116,31 +95,13 @@ ifeq ($(OS),Darwin)
 	# disable metal if on Darwin and any other value is explicitly passed.
 	else ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 		WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
 		export GGML_NO_ACCELERATE=1
 		export GGML_NO_METAL=1
 		GO_LDFLAGS_WHISPER+=-lggml-blas
 		export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 	endif
 	ifeq ($(BUILD_TYPE),metal)
 #			-lcblas 	removed: it seems to always be listed as a duplicate flag.
 		CGO_LDFLAGS += -framework Accelerate
 		CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
 		CMAKE_ARGS+=-DGGML_METAL=ON
 		CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		CMAKE_ARGS+=-DGGML_OPENMP=OFF
 		WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
 		WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
 		WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
 		WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
 		WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
 		WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
 		export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
 	else
 		CGO_LDFLAGS_WHISPER+=-lggml-blas
 		export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 	endif
 else
 CGO_LDFLAGS_WHISPER+=-lgomp
@@ -152,29 +113,21 @@ ifeq ($(BUILD_TYPE),openblas)
 endif
 ifeq ($(BUILD_TYPE),cublas)
-	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
+	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export GGML_CUDA=1
-	CMAKE_ARGS+=-DGGML_CUDA=ON
+	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
 	WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
 	CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
 	export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
 endif
 # Vulkan build: pass -DGGML_VULKAN=1 in both CMake argument sets, link the
 # whisper backend against ggml-vulkan and vulkan, and append the ggml-vulkan
 # build directory to the exported WHISPER_LIBRARY_PATH.
 ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=1
 	WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
 	CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
 	export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
 endif
 # Any BUILD_TYPE whose value contains "sycl" exports GGML_SYCL=1 and adds
 # -DGGML_SYCL=ON to CMAKE_ARGS.
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	export GGML_SYCL=1
 	CMAKE_ARGS+=-DGGML_SYCL=ON
 endif
 # BUILD_TYPE=sycl_f16 exports GGML_SYCL_F16=1 and adds -DGGML_SYCL_F16=ON to
 # CMAKE_ARGS.
 ifeq ($(BUILD_TYPE),sycl_f16)
 	export GGML_SYCL_F16=1
 	CMAKE_ARGS+=-DGGML_SYCL_F16=ON
 endif
 ifeq ($(BUILD_TYPE),hipblas)
@@ -183,11 +136,12 @@ ifeq ($(BUILD_TYPE),hipblas)
 	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
-	export GGML_HIP=1
+	export GGML_HIPBLAS=1
-	GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
+	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif
@@ -206,6 +160,16 @@ ifeq ($(STATIC),true)
 	LD_FLAGS+=-linkmode external -extldflags -static
 endif
 # When GO_TAGS contains "stablediffusion", add the stablediffusion gRPC
 # backend to OPTIONAL_GRPC.
 ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
 #	OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
 	OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
 endif
 # When GO_TAGS contains "tinydream", add the tinydream gRPC backend to
 # OPTIONAL_GRPC.
 ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
 #	OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
 	OPTIONAL_GRPC+=backend-assets/grpc/tinydream
 endif
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 #	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
 #	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
@@ -215,23 +179,16 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 endif
 # List of backend artifacts collected in ALL_GRPC_BACKENDS.
 # bark-cpp and stablediffusion-ggml are only added when ONNX_OS is linux and
 # ONNX_ARCH is x64. Entries from OPTIONAL_GRPC are appended last.
 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
 ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
 ifeq ($(ONNX_OS),linux)
 ifeq ($(ONNX_ARCH),x64)
 	ALL_GRPC_BACKENDS+=backend-assets/grpc/bark-cpp
 	ALL_GRPC_BACKENDS+=backend-assets/grpc/stablediffusion-ggml
 endif
 endif
 ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
 ALL_GRPC_BACKENDS+=backend-assets/grpc/silero-vad
 ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
 # Use filter-out to remove the backends listed in SKIP_GRPC_BACKEND
 ALL_GRPC_BACKENDS := $(filter-out $(SKIP_GRPC_BACKEND),$(ALL_GRPC_BACKENDS))
@@ -252,22 +209,31 @@ endif
 all: help
-## bark.cpp
+## BERT embeddings
-sources/bark.cpp:
+sources/go-bert.cpp:
-	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
+	mkdir -p sources/go-bert.cpp
-	cd sources/bark.cpp && \
+	cd sources/go-bert.cpp && \
-	git checkout $(BARKCPP_VERSION) && \
+	git init && \
 	git remote add origin $(BERT_REPO) && \
 	git fetch origin && \
 	git checkout $(BERT_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
-sources/bark.cpp/build/libbark.a: sources/bark.cpp
+sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
-	cd sources/bark.cpp && \
+	$(MAKE) -C sources/go-bert.cpp libgobert.a
 	mkdir -p build && \
 	cd build && \
 	cmake $(CMAKE_ARGS) .. && \
 	cmake --build . --config Release
-backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a
+## go-llama.cpp
-	$(MAKE) -C backend/go/bark libbark.a
+sources/go-llama.cpp:
 	mkdir -p sources/go-llama.cpp
 	cd sources/go-llama.cpp && \
 	git init && \
 	git remote add origin $(GOLLAMA_REPO) && \
 	git fetch origin && \
 	git checkout $(GOLLAMA_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 # Runs the libbinding.a target of the go-llama.cpp checkout's own Makefile,
 # passing STABLE_BUILD_TYPE as its BUILD_TYPE.
 sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
 	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 ## go-piper
 sources/go-piper:
@@ -282,33 +248,45 @@ sources/go-piper:
 # Runs the libpiper_binding.a, example/main and piper.o targets of the
 # go-piper checkout's own Makefile.
 sources/go-piper/libpiper_binding.a: sources/go-piper
 	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
-## stablediffusion (ggml)
+
-sources/stablediffusion-ggml.cpp:
+## RWKV
-	git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
+sources/go-rwkv.cpp:
-	cd sources/stablediffusion-ggml.cpp && \
+	mkdir -p sources/go-rwkv.cpp
-	git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
+	cd sources/go-rwkv.cpp && \
 	git init && \
 	git remote add origin $(RWKV_REPO) && \
 	git fetch origin && \
 	git checkout $(RWKV_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
-backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
+sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
-	$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a
+	cd sources/go-rwkv.cpp && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
 	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
-backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
+## stable diffusion
-	$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml
+sources/go-stable-diffusion:
 	mkdir -p sources/go-stable-diffusion
 	cd sources/go-stable-diffusion && \
 	git init && \
 	git remote add origin $(STABLEDIFFUSION_REPO) && \
 	git fetch origin && \
 	git checkout $(STABLEDIFFUSION_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
-sources/onnxruntime:
+sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
-	mkdir -p sources/onnxruntime
+	CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
 	curl -L https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
 	cd sources/onnxruntime && tar -xvf onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz && rm onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
 	cd sources/onnxruntime && mv onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION)/* ./
-backend-assets/lib/libonnxruntime.so.1: backend-assets/lib sources/onnxruntime
+## tiny-dream
-	cp -rfv sources/onnxruntime/lib/* backend-assets/lib/
+sources/go-tiny-dream:
-ifeq ($(OS),Darwin)
+	mkdir -p sources/go-tiny-dream
-	mv backend-assets/lib/libonnxruntime.$(ONNX_VERSION).dylib backend-assets/lib/libonnxruntime.dylib
+	cd sources/go-tiny-dream && \
-else
+	git init && \
-	mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1
+	git remote add origin $(TINYDREAM_REPO) && \
-endif
+	git fetch origin && \
 	git checkout $(TINYDREAM_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 # Runs the libtinydream.a target of the go-tiny-dream checkout's own Makefile.
 sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
 	$(MAKE) -C sources/go-tiny-dream libtinydream.a
 ## whisper
 sources/whisper.cpp:
@@ -320,21 +298,30 @@ sources/whisper.cpp:
 	git checkout $(WHISPER_CPP_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
-sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
+sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
-	cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build
+	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
 	cd sources/whisper.cpp/build && cmake --build . --config Release
-get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-llama.cpp sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
 # Adds go.mod `replace` directives (via `$(GOCMD) mod edit -replace`) that
 # point each listed module at its local checkout under $(CURDIR)/sources/.
 # The dropreplace target removes the same set of directives.
 replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
 	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 # Removes the go.mod `replace` directives for the locally checked-out
 # modules (via `$(GOCMD) mod edit -dropreplace`). The module list mirrors the
 # one used by the replace target.
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
 	$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -342,8 +329,13 @@ prepare-sources: get-sources replace
 ## GENERIC
 # Clears the Go build cache, runs the `clean` target in each checked-out
 # source tree under sources/, then invokes the build target.
 # NOTE(review): each `$(MAKE) -C` step presumably fails if that checkout does
 # not exist yet — confirm whether get-sources must run first.
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/go-rwkv.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
 	$(MAKE) -C sources/go-bert.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build
 prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -356,9 +348,7 @@ clean: ## Remove build related file
 	rm -rf release/
 	rm -rf backend-assets/*
 	$(MAKE) -C backend/cpp/grpc clean
 	$(MAKE) -C backend/go/bark clean
 	$(MAKE) -C backend/cpp/llama clean
 	$(MAKE) -C backend/go/image/stablediffusion-ggml clean
 	rm -rf backend/cpp/llama-* || true
 	$(MAKE) dropreplace
 	$(MAKE) protogen-clean
@@ -372,14 +362,8 @@ clean-tests:
 # Runs clean first, then copies /build/backend-assets to
 # /workspace/backend-assets.
 # NOTE(review): the absolute paths look devcontainer-specific — confirm.
 clean-dc: clean
 	cp -r /build/backend-assets /workspace/backend-assets
 ## Install Go tools
 # Installs protoc-gen-go-grpc (pinned to a commit hash), protoc-gen-go
 # v1.34.2 and rice.
 # NOTE(review): rice is installed at @latest, unlike the two pinned tools, so
 # the installed version can change between runs — consider pinning.
 install-go-tools:
 	go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
 	go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
 	go install github.com/GeertJohan/go.rice/rice@latest
 ## Build:
-build: prepare backend-assets grpcs install-go-tools ## Build the project
+build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
 	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
@@ -389,9 +373,7 @@ ifneq ($(BACKEND_LIBS),)
 	$(MAKE) backend-assets/lib
 	cp -f $(BACKEND_LIBS) backend-assets/lib/
 endif
 	rm -rf $(BINARY_NAME) || true
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
 	rice append --exec $(BINARY_NAME)
 # Invokes the build target with BUILD_GRPC_FOR_BACKEND_LLAMA=true,
 # GRPC_BACKENDS limited to llama-cpp-avx2, and GO_TAGS set to p2p.
 build-minimal:
 	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
@@ -453,23 +435,25 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
+	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
 	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
 	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
 	cp tests/models_fixtures/* test-models
 prepare-test: grpcs
 	cp -rf backend-assets core/http
 	cp tests/models_fixtures/* test-models
 ## Test targets
 test: prepare test-models/testmodel.ggml grpcs
 	@echo 'Running tests'
-	export GO_TAGS="tts debug"
+	export GO_TAGS="tts stablediffusion debug"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -486,34 +470,38 @@ run-e2e-image:
 run-e2e-aio: protogen-go
 	@echo 'Running e2e AIO tests'
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
 test-e2e:
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
 	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
 # Removes TEST_DIR (failure is ignored) and stops every running container
 # whose ancestor image is localai-tests.
 # NOTE(review): if no such container is running, `docker stop` receives no
 # arguments and presumably exits non-zero — confirm this is acceptable.
 teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 # After prepare-test, runs ginkgo recursively over TEST_PATHS restricted to
 # specs labelled "llama", with 5 flake attempts. TEST_DIR, FIXTURES,
 # CONFIG_FILE and MODELS_PATH are passed in the command's environment.
 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
 test-stablediffusion: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
 test-stores: backend-assets/grpc/local-store
 	mkdir -p tests/integration/backend-assets/grpc
 	cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
 test-container:
 	docker build --target requirements -t local-ai-test-container .
@@ -538,7 +526,7 @@ protogen: protogen-go protogen-python
 protogen-clean: protogen-go-clean protogen-python-clean
 .PHONY: protogen-go
-protogen-go: install-go-tools
+protogen-go:
 	mkdir -p pkg/grpc/proto
 	protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
    backend/backend.proto
@@ -549,10 +537,18 @@ protogen-go-clean:
 	$(RM) bin/*
 .PHONY: protogen-python
-protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
 .PHONY: protogen-python-clean
-protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
 # Delegate to the protogen / protogen-clean targets of backend/python/autogptq.
 .PHONY: autogptq-protogen
 autogptq-protogen:
 	$(MAKE) -C backend/python/autogptq protogen
 .PHONY: autogptq-protogen-clean
 autogptq-protogen-clean:
 	$(MAKE) -C backend/python/autogptq protogen-clean
 .PHONY: bark-protogen
 bark-protogen:
@@ -578,14 +574,6 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 .PHONY: faster-whisper-protogen
 faster-whisper-protogen:
 	$(MAKE) -C backend/python/faster-whisper protogen
 .PHONY: faster-whisper-protogen-clean
 faster-whisper-protogen-clean:
 	$(MAKE) -C backend/python/faster-whisper protogen-clean
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
@@ -594,6 +582,14 @@ exllama2-protogen:
 exllama2-protogen-clean:
 	$(MAKE) -C backend/python/exllama2 protogen-clean
 # Delegate to the protogen / protogen-clean targets of backend/python/mamba.
 .PHONY: mamba-protogen
 mamba-protogen:
 	$(MAKE) -C backend/python/mamba protogen
 .PHONY: mamba-protogen-clean
 mamba-protogen-clean:
 	$(MAKE) -C backend/python/mamba protogen-clean
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
@@ -602,6 +598,14 @@ rerankers-protogen:
 rerankers-protogen-clean:
 	$(MAKE) -C backend/python/rerankers protogen-clean
 # Delegate to the protogen / protogen-clean targets of
 # backend/python/sentencetransformers.
 .PHONY: sentencetransformers-protogen
 sentencetransformers-protogen:
 	$(MAKE) -C backend/python/sentencetransformers protogen
 .PHONY: sentencetransformers-protogen-clean
 sentencetransformers-protogen-clean:
 	$(MAKE) -C backend/python/sentencetransformers protogen-clean
 .PHONY: transformers-protogen
 transformers-protogen:
 	$(MAKE) -C backend/python/transformers protogen
@@ -610,13 +614,37 @@ transformers-protogen:
 transformers-protogen-clean:
 	$(MAKE) -C backend/python/transformers protogen-clean
-.PHONY: kokoro-protogen
+.PHONY: parler-tts-protogen
-kokoro-protogen:
+parler-tts-protogen:
-	$(MAKE) -C backend/python/kokoro protogen
+	$(MAKE) -C backend/python/parler-tts protogen
-.PHONY: kokoro-protogen-clean
+.PHONY: parler-tts-protogen-clean
-kokoro-protogen-clean:
+parler-tts-protogen-clean:
-	$(MAKE) -C backend/python/kokoro protogen-clean
+	$(MAKE) -C backend/python/parler-tts protogen-clean
 # Delegate to the protogen / protogen-clean targets of
 # backend/python/transformers-musicgen.
 .PHONY: transformers-musicgen-protogen
 transformers-musicgen-protogen:
 	$(MAKE) -C backend/python/transformers-musicgen protogen
 .PHONY: transformers-musicgen-protogen-clean
 transformers-musicgen-protogen-clean:
 	$(MAKE) -C backend/python/transformers-musicgen protogen-clean
 # Delegate to the protogen / protogen-clean targets of backend/python/vall-e-x.
 .PHONY: vall-e-x-protogen
 vall-e-x-protogen:
 	$(MAKE) -C backend/python/vall-e-x protogen
 .PHONY: vall-e-x-protogen-clean
 vall-e-x-protogen-clean:
 	$(MAKE) -C backend/python/vall-e-x protogen-clean
 # Delegate to the protogen / protogen-clean targets of backend/python/openvoice.
 .PHONY: openvoice-protogen
 openvoice-protogen:
 	$(MAKE) -C backend/python/openvoice protogen
 .PHONY: openvoice-protogen-clean
 openvoice-protogen-clean:
 	$(MAKE) -C backend/python/openvoice protogen-clean
 .PHONY: vllm-protogen
 vllm-protogen:
@@ -629,25 +657,28 @@ vllm-protogen-clean:
 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/autogptq
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
 	$(MAKE) -C backend/python/faster-whisper
 	$(MAKE) -C backend/python/vllm
 	$(MAKE) -C backend/python/mamba
 	$(MAKE) -C backend/python/sentencetransformers
 	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
-	$(MAKE) -C backend/python/kokoro
+	$(MAKE) -C backend/python/transformers-musicgen
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/openvoice
 	$(MAKE) -C backend/python/exllama2
 # After protogen-python, runs the default make target in the transformers,
 # diffusers and vllm Python backend directories.
 prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/diffusers
 	$(MAKE) -C backend/python/vllm
 # After prepare-test-extra, runs the `test` target of the transformers,
 # diffusers and vllm Python backends, in that order.
 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
 	$(MAKE) -C backend/python/diffusers test
 	$(MAKE) -C backend/python/vllm test
 backend-assets:
 	mkdir -p backend-assets
@@ -662,6 +693,13 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
 # Creates the backend-assets/grpc directory. Depends on protogen-go and
 # replace, so both run before any backend binary that lists this target as a
 # prerequisite.
 backend-assets/grpc: protogen-go replace
 	mkdir -p backend-assets/grpc
 # Builds the bert-embeddings gRPC backend from ./backend/go/llm/bert/ with the
 # go-bert.cpp checkout on C_INCLUDE_PATH and LIBRARY_PATH. When UPX is
 # non-empty, $(UPX) is run on the resulting binary.
 backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/bert-embeddings
 endif
 backend-assets/grpc/huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
 ifneq ($(UPX),)
@@ -708,13 +746,6 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 # AVX512 variant of the llama.cpp gRPC server: copies backend/cpp/llama to
 # backend/cpp/llama-avx512, runs `purge` in the copy, builds grpc-server with
 # AVX, AVX512, FMA and F16C on and AVX2 off, then copies the binary to
 # backend-assets/grpc/llama-cpp-avx512.
 backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx512
 	$(MAKE) -C backend/cpp/llama-avx512 purge
 	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
@@ -728,6 +759,10 @@ backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/ll
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
 # TODO: every binary should have its own folder instead, so that each can have a different metal implementation
 ifeq ($(BUILD_TYPE),metal)
 	cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
 endif
 backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-cuda
@@ -740,7 +775,7 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc backend/cpp/llama/lla
 	cp -rf backend/cpp/llama backend/cpp/llama-hipblas
 	$(MAKE) -C backend/cpp/llama-hipblas purge
 	$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
+	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
 backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc backend/cpp/llama/llama.cpp
@@ -768,11 +803,11 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
-backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
+backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
 ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/bark-cpp
+	$(UPX) backend-assets/grpc/llama-ggml
 endif
 backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
@@ -782,15 +817,29 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/piper
 endif
-backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
+backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
 ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/silero-vad
+	$(UPX) backend-assets/grpc/rwkv
 endif
-backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
+backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/stablediffusion
 endif
 # Builds the tinydream gRPC backend from ./backend/go/image/tinydream.
 # libtinydream.a is produced inside sources/go-tiny-dream (see the
 # sources/go-tiny-dream/libtinydream.a prerequisite), so LIBRARY_PATH points
 # there, matching the other source-built backends. When UPX is non-empty,
 # $(UPX) is run on the resulting binary.
 backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-tiny-dream \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/tinydream
 endif
 backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/whisper
@@ -842,8 +891,7 @@ docker-aio-all:
 docker-image-intel:
 	docker build \
-		--progress plain \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
 		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -851,7 +899,7 @@ docker-image-intel:
 docker-image-intel-xpu:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -863,7 +911,7 @@ swagger:
 .PHONY: gen-assets
 gen-assets:
-	$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
+	$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
 ## Documentation
 docs/layouts/_default:
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 <h1 align="center">
  <br>
-  <img height="300" src="./core/http/static/logo.png"> <br>
+  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
    LocalAI
 <br>
 </h1>
@@ -30,146 +31,79 @@
 <p align="center">
 <a href="https://twitter.com/LocalAI_API" target="blank">
-<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/>
+<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
 </a>
 <a href="https://discord.gg/uJAeKSAGDy" target="blank">
 <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
 </a>
 </p>
 <p align="center">
 <a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
 </p>
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) 
 [![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/localaiofficial_bot)
 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
-
+![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
 ## 📚🆕 Local Stack Family
 🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
 <table>
  <tr>
    <td width="50%" valign="top">
      <a href="https://github.com/mudler/LocalAGI">
        <img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
      </a>
    </td>
    <td width="50%" valign="top">
      <h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
      <p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
    </td>
  </tr>
  <tr>
    <td width="50%" valign="top">
      <a href="https://github.com/mudler/LocalRecall">
        <img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
      </a>
    </td>
    <td width="50%" valign="top">
      <h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
      <p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
    </td>
  </tr>
 </table>
 ## Screenshots
 | Talk Interface | Generate Audio |
 | --- | --- |
 | ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
 | Models Overview | Generate Images |
 | --- | --- |
 | ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
 | Chat Interface | Home |
 | --- | --- |
 | ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
 | Login | Swarm |
 | --- | --- |
 |![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
 ## 💻 Quickstart
 Run the installer script:
 ```bash
 # Basic installation
 curl https://localai.io/install.sh | sh
 ```
 For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).
 Or run with docker:
 ### CPU only image:
 ```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
 ```
 ### Nvidia GPU:
 ```bash
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
 ```
 ### CPU and GPU image (bigger size):
 ```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
 ```
 ### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
 ```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 # Alternative images:
 # - if you have an Nvidia GPU:
 # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
 # - without preconfigured models
 # docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
 # - without preconfigured models for Nvidia GPUs
 # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12 
 ```
-To load models:
+[💻 Getting started](https://localai.io/basics/getting_started/index.html)
 ```bash
 # From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io)
 local-ai run llama-3.2-1b-instruct:q4_k_m
 # Start LocalAI with the phi-2 model directly from huggingface
 local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
 # Install and run a model from the Ollama OCI registry
 local-ai run ollama://gemma:2b
 # Run a model from a configuration file
 local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 # Install and run a model from a standard OCI registry (e.g., Docker Hub)
 local-ai run oci://localai/phi-2:latest
 ```
 For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
 ## 📰 Latest project news
 - Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
 - Apr 2025: WebUI overhaul, AIO images updates
 - Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
 - Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
 - Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
 - Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723. P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
+- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
 - June 2024: 🆕 You can now browse the model gallery without LocalAI! Check out https://models.localai.io
 - June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
 - May 2024: 🔥🔥 Decentralized P2P llama.cpp:  https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs  https://localai.io/features/distribute/
 - May 2024: 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
 - May 2024: 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
 - May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
 - May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
 - April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
 Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 ## 🔥🔥 Hot topics (looking for help):
 - Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
 - Realtime API https://github.com/mudler/LocalAI/issues/3714
 - 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
 - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
 - Assistant API: https://github.com/mudler/LocalAI/issues/1273
 - Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
 - Vulkan: https://github.com/mudler/LocalAI/issues/1647
 - Anthropic API: https://github.com/mudler/LocalAI/issues/1808
 If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
 ## 🚀 [Features](https://localai.io/features/)
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
 - 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
 - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation](https://localai.io/features/image-generation)
+- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
 - 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/) 
 - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
@@ -177,10 +111,11 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
 - [Agentic capabilities](https://github.com/mudler/LocalAGI)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!
 ## 💻 Usage
 Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
 ### 🔗 Community and integrations
@@ -198,7 +133,6 @@ Model galleries
 Other:
 - Helm chart https://github.com/go-skynet/helm-charts
 - VSCode extension https://github.com/badgooooor/localai-vscode-plugin
 - Langchain: https://python.langchain.com/docs/integrations/providers/localai/
 - Terminal utility https://github.com/djcopley/ShellOracle
 - Local Smart assistant https://github.com/mudler/LocalAGI
 - Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
@@ -206,9 +140,6 @@ Other:
 - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
 - Shell-Pilot (Interact with LLMs using LocalAI models via pure shell scripts on your Linux or macOS system) https://github.com/reid41/shell-pilot
 - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
 - Another Telegram Bot https://github.com/JackBekket/Hellper
 - Auto-documentation https://github.com/JackBekket/Reflexia
 - Github bot which answers issues, with code and documentation as context https://github.com/JackBekket/GitHelper
 - Github Actions: https://github.com/marketplace/actions/start-localai
 - Examples: https://github.com/mudler/LocalAI/tree/master/examples/
@@ -256,7 +187,7 @@ A huge thank you to our generous sponsors who support this project covering CI e
 <p align="center">
  <a href="https://www.spectrocloud.com/" target="blank">
-    <img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
+    <img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
  </a>
  <a href="https://www.premai.io/" target="blank">
    <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
@@ -283,6 +214,7 @@ LocalAI couldn't have been built without the help of great software already avai
 - https://github.com/antimatter15/alpaca.cpp
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/ggerganov/whisper.cpp
 - https://github.com/saharNooby/rwkv.cpp
 - https://github.com/rhasspy/piper
 ## 🤗 Contributors
--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -1,7 +1,7 @@
 embeddings: true
 name: text-embedding-ada-002
 backend: bert-embeddings
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
 usage: |
    You can test this model with curl like this:
--- a/aio/cpu/image-gen.yaml
+++ b/aio/cpu/image-gen.yaml
@@ -1,17 +1,56 @@
 name: stablediffusion
-backend: stablediffusion-ggml
+backend: stablediffusion
 cfg_scale: 4.5
 options:
 - sampler:euler
 parameters:
-  model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
+  model: stablediffusion_assets
-step: 25
+
 license: "BSD-3"
 urls:
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
 description: |
     Stable Diffusion in NCNN with c++, supported txt2img and img2img
 download_files:
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
+- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
-  sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
+  sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
-  uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
 - filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
  sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
 - filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
  sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
 - filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
  sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
 - filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
  sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
 - filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
  sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
 - filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
  sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
 - filename: "stablediffusion_assets/log_sigmas.bin"
  sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
 - filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
  sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
 - filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
  sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
 - filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
  sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
 - filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
  sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
 - filename: "stablediffusion_assets/vocab.txt"
  sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
 usage: |
        curl http://localhost:8080/v1/images/generations \
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -1,57 +1,101 @@
 context_size: 8192
 f16: true
 function:
  grammar:
    no_mixed_free_string: true
    schema_type: llama3.1 # or JSON is supported too (json)
  response_regex:
  - <function=(?P<name>\w+)>(?P<arguments>.*)</function>
 mmap: true
 name: gpt-4
 mmap: true
 parameters:
-  model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 context_size: 8192
 stopwords:
- <|im_end|>
+- "<|im_end|>"
- <dummy32000>
+- "<dummy32000>"
- <|eot_id|>
+- "</tool_call>"
- <|end_of_text|>
+- "<|eot_id|>"
 - "<|end_of_text|>"
 function:
  # disable injecting the "answer" tool
  disable_no_action: true
  grammar:
    # This allows the grammar to also return messages
    mixed_mode: true
     # Prefix to add to the grammar
    #prefix: '<tool_call>\n'
    # Force parallel calls in the grammar
    # parallel_calls: true
  return_name_in_function_response: true
  # Without grammar uncomment the lines below
  # Warning: this is relying only on the capability of the
  # LLM model to generate the correct function call.
  json_regex_match: 
   - "(?s)<tool_call>(.*?)</tool_call>"
   - "(?s)<tool_call>(.*?)"
  replace_llm_results:
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
  replace_function_results: 
  # Replace everything that is not JSON array or object
  # 
  - key: '(?s)^[^{\[]*'
    value: ""
  - key: '(?s)[^}\]]*$'
    value: ""
  - key: "'([^']*?)'"
    value: "_DQUOTE_${1}_DQUOTE_"
  - key: '\\"'
    value: "__TEMP_QUOTE__"
  - key: "\'"
    value: "'"
  - key: "_DQUOTE_"
    value: '"'
  - key: "__TEMP_QUOTE__"
    value: '"'
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
 template:
  chat: |
-    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+    {{.Input -}}
-    You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
+    <|im_start|>assistant
    {{.Input }}
    <|start_header_id|>assistant<|end_header_id|>
  chat_message: |
-    <|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall }}
-    {{ else if eq .RoleName "tool" -}}
+    <tool_call>
-    The Function was executed and the response was:
+    {{- else if eq .RoleName "tool" }}
-    {{ end -}}
+    <tool_response>
-    {{ if .Content -}}
+    {{- end }}
-    {{.Content -}}
+    {{- if .Content}}
-    {{ else if .FunctionCall -}}
+    {{.Content }}
-    {{ range .FunctionCall }}
+    {{- end }}
-    [{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
+    {{- if .FunctionCall}}
-    {{ end }}
+    {{toJson .FunctionCall}}
-    {{ end -}}
+    {{- end }}
-    <|eot_id|>
+    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
-    <|start_header_id|>system<|end_header_id|>
+    <|im_start|>system
-    You are an expert in composing functions. You are given a question and a set of possible functions.
+    You are a function calling AI model.
-    Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
+    Here are the available tools:
-    If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
+    <tools>
-    If you decide to invoke any of the function(s), you MUST put it in the format as follows:
+    {{range .Functions}}
-    [func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-    You SHOULD NOT include any other text in the response.
+    {{end}}
-    Here is a list of functions in JSON format that you can invoke.
+    </tools>
-    {{toJson .Functions}}
+    You should call the tools provided to you sequentially
-    <|eot_id|><|start_header_id|>user<|end_header_id|>
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
-    {{.Input}}
+    <scratchpad>
-    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+    {step-by-step reasoning and plan in bullet points}
-
+    </scratchpad>
-download_files:
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+    <tool_call>
-  sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
+    {"arguments": <args-dict>, "name": <function-name>}
-  uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
--- a/aio/cpu/vad.yaml
+++ b/aio/cpu/vad.yaml
@@ -1,8 +0,0 @@
 backend: silero-vad
 name: silero-vad
 parameters:
  model: silero-vad.onnx
 download_files:
 - filename: silero-vad.onnx
  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -1,49 +1,31 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: bakllava-mmproj.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
+  model: bakllava.gguf
-stopwords:
+
 - <|im_end|>
 - <dummy32000>
 - </s>
 - <|endoftext|>
 template:
  chat: |
-    {{.Input -}}
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+    ASSISTANT:
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
+- filename: bakllava.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+  uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: bakllava-mmproj.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
+  uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -129,7 +129,7 @@ detect_gpu
 detect_gpu_size
 PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
-export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
 check_vars
--- a/aio/gpu-8g/embeddings.yaml
+++ b/aio/gpu-8g/embeddings.yaml
@@ -1,7 +1,7 @@
 embeddings: true
 name: text-embedding-ada-002
 backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2
 usage: |
    You can test this model with curl like this:
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -1,53 +1,101 @@
 context_size: 4096
 f16: true
 function:
  capture_llm_results:
  - (?s)<Thought>(.*?)</Thought>
  grammar:
    properties_order: name,arguments
  json_regex_match:
  - (?s)<Output>(.*?)</Output>
  replace_llm_results:
  - key: (?s)<Thought>(.*?)</Thought>
    value: ""
 mmap: true
 name: gpt-4
 mmap: true
 parameters:
-  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 context_size: 8192
 stopwords:
- <|im_end|>
+- "<|im_end|>"
- <dummy32000>
+- "<dummy32000>"
- </s>
+- "</tool_call>"
 - "<|eot_id|>"
 - "<|end_of_text|>"
 function:
  # disable injecting the "answer" tool
  disable_no_action: true
  grammar:
    # This allows the grammar to also return messages
    mixed_mode: true
     # Prefix to add to the grammar
    #prefix: '<tool_call>\n'
    # Force parallel calls in the grammar
    # parallel_calls: true
  return_name_in_function_response: true
  # Without grammar uncomment the lines below
  # Warning: this is relying only on the capability of the
  # LLM model to generate the correct function call.
  json_regex_match: 
   - "(?s)<tool_call>(.*?)</tool_call>"
   - "(?s)<tool_call>(.*?)"
  replace_llm_results:
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
  replace_function_results: 
  # Replace everything that is not JSON array or object
  # 
  - key: '(?s)^[^{\[]*'
    value: ""
  - key: '(?s)[^}\]]*$'
    value: ""
  - key: "'([^']*?)'"
    value: "_DQUOTE_${1}_DQUOTE_"
  - key: '\\"'
    value: "__TEMP_QUOTE__"
  - key: "\'"
    value: "'"
  - key: "_DQUOTE_"
    value: '"'
  - key: "__TEMP_QUOTE__"
    value: '"'
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{ .RoleName }}
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall }}
-    Function call:
+    <tool_call>
-    {{ else if eq .RoleName "tool" -}}
+    {{- else if eq .RoleName "tool" }}
-    Function response:
+    <tool_response>
-    {{ end -}}
+    {{- end }}
-    {{ if .Content -}}
+    {{- if .Content}}
    {{.Content }}
-    {{ end -}}
+    {{- end }}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
+    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
    <|im_start|>system
-    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    You are a function calling AI model.
    Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    <|im_end|>
+    </tools>
    You should call the tools provided to you sequentially
    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
    <scratchpad>
    {step-by-step reasoning and plan in bullet points}
    </scratchpad>
    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
    <tool_call>
    {"arguments": <args-dict>, "name": <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
-    <|im_start|>assistant
+    <|im_start|>assistant
 download_files:
 - filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
  sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
  uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
--- a/aio/gpu-8g/vad.yaml
+++ b/aio/gpu-8g/vad.yaml
@@ -1,8 +0,0 @@
 backend: silero-vad
 name: silero-vad
 parameters:
  model: silero-vad.onnx
 download_files:
 - filename: silero-vad.onnx
  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -1,49 +1,35 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
-stopwords:
+  temperature: 0.2
- <|im_end|>
+  top_k: 40
- <dummy32000>
+  top_p: 0.95
- </s>
+  seed: -1
- <|endoftext|>
+
 template:
  chat: |
-    {{.Input -}}
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+    ASSISTANT:
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/intel/embeddings.yaml
+++ b/aio/intel/embeddings.yaml
@@ -1,7 +1,7 @@
 embeddings: true
 name: text-embedding-ada-002
 backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2
 usage: |
    You can test this model with curl like this:
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -1,53 +1,103 @@
 context_size: 4096
 f16: true
 function:
  capture_llm_results:
  - (?s)<Thought>(.*?)</Thought>
  grammar:
    properties_order: name,arguments
  json_regex_match:
  - (?s)<Output>(.*?)</Output>
  replace_llm_results:
  - key: (?s)<Thought>(.*?)</Thought>
    value: ""
 mmap: true
 name: gpt-4
 mmap: false
 context_size: 8192
 f16: false
 parameters:
-  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 stopwords:
- <|im_end|>
+- "<|im_end|>"
- <dummy32000>
+- "<dummy32000>"
- </s>
+- "</tool_call>"
 - "<|eot_id|>"
 - "<|end_of_text|>"
 function:
  # disable injecting the "answer" tool
  disable_no_action: true
  grammar:
    # This allows the grammar to also return messages
    mixed_mode: true
    # Prefix to add to the grammar
    #prefix: '<tool_call>\n'
    # Force parallel calls in the grammar
    # parallel_calls: true
  return_name_in_function_response: true
  # Without grammar uncomment the lines below
  # Warning: this is relying only on the capability of the
  # LLM model to generate the correct function call.
  json_regex_match: 
   - "(?s)<tool_call>(.*?)</tool_call>"
   - "(?s)<tool_call>(.*?)"
  replace_llm_results:
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
  replace_function_results: 
  # Replace everything that is not JSON array or object
  # 
  - key: '(?s)^[^{\[]*'
    value: ""
  - key: '(?s)[^}\]]*$'
    value: ""
  - key: "'([^']*?)'"
    value: "_DQUOTE_${1}_DQUOTE_"
  - key: '\\"'
    value: "__TEMP_QUOTE__"
  - key: "\'"
    value: "'"
  - key: "_DQUOTE_"
    value: '"'
  - key: "__TEMP_QUOTE__"
    value: '"'
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{ .RoleName }}
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall }}
-    Function call:
+    <tool_call>
-    {{ else if eq .RoleName "tool" -}}
+    {{- else if eq .RoleName "tool" }}
-    Function response:
+    <tool_response>
-    {{ end -}}
+    {{- end }}
-    {{ if .Content -}}
+    {{- if .Content}}
    {{.Content }}
-    {{ end -}}
+    {{- end }}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
+    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
    <|im_start|>system
-    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    You are a function calling AI model.
    Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    <|im_end|>
+    </tools>
    You should call the tools provided to you sequentially
    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
    <scratchpad>
    {step-by-step reasoning and plan in bullet points}
    </scratchpad>
    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
    <tool_call>
    {"arguments": <args-dict>, "name": <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
 - filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
--- a/aio/intel/vad.yaml
+++ b/aio/intel/vad.yaml
@@ -1,8 +0,0 @@
 backend: silero-vad
 name: silero-vad
 parameters:
  model: silero-vad.onnx
 download_files:
 - filename: silero-vad.onnx
  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -1,50 +1,35 @@
 backend: llama-cpp
 context_size: 4096
-f16: true
+mmap: false
-mmap: true
+f16: false
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
-stopwords:
+  temperature: 0.2
- <|im_end|>
+  top_k: 40
- <dummy32000>
+  top_p: 0.95
- </s>
+  seed: -1
- <|endoftext|>
+
 template:
  chat: |
-    {{.Input -}}
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+    ASSISTANT:
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/assets.go
+++ b/assets.go
@@ -1,15 +1,6 @@
 package main
-import (
+import "embed"
 	rice "github.com/GeertJohan/go.rice"
 )
-var backendAssets *rice.Box
+//go:embed backend-assets/*
-
+var backendAssets embed.FS
 func init() {
 	var err error
 	backendAssets, err = rice.FindBox("backend-assets")
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -14,7 +14,6 @@ service Backend {
  rpc PredictStream(PredictOptions) returns (stream Reply) {}
  rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -29,8 +28,6 @@ service Backend {
  rpc Rerank(RerankRequest) returns (RerankResult) {}
  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
  rpc VAD(VADRequest) returns (VADResponse) {}
 }
 // Define the empty request
@@ -160,12 +157,6 @@ message Reply {
  bytes message = 1;
  int32 tokens = 2;
  int32 prompt_tokens = 3;
  double timing_prompt_processing = 4;
  double timing_token_generation = 5;
 }
 // GrammarTrigger is the element type of ModelOptions.GrammarTriggers.
 // NOTE(review): presumably `word` is the text that activates a lazily
 // applied grammar; confirm against the backend that consumes it.
 message GrammarTrigger {
  string word = 1;
 }
 message ModelOptions {
@@ -191,7 +182,11 @@ message ModelOptions {
  int32 NGQA = 20;
  string ModelFile = 21;
-
+  // AutoGPTQ
  string Device = 22;
  bool UseTriton = 23;
  string ModelBaseName = 24;
  bool UseFastTokenizer = 25;
  // Diffusers
  string PipelineType = 26;
@@ -224,12 +219,6 @@ message ModelOptions {
  int32  SwapSpace = 53;
  int32  MaxModelLen = 54;
  int32  TensorParallelSize = 55;
  string LoadFormat = 58;
  bool   DisableLogStatus = 66;
  string DType = 67;
  int32  LimitImagePerPrompt = 68;
  int32  LimitVideoPerPrompt = 69;
  int32  LimitAudioPerPrompt = 70;
  string MMProj = 41;
@@ -243,18 +232,6 @@ message ModelOptions {
  bool FlashAttention = 56;
  bool NoKVOffload = 57;
  string ModelPath = 59;
  repeated string LoraAdapters = 60;
  repeated float LoraScales = 61;
  repeated string Options = 62;
  string CacheTypeKey = 63;
  string CacheTypeValue = 64;
  repeated GrammarTrigger GrammarTriggers = 65;
 }
 message Result {
@@ -302,19 +279,6 @@ message GenerateImageRequest {
  int32 CLIPSkip = 11;
 }
 // GenerateVideoRequest is the input message of the Backend.GenerateVideo RPC,
 // which answers with a Result message.
 message GenerateVideoRequest {
  string prompt = 1;       // NOTE(review): presumably the text prompt driving generation; confirm
  string start_image = 2;  // Path or base64 encoded image for the start frame
  string end_image = 3;    // Path or base64 encoded image for the end frame
  int32 width = 4;         // NOTE(review): presumably output width in pixels; confirm
  int32 height = 5;        // NOTE(review): presumably output height in pixels; confirm
  int32 num_frames = 6;    // Number of frames to generate
  int32 fps = 7;          // Frames per second
  int32 seed = 8;          // NOTE(review): presumably the RNG seed; confirm
  float cfg_scale = 9;    // Classifier-free guidance scale
  string dst = 10;        // Output path for the generated video
 }
 message TTSRequest {
  string text = 1;
  string model = 2;
@@ -323,19 +287,6 @@ message TTSRequest {
  optional string language = 5;
 }
 // VADRequest is the input message of the Backend.VAD RPC.
 message VADRequest {
  // Audio samples to analyse.
  // NOTE(review): sample rate and channel layout are not specified here; confirm with the backend.
  repeated float audio = 1;
 }
 // VADSegment is one entry of VADResponse.segments, described by a start and
 // an end value.
 // NOTE(review): the unit of start/end (seconds vs. sample index) is not
 // stated in this file; confirm with the backend implementation.
 message VADSegment {
  float start = 1;
  float end = 2;
 }
 // VADResponse is the output message of the Backend.VAD RPC.
 message VADResponse {
  // Segments reported for the audio submitted in the VADRequest.
  repeated VADSegment segments = 1;
 }
 message SoundGenerationRequest {
  string text = 1;
  string model = 2;
@@ -371,4 +322,4 @@ message StatusResponse {
 message Message {
  string role = 1;
  string content = 2;
-}
+}
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -2,7 +2,7 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is a hack for now, but it should be fixed in the future.
 set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
 install(TARGETS ${TARGET} LIBRARY)
 target_include_directories(myclip PUBLIC .)
 target_include_directories(myclip PUBLIC ../..)
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
@@ -22,7 +22,7 @@ else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we also have to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIP=ON
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
@@ -30,24 +30,19 @@ else ifeq ($(OS),Darwin)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
 		CMAKE_ARGS+=-DGGML_METAL=ON
-		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+# Until this is tested properly, we disable embedded metal file
 # as we already embed it as part of the LocalAI assets
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
 		TARGET+=--target ggml-metal
 	endif
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DCMAKE_CXX_FLAGS="-fsycl" \
 		-DGGML_SYCL_F16=ON
 endif
 ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DCMAKE_CXX_FLAGS="-fsycl"
 endif
 llama.cpp:
@@ -59,8 +54,8 @@ llama.cpp:
 	git checkout -b build $(LLAMA_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
-llama.cpp/tools/grpc-server: llama.cpp
+llama.cpp/examples/grpc-server: llama.cpp
-	mkdir -p llama.cpp/tools/grpc-server
+	mkdir -p llama.cpp/examples/grpc-server
 	bash prepare.sh
 rebuild:
@@ -70,13 +65,13 @@ rebuild:
 purge:
 	rm -rf llama.cpp/build
-	rm -rf llama.cpp/tools/grpc-server
+	rm -rf llama.cpp/examples/grpc-server
 	rm -rf grpc-server
 clean: purge
 	rm -rf llama.cpp
-grpc-server: llama.cpp llama.cpp/tools/grpc-server
+grpc-server: llama.cpp llama.cpp/examples/grpc-server
 	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
@@ -84,4 +79,4 @@ ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 else
 	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
 endif
-	cp llama.cpp/build/bin/grpc-server .
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -52,7 +52,7 @@ struct server_params
 {
    std::string hostname = "127.0.0.1";
    std::vector<std::string> api_keys;
-    std::string public_path = "tools/server/public";
+    std::string public_path = "examples/server/public";
    std::string chat_template = "";
    int32_t port = 8080;
    int32_t read_timeout = 600;
@@ -113,7 +113,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
    std::string ret;
    for (; begin != end; ++begin)
    {
-        ret += common_token_to_piece(ctx, *begin);
+        ret += llama_token_to_piece(ctx, *begin);
    }
    return ret;
 }
@@ -121,7 +121,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
 // format incomplete utf-8 multibyte character for output
 static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
 {
-    std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
+    std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
    // if the size is 1 and first bit is 1, meaning it's a partial character
    //   (size > 1 meaning it's already a known token)
    if (out.size() == 1 && (out[0] & 0x80) == 0x80)
@@ -134,32 +134,6 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
    return out;
 }
 // Registers each RPC server endpoint listed in `servers` as a ggml backend device.
 //
 // `servers` is split on ','. Every resulting entry is handed to the RPC
 // backend's "ggml_backend_rpc_add_device" function, which is resolved at
 // runtime through the backend registry rather than linked directly.
 //
 // Throws std::invalid_argument when:
 //   - the split yields no entries,
 //   - no backend named "RPC" is found in the registry,
 //   - the add-device function cannot be resolved, or
 //   - any endpoint fails to produce a device.
 // Devices registered before a failing endpoint are not rolled back.
 //
 // Reference:
 // https://github.com/ggerganov/llama.cpp/compare/4dbc8b9cb71876e005724f4e8f73a3544646bcf5..3edfa7d3753c29e44b964c0ff424d2ea8d5fdee6
 static void add_rpc_devices(std::string servers) {
    auto rpc_servers = string_split<std::string>(servers, ',');
    if (rpc_servers.empty()) {
        throw std::invalid_argument("no RPC servers specified");
    }
    // Look up the RPC backend by name in the backend registry.
    ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
    if (!rpc_reg) {
        throw std::invalid_argument("failed to find RPC backend");
    }
    // Resolve the add-device entry point by name through the registry.
    typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
    ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
    if (!ggml_backend_rpc_add_device_fn) {
        throw std::invalid_argument("failed to find RPC device add function");
    }
    // Create one device per endpoint and register it; abort on the first failure.
    for (const auto & server : rpc_servers) {
        ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
        if (dev) {
            ggml_backend_device_register(dev);
        } else {
            throw std::invalid_argument("failed to register RPC device");
        }
    }
 }
 // convert a vector of completion_token_output to json
 static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
 {
@@ -229,8 +203,8 @@ struct llama_client_slot
    std::string stopping_word;
    // sampling
-    struct common_params_sampling sparams;
+    struct gpt_sampler_params sparams;
-    common_sampler *ctx_sampling = nullptr;
+    gpt_sampler *ctx_sampling = nullptr;
    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@@ -283,7 +257,7 @@ struct llama_client_slot
        images.clear();
    }
-    bool has_budget(common_params &global_params) {
+    bool has_budget(gpt_params &global_params) {
        if (params.n_predict == -1 && global_params.n_predict == -1)
        {
            return true; // limitless
@@ -417,48 +391,14 @@ struct llama_metrics {
    }
 };
 // Builds a llama_batch that carries embeddings instead of tokens.
 //
 // The member vectors own the per-position arrays and `batch` stores raw
 // pointers into them (pos, n_seq_id, seq_ids, logits). The embedding buffer
 // itself is not copied: `batch.embd` is the caller's `embd` pointer.
 // NOTE(review): because `batch` points into the member vectors, a copy of
 // this struct would presumably still reference the original's storage; confirm
 // it is only used in place.
 struct llava_embd_batch {
    std::vector<llama_pos>      pos;       // position of each entry
    std::vector<int32_t>        n_seq_id;  // number of sequence ids per entry (always 1 here)
    std::vector<llama_seq_id>   seq_id_0;  // single-element storage holding the shared sequence id
    std::vector<llama_seq_id *> seq_ids;   // per-entry pointer to the sequence id list, plus a trailing nullptr
    std::vector<int8_t>         logits;    // per-entry logits flag (always false here)
    llama_batch batch;
    // embd:     embedding buffer stored as-is in batch.embd
    //           (assumes it holds n_tokens embedding vectors; TODO confirm with callers)
    // n_tokens: number of entries in the batch
    // pos_0:    position of the first entry; entry i gets position pos_0 + i
    // seq_id:   sequence id assigned to every entry
    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
        pos     .resize(n_tokens);
        n_seq_id.resize(n_tokens);
        seq_ids .resize(n_tokens + 1); // one extra slot for the nullptr terminator set below
        logits  .resize(n_tokens);
        seq_id_0.resize(1);
        seq_id_0[0] = seq_id;
        seq_ids [n_tokens] = nullptr;
        batch = {
            /*n_tokens       =*/ n_tokens,
            /*tokens         =*/ nullptr,
            /*embd           =*/ embd,
            /*pos            =*/ pos.data(),
            /*n_seq_id       =*/ n_seq_id.data(),
            /*seq_id         =*/ seq_ids.data(),
            /*logits         =*/ logits.data(),
        };
        // Fill the per-entry arrays through the batch pointers (which alias the vectors above).
        for (int i = 0; i < n_tokens; i++) {
            batch.pos     [i] = pos_0 + i;
            batch.n_seq_id[i] = 1;
            batch.seq_id  [i] = seq_id_0.data(); // every entry shares the same one-element id list
            batch.logits  [i] = false;
        }
    }
 };
 struct llama_server_context
 {
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    const llama_vocab * vocab = nullptr;
    clip_ctx *clp_ctx = nullptr;
-    common_params params;
+    gpt_params params;
    llama_batch batch;
@@ -466,11 +406,6 @@ struct llama_server_context
    bool clean_kv_cache     = true;
    bool all_slots_are_idle = false;
    bool add_bos_token      = true;
    bool has_eos_token      = true;
    bool has_gpu = false;
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers;
    int32_t n_ctx;  // total context for all clients / slots
@@ -506,18 +441,15 @@ struct llama_server_context
        }
    }
-    bool load_model(const common_params &params_)
+    bool load_model(const gpt_params &params_)
    {
        params = params_;
-        if (!params.mmproj.path.empty()) {
+        if (!params.mmproj.empty()) {
            multimodal = true;
            LOG_INFO("Multi Modal Mode Enabled", {});
-            clp_ctx = clip_init(params.mmproj.path.c_str(), clip_context_params {
+            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
                /* use_gpu */ has_gpu,
                /*verbosity=*/ GGML_LOG_LEVEL_INFO,
            });
            if(clp_ctx == nullptr) {
-                LOG_ERR("unable to load clip model: %s", params.mmproj.path.c_str());
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                return false;
            }
@@ -526,18 +458,18 @@ struct llama_server_context
            }
        }
-        common_init_result common_init = common_init_from_params(params);
+        llama_init_result llama_init = llama_init_from_gpt_params(params);
-        model = common_init.model.release();
+        model = llama_init.model;
-        ctx = common_init.context.release();
+        ctx = llama_init.context;
        if (model == nullptr)
        {
-            LOG_ERR("unable to load model: %s", params.model.path.c_str());
+            LOG_ERR("unable to load model: %s", params.model.c_str());
            return false;
        }
        if (multimodal) {
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
-            const int n_embd_llm  = llama_model_n_embd(model);
+            const int n_embd_llm  = llama_n_embd(model);
            if (n_embd_clip != n_embd_llm) {
                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
@@ -546,15 +478,23 @@ struct llama_server_context
            }
        }
        vocab = llama_model_get_vocab(model);
        n_ctx = llama_n_ctx(ctx);
-        add_bos_token = llama_vocab_get_add_bos(vocab);
+        add_bos_token = llama_add_bos_token(model);
        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        return true;
    }
    // Probes whether the loaded model's chat template can be applied and, if
    // not, switches `sparams.chat_template` to a chatml fallback.
    //
    // The probe formats a single dummy user message ("test") with
    // llama_chat_apply_template into a 1-byte buffer; only the sign of the
    // return value is inspected, the formatted text is discarded.
    // NOTE(review): passing nullptr as the template argument presumably selects
    // the template embedded in the model; confirm against llama.h.
    //
    // sparams: server parameters; chat_template is overwritten only when the
    //          probe returns a negative value.
    void validate_model_chat_template(server_params & sparams) {
        llama_chat_message chat[] = {{"user", "test"}};
        std::vector<char> buf(1);
        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
        if (res < 0) {
            // NOTE(review): __func__ is passed but the message has no format specifier for it.
            LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exists in the template
        }
    }
    llama_client_slot* get_active_slot() {
        for (llama_client_slot& slot : slots) {
            // Check if the slot is currently processing
@@ -638,12 +578,12 @@ struct llama_server_context
                    std::vector<llama_token> p;
                    if (first)
                    {
-                        p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                        first = false;
                    }
                    else
                    {
-                        p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
+                        p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                    }
                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                }
@@ -660,7 +600,7 @@ struct llama_server_context
        else
        {
            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
        }
        return prompt_tokens;
@@ -689,7 +629,7 @@ struct llama_server_context
    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
        slot_params default_params;
-        common_params_sampling default_sparams;
+        gpt_sampler_params default_sparams;
        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
@@ -697,6 +637,7 @@ struct llama_server_context
        slot->sparams.top_k             = json_value(data, "top_k",             default_sparams.top_k);
        slot->sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
        slot->sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
        slot->sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
        slot->sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
        slot->sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
@@ -708,13 +649,12 @@ struct llama_server_context
        slot->sparams.mirostat          = json_value(data, "mirostat",          default_sparams.mirostat);
        slot->sparams.mirostat_tau      = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
        slot->sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot->params.n_keep             = json_value(data, "n_keep",            slot->params.n_keep);
        slot->sparams.seed               = json_value(data, "seed",              default_sparams.seed);
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
        slot->sparams.grammar_triggers = grammar_triggers;
        slot->sparams.grammar_lazy = grammar_lazy;
        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
            // Might be better to reject the request with a 400 ?
@@ -754,8 +694,8 @@ struct llama_server_context
            slot->prompt = "";
        }
-        if (json_value(data, "ignore_eos", false) && has_eos_token) {
+        if (json_value(data, "ignore_eos", false)) {
-                slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
+                slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
        }
        /*
        slot->sparams.penalty_prompt_tokens.clear();
@@ -794,13 +734,13 @@ struct llama_server_context
            }
        }
      */
        slot->sparams.logit_bias.clear();
        const auto &logit_bias = data.find("logit_bias");
        if (logit_bias != data.end() && logit_bias->is_array())
        {
-            const llama_vocab * vocab = llama_model_get_vocab(model);
+            const int n_vocab = llama_n_vocab(model);
            const int n_vocab = llama_vocab_n_tokens(vocab);
            for (const auto &el : *logit_bias)
            {
                if (el.is_array() && el.size() == 2)
@@ -829,7 +769,7 @@ struct llama_server_context
                    }
                    else if (el[0].is_string())
                    {
-                        auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
+                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
                            slot->sparams.logit_bias.push_back({tok, bias});
@@ -861,7 +801,7 @@ struct llama_server_context
                        sampler_names.emplace_back(name);
                    }
                }
-                slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
+                slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
        }
        else
        {
@@ -945,9 +885,9 @@ struct llama_server_context
        if (slot->ctx_sampling != nullptr)
        {
-            common_sampler_free(slot->ctx_sampling);
+            gpt_sampler_free(slot->ctx_sampling);
        }
-        slot->ctx_sampling = common_sampler_init(model, slot->sparams);
+        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
        //llama_set_rng_seed(ctx, slot->params.seed);
        slot->command = LOAD_PROMPT;
@@ -974,13 +914,13 @@ struct llama_server_context
        system_tokens.clear();
        if (!system_prompt.empty()) {
-            system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);
+            system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
-            common_batch_clear(batch);
+            llama_batch_clear(batch);
            for (int i = 0; i < (int)system_tokens.size(); ++i)
            {
-                common_batch_add(batch, system_tokens[i], i, { 0 }, false);
+                llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
            }
            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)
@@ -994,6 +934,7 @@ struct llama_server_context
                    batch.n_seq_id + i,
                    batch.seq_id   + i,
                    batch.logits   + i,
                    0, 0, 0, // unused
                };
                if (llama_decode(ctx, batch_view) != 0)
                {
@@ -1068,7 +1009,7 @@ struct llama_server_context
    bool process_token(completion_token_output &result, llama_client_slot &slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = common_token_to_piece(ctx, result.tok);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok);
        slot.sampled = result.tok;
        // search stop word and delete it
@@ -1159,15 +1100,7 @@ struct llama_server_context
            slot.has_next_token = false;
        }
-        if (slot.n_past >= slot.n_ctx) {
+        if (result.tok == llama_token_eos(model))
            slot.truncated      = true;
            slot.stopped_limit = true;
            slot.has_next_token = false;
            LOG_VERBOSE("stopped due to running out of context capacity", {});
        }
        if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
        {
            slot.stopped_eos = true;
            slot.has_next_token = false;
@@ -1227,7 +1160,7 @@ struct llama_server_context
        samplers.reserve(slot.sparams.samplers.size());
        for (const auto & sampler : slot.sparams.samplers)
        {
-            samplers.emplace_back(common_sampler_type_to_str(sampler));
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
        }
        return json {
@@ -1241,6 +1174,7 @@ struct llama_server_context
            {"top_k",             slot.sparams.top_k},
            {"top_p",             slot.sparams.top_p},
            {"min_p",             slot.sparams.min_p},
            {"tfs_z",             slot.sparams.tfs_z},
            {"typical_p",         slot.sparams.typ_p},
            {"repeat_last_n",     slot.sparams.penalty_last_n},
            {"repeat_penalty",    slot.sparams.penalty_repeat},
@@ -1249,12 +1183,13 @@ struct llama_server_context
            {"mirostat",          slot.sparams.mirostat},
            {"mirostat_tau",      slot.sparams.mirostat_tau},
            {"mirostat_eta",      slot.sparams.mirostat_eta},
            {"penalize_nl",       slot.sparams.penalize_nl},
            {"stop",              slot.params.antiprompt},
            {"n_predict",         slot.params.n_predict},
            {"n_keep",            params.n_keep},
            {"ignore_eos",        slot.sparams.ignore_eos},
            {"stream",            slot.params.stream},
-             //      {"logit_bias",        slot.sparams.logit_bias},
+      //      {"logit_bias",        slot.sparams.logit_bias},
            {"n_probs",           slot.sparams.n_probs},
            {"min_keep",          slot.sparams.min_keep},
            {"grammar",           slot.sparams.grammar},
@@ -1281,7 +1216,7 @@ struct llama_server_context
        if (slot.sparams.n_probs > 0)
        {
            std::vector<completion_token_output> probs_output = {};
-            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
            size_t probs_pos      = std::min(slot.sent_token_probs_index,                       slot.generated_token_probs.size());
            size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
            if (probs_pos < probs_stop_pos)
@@ -1333,7 +1268,7 @@ struct llama_server_context
            std::vector<completion_token_output> probs = {};
            if (!slot.params.stream && slot.stopped_word)
            {
-                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
+                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
                probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
            }
            else
@@ -1354,7 +1289,7 @@ struct llama_server_context
        queue_results.send(res);
    }
-    void send_embedding(llama_client_slot &slot, const llama_batch & batch)
+    void send_embedding(llama_client_slot &slot)
    {
        task_result res;
        res.id = slot.task_id;
@@ -1362,7 +1297,7 @@ struct llama_server_context
        res.error = false;
        res.stop = true;
-        const int n_embd = llama_model_n_embd(model);
+        const int n_embd = llama_n_embd(model);
        if (!params.embedding)
        {
            LOG_WARNING("embedding disabled", {
@@ -1376,38 +1311,10 @@ struct llama_server_context
        else
        {
            const float *data = llama_get_embeddings(ctx);
-            std::vector<float> embd_res(n_embd, 0.0f);
+            std::vector<float> embedding(data, data + n_embd);
            std::vector<std::vector<float>> embedding;
            for (int i = 0; i < batch.n_tokens; ++i) {
                if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                    continue;
                }
                const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
                if (embd == NULL) {
                    embd = llama_get_embeddings_ith(ctx, i);
                }
                if (embd == NULL) {
                    LOG("failed to get embeddings");
                    continue;
                }
                // normalize only when there is pooling
                // TODO: configurable
                if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
                    common_embd_normalize(embd, embd_res.data(), n_embd, 2);
                    embedding.push_back(embd_res);
                } else {
                    embedding.push_back({ embd, embd + n_embd });
                }
            }
            // OAI compat
            res.result_json = json
            {
-                {"embedding", embedding[0] },
+                {"embedding", embedding },
            };
        }
        queue_results.send(res);
@@ -1472,6 +1379,7 @@ struct llama_server_context
                    batch.n_seq_id + i,
                    batch.seq_id   + i,
                    batch.logits   + i,
                    0, 0, 0, // unused
                };
                if (llama_decode(ctx, batch_view))
                {
@@ -1489,10 +1397,9 @@ struct llama_server_context
                    n_eval = n_batch;
                }
-                const int n_embd = llama_model_n_embd(model);
+                const int n_embd = llama_n_embd(model);
-                float * embd = img.image_embedding + i * n_embd;
+                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
-                llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
+                if (llama_decode(ctx, batch_img))
                if (llama_decode(ctx, llava_batch.batch))
                {
                    LOG("%s : failed to eval image\n", __func__);
                    return false;
@@ -1501,7 +1408,7 @@ struct llama_server_context
            }
            image_idx++;
-            common_batch_clear(batch);
+            llama_batch_clear(batch);
            // append prefix of next image
            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
@@ -1511,7 +1418,7 @@ struct llama_server_context
            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
            for (int i = 0; i < (int) append_tokens.size(); ++i)
            {
-                common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
+                llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
                slot.n_past += 1;
            }
        }
@@ -1643,7 +1550,7 @@ struct llama_server_context
            update_system_prompt();
        }
-        common_batch_clear(batch);
+        llama_batch_clear(batch);
        if (all_slots_are_idle)
        {
@@ -1667,17 +1574,17 @@ struct llama_server_context
            {
                if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                {
                    // this check is redundant (for good)
                    // we should never get here, because generation should already stopped in process_token()
                    // START LOCALAI changes
                    // Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
                    // See: https://github.com/mudler/LocalAI/issues/1333
                    // Context is exhausted, release the slot
                    slot.release();
                    send_final_response(slot);
-                    slot.has_next_token = false;
+                    slot.cache_tokens.clear();
-                    LOG_ERROR("context is exhausted, release the slot", {});
+                    slot.n_past = 0;
                    slot.truncated = false;
                    slot.has_next_token = true;
                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
                    continue;
                    // END LOCALAI changes
@@ -1721,7 +1628,7 @@ struct llama_server_context
            // TODO: we always have to take into account the "system_tokens"
            //       this is not great and needs to be improved somehow
-            common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
+            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
            slot.n_past += 1;
        }
@@ -1770,11 +1677,11 @@ struct llama_server_context
                            suffix_tokens.erase(suffix_tokens.begin());
                        }
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                        prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
-                        prefix_tokens.insert(prefix_tokens.end(),   llama_vocab_fim_suf(vocab));
+                        prefix_tokens.insert(prefix_tokens.end(),   llama_token_suffix(model));
                        prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
-                        prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
+                        prefix_tokens.push_back(llama_token_middle(model));
                        prompt_tokens = prefix_tokens;
                    }
                    else
@@ -1815,7 +1722,7 @@ struct llama_server_context
                    if (!slot.params.cache_prompt)
                    {
-                        common_sampler_reset(slot.ctx_sampling);
+                        gpt_sampler_reset(slot.ctx_sampling);
                        slot.n_past = 0;
                        slot.n_past_se = 0;
@@ -1827,7 +1734,7 @@ struct llama_server_context
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {
-                            common_sampler_accept(slot.ctx_sampling, token, false);
+                            gpt_sampler_accept(slot.ctx_sampling, token, false);
                        }
                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1919,7 +1826,7 @@ struct llama_server_context
                                ga_i += ga_w/ga_n;
                            }
                        }
-                        common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
+                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
                        slot_npast++;
                    }
@@ -1997,6 +1904,7 @@ struct llama_server_context
                batch.n_seq_id + i,
                batch.seq_id   + i,
                batch.logits   + i,
                0, 0, 0, // unused
            };
            const int ret = llama_decode(ctx, batch_view);
@@ -2028,16 +1936,16 @@ struct llama_server_context
                // prompt evaluated for embedding
                if (slot.embedding)
                {
-                    send_embedding(slot, batch_view);
+                    send_embedding(slot);
                    slot.release();
                    slot.i_batch = -1;
                    continue;
                }
                completion_token_output result;
-                const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+                const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
-                common_sampler_accept(slot.ctx_sampling, id, true);
+                gpt_sampler_accept(slot.ctx_sampling, id, true);
                slot.n_decoded += 1;
                if (slot.n_decoded == 1)
@@ -2048,7 +1956,7 @@ struct llama_server_context
                }
                result.tok = id;
-                const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);
+                const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                    result.probs.push_back({
@@ -2101,7 +2009,7 @@ static json format_partial_response(
 struct token_translator
 {
    llama_context * ctx;
-    std::string operator()(llama_token tok)                    const { return common_token_to_piece(ctx, tok); }
+    std::string operator()(llama_token tok)                    const { return llama_token_to_piece(ctx, tok); }
    std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
 };
@@ -2122,11 +2030,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 }
 std::function<void(int)> shutdown_handler;
-
+inline void signal_handler(int signal) { shutdown_handler(signal); }
 inline void signal_handler(int signal) {
    exit(1);
 }
 /////////////////////////////////
 ////////////////////////////////
@@ -2170,6 +2074,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    //     slot->params.n_predict        = json_value(data, "n_predict",         default_params.n_predict);
    //     slot->sparams.top_k           = json_value(data, "top_k",             default_sparams.top_k);
    //     slot->sparams.top_p           = json_value(data, "top_p",             default_sparams.top_p);
    //     slot->sparams.tfs_z           = json_value(data, "tfs_z",             default_sparams.tfs_z);
    //     slot->sparams.typical_p       = json_value(data, "typical_p",         default_sparams.typical_p);
    //     slot->sparams.temp            = json_value(data, "temperature",       default_sparams.temp);
    //     slot->sparams.penalty_last_n  = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
@@ -2179,6 +2084,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    //     slot->sparams.mirostat        = json_value(data, "mirostat",          default_sparams.mirostat);
    //     slot->sparams.mirostat_tau    = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
    //     slot->sparams.mirostat_eta    = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
    //     slot->sparams.penalize_nl     = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
    //     slot->params.n_keep           = json_value(data, "n_keep",            slot->params.n_keep);
    //     slot->params.seed             = json_value(data, "seed",              default_params.seed);
    //     slot->sparams.grammar         = json_value(data, "grammar",           default_sparams.grammar);
@@ -2192,6 +2098,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["n_predict"] = predict->tokens() == 0 ? -1 : predict->tokens();
    data["top_k"] = predict->topk();
    data["top_p"] = predict->topp();
    data["tfs_z"] = predict->tailfreesamplingz();
    data["typical_p"] = predict->typicalp();
    data["temperature"] = predict->temperature();
    data["repeat_last_n"] = predict->repeat();
@@ -2201,6 +2108,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["mirostat"] = predict->mirostat();
    data["mirostat_tau"] = predict->mirostattau();
    data["mirostat_eta"] = predict->mirostateta();
    data["penalize_nl"] = predict->penalizenl();
    data["n_keep"] = predict->nkeep();
    data["seed"] = predict->seed();
    data["grammar"] = predict->grammar();
@@ -2237,6 +2145,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 //     llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
 //     llama.params.sparams.top_k = predict->topk();
 //     llama.params.sparams.top_p = predict->topp();
 //     llama.params.sparams.tfs_z = predict->tailfreesamplingz();
 //     llama.params.sparams.typical_p = predict->typicalp();
 //     llama.params.sparams.penalty_last_n = predict->repeat();
 //     llama.params.sparams.temp = predict->temperature();
@@ -2246,6 +2155,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 //     llama.params.sparams.mirostat = predict->mirostat();
 //     llama.params.sparams.mirostat_tau = predict->mirostattau();
 //     llama.params.sparams.mirostat_eta = predict->mirostateta();
 //     llama.params.sparams.penalize_nl = predict->penalizenl();
 //     llama.params.n_keep = predict->nkeep();
 //     llama.params.seed = predict->seed();
 //     llama.params.sparams.grammar = predict->grammar();
@@ -2292,54 +2202,19 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
 //     }
 // }
 const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
    GGML_TYPE_F16,
    GGML_TYPE_BF16,
    GGML_TYPE_Q8_0,
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_IQ4_NL,
    GGML_TYPE_Q5_0,
    GGML_TYPE_Q5_1,
 };
 static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (ggml_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
 }
 static std::string get_all_kv_cache_types() {
    std::ostringstream msg;
    for (const auto & type : kv_cache_types) {
        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
    }
    return msg.str();
 }
 static void params_parse(const backend::ModelOptions* request,
-                                common_params & params, llama_server_context &llama) {
+                                gpt_params & params) {
    // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
-    params.model.path = request->modelfile();
+    params.model = request->modelfile();
    if (!request->mmproj().empty()) {
    // get the directory of modelfile
-      std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
+      std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
-      params.mmproj.path = model_dir + "/"+ request->mmproj();
+      params.mmproj = model_dir + "/"+ request->mmproj();
    }
    //  params.model_alias ??
    params.model_alias =  request->modelfile();
    if (!request->cachetypekey().empty()) {
        params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
    }
    if (!request->cachetypevalue().empty()) {
        params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue());
    }
    params.n_ctx = request->contextsize();
    //params.memory_f16 = request->f16memory();
    params.cpuparams.n_threads = request->threads();
@@ -2357,23 +2232,9 @@ static void params_parse(const backend::ModelOptions* request,
    const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
    if (llama_grpc_servers != NULL) {
-        add_rpc_devices(std::string(llama_grpc_servers));
+        params.rpc_servers = std::string(llama_grpc_servers);
    }
     // decode options. Options are in form optname:optvale, or if booleans only optname.
    for (int i = 0; i < request->options_size(); i++) {
        std::string opt = request->options(i);
        char *optname = strtok(&opt[0], ":");
        char *optval = strtok(NULL, ":");
        if (optval == NULL) {
            optval = "true";
        }
        if (!strcmp(optname, "gpu")) {
            llama.has_gpu = true;
        }
    }
    // TODO: Add yarn
    if (!request->tensorsplit().empty()) {
@@ -2405,14 +2266,13 @@ static void params_parse(const backend::ModelOptions* request,
        scale_factor = request->lorascale();
     }
     // get the directory of modelfile
-     std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
+     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
     params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
    }
    params.use_mlock = request->mlock();
    params.use_mmap = request->mmap();
    params.flash_attn = request->flashattention();
    params.no_kv_offload = request->nokvoffload();
    params.ctx_shift = false; // We control context-shifting in any case (and we disable it as it could just lead to infinite loops)
    params.embedding = request->embeddings();
@@ -2437,21 +2297,6 @@ static void params_parse(const backend::ModelOptions* request,
    if ( request->ropefreqscale() != 0.0f ) {
        params.rope_freq_scale = request->ropefreqscale();
    }
    if (request->grammartriggers_size() > 0) {
        LOG_INFO("configuring grammar triggers", {});
        llama.grammar_lazy = true;
        for (int i = 0; i < request->grammartriggers_size(); i++) {
            common_grammar_trigger trigger;
 	    trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
            trigger.value = request->grammartriggers(i).word();
 	    // trigger.at_start = request->grammartriggers(i).at_start();
            llama.grammar_triggers.push_back(trigger);
            LOG_INFO("grammar trigger", {
                { "word", trigger.value },
            });
        }
    }
 }
@@ -2466,8 +2311,8 @@ public:
  grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
    // Implement LoadModel RPC
-    common_params params;
+    gpt_params params;
-    params_parse(request, params, llama);
+    params_parse(request, params);
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -2512,13 +2357,6 @@ public:
                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
                reply.set_prompt_tokens(tokens_evaluated);
                if (result.result_json.contains("timings")) {
                    double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
                    reply.set_timing_prompt_processing(timing_prompt_processing);
                    double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
                    reply.set_timing_token_generation(timing_token_generation);
                }
                // Log Request Correlation Id
                LOG_VERBOSE("correlation:", {
                    { "id", data["correlation_id"] }
@@ -2559,13 +2397,6 @@ public:
            reply->set_prompt_tokens(tokens_evaluated);
            reply->set_tokens(tokens_predicted);
            reply->set_message(completion_text);
            if (result.result_json.contains("timings")) {
                double timing_prompt_processing = result.result_json.at("timings").value("prompt_ms", 0.0);
                reply->set_timing_prompt_processing(timing_prompt_processing);
                double timing_token_generation = result.result_json.at("timings").value("predicted_ms", 0.0);
                reply->set_timing_token_generation(timing_token_generation);
            }
        }
        else
        {
@@ -2600,18 +2431,6 @@ public:
        return grpc::Status::OK;
    }
    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
         json data = parse_options(false, request, llama);
         std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
         for (int i=0 ; i< tokens.size(); i++){
            response->add_tokens(tokens[i]);
         }
        return grpc::Status::OK;
    }
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();
@@ -2644,9 +2463,7 @@ void RunServer(const std::string& server_address) {
  ServerBuilder builder;
  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
  builder.RegisterService(&service);
-  builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
+
  builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
  builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
  std::unique_ptr<Server> server(builder.BuildAndStart());
  std::cout << "Server listening on " << server_address << std::endl;
  server->Wait();
@@ -2655,20 +2472,6 @@ void RunServer(const std::string& server_address) {
 int main(int argc, char** argv) {
  std::string server_address("localhost:50051");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
    sigemptyset (&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, NULL);
    sigaction(SIGTERM, &sigint_action, NULL);
 #elif defined (_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
    };
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
  // Define long and short options
  struct option long_options[] = {
      {"addr", required_argument, nullptr, 'a'},
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -1,13 +1,13 @@
-diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
+diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 3cd0d2fa..6c5e811a 100644
+index 342042ff..224db9b5 100644
--- a/tools/mtmd/clip.cpp
+--- a/examples/llava/clip.cpp
-+++ b/tools/mtmd/clip.cpp
+++ b/examples/llava/clip.cpp
-@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
+@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
-                 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+             struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-                 int* patches_data = (int*)malloc(ggml_nbytes(patches));
+             int* patches_data = (int*)malloc(ggml_nbytes(patches));
-                 for (int i = 0; i < num_patches; i++) {
+             for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + 1;
+-                patches_data[i] = i + 1;
-+                    patches_data[i] = i;
+                patches_data[i] = i;
-                 }
+             }
-                 ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+             ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-                 free(patches_data);
+             free(patches_data);
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -7,22 +7,21 @@ for patch in $(ls patches); do
    patch -d llama.cpp/ -p1 < patches/$patch
 done 
-cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
+cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
-cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
+cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
-cp -rfv json.hpp llama.cpp/tools/grpc-server/
+cp -rfv json.hpp llama.cpp/examples/grpc-server/
-cp -rfv utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv utils.hpp llama.cpp/examples/grpc-server/
-if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
+if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
    echo "grpc-server already added"
 else
-    echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
+    echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
 fi
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is an hack for now, but it should be fixed in the future.
-cp -rfv llama.cpp/tools/mtmd/clip.h llama.cpp/tools/grpc-server/clip.h
+cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-cp -rfv llama.cpp/tools/mtmd/clip-impl.h llama.cpp/tools/grpc-server/clip-impl.h
+cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
-cp -rfv llama.cpp/tools/mtmd/llava.cpp llama.cpp/tools/grpc-server/llava.cpp
+echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
-echo '#include "llama.h"' > llama.cpp/tools/grpc-server/llava.h
+cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
-cat llama.cpp/tools/mtmd/llava.h >> llama.cpp/tools/grpc-server/llava.h
+cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
 cp -rfv llama.cpp/tools/mtmd/clip.cpp llama.cpp/tools/grpc-server/clip.cpp
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -1,4 +1,4 @@
-// https://github.com/ggerganov/llama.cpp/blob/master/tools/server/utils.hpp
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
 #pragma once
@@ -11,7 +11,7 @@
 #include "json.hpp"
-#include "../mtmd/clip.h"
+#include "../llava/clip.h"
 using json = nlohmann::json;
--- a/backend/go/bark/Makefile
+++ b/backend/go/bark/Makefile
@@ -1,25 +0,0 @@
 # Builds libbark.a: the gobark.cpp wrapper object added on top of the
 # prebuilt bark.cpp static library and its ggml object files.
 INCLUDE_PATH := $(abspath ./)
 LIBRARY_PATH := $(abspath ./)
 AR?=ar
 BUILD_TYPE?=
 # C++ sources are compiled as C++17 (see -std=c++17 below)
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
 LDFLAGS  = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
 # warnings
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
 # Compile the C++ wrapper only (-c).
 # NOTE(review): $(LDFLAGS) is passed to a compile-only step, where no linking happens - confirm it is needed.
 gobark.o:
 	$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
 # Start from a copy of the prebuilt libbark.a, then append the wrapper object
 # and the ggml objects produced by the bark.cpp build.
 libbark.a: gobark.o
 	cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
 	$(AR) rcs libbark.a gobark.o
 	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
 	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
 	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
 # Remove the artifacts produced by this Makefile.
 clean:
 	rm -f gobark.o libbark.a
--- a/backend/go/bark/gobark.cpp
+++ b/backend/go/bark/gobark.cpp
@@ -1,85 +0,0 @@
 #include <iostream>
 #include <tuple>
 #include "bark.h"
 #include "gobark.h"
 #include "common.h"
 #include "ggml.h"
 struct bark_context *c;
 // Progress callback installed on the bark context: rewrites the current
 // console line with the percentage reached for the encoding step in progress.
 // Steps other than SEMANTIC/COARSE/FINE print nothing; stdout is always flushed.
 void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) {
     switch (step) {
         case bark_encoding_step::SEMANTIC:
             printf("\rGenerating semantic tokens... %d%%", progress);
             break;
         case bark_encoding_step::COARSE:
             printf("\rGenerating coarse tokens... %d%%", progress);
             break;
         case bark_encoding_step::FINE:
             printf("\rGenerating fine tokens... %d%%", progress);
             break;
         default:
             break;
     }
     fflush(stdout);
 }
 // Loads the bark model found at `model` and, on success, publishes the new
 // context through the file-level pointer `c`.
 // Returns 0 on success and 1 when bark_load_model yields no context (in which
 // case `c` is left untouched).
 int load_model(char *model) {
     bark_params params;
     params.model_path = model;
     // start from bark's default context parameters and hook in the console progress reporter
     struct bark_context_params ctx_params = bark_context_default_params();
     ctx_params.progress_callback_user_data = nullptr;
     ctx_params.progress_callback = bark_print_progress_callback;
     struct bark_context *loaded = bark_load_model(params.model_path.c_str(), ctx_params, params.seed);
     if (loaded == nullptr) {
         fprintf(stderr, "%s: Could not load model\n", __func__);
         return 1;
     }
     c = loaded;
     return 0;
 }
 // Generates audio for `text` with the context loaded by load_model, using
 // `threads` threads, and hands the samples to write_wav_on_disk with `dst`
 // as destination. Load/eval/total timings are printed to stdout afterwards.
 // Returns 0 on success, 1 when generation fails or no audio data is available.
 int tts(char *text,int  threads, char *dst ) {
     ggml_time_init();
     const int64_t started_us = ggml_time_us();
     // run the generation
     if (!bark_generate_audio(c, text, threads)) {
         fprintf(stderr, "%s: An error occured. If the problem persists, feel free to open an issue to report it.\n", __func__);
         return 1;
     }
     const float *samples = bark_get_audio_data(c);
     if (samples == NULL) {
         fprintf(stderr, "%s: Could not get audio data\n", __func__);
         return 1;
     }
     // copy the samples out of the context before writing them
     const int sample_count = bark_get_audio_data_size(c);
     std::vector<float> wav(samples, samples + sample_count);
     write_wav_on_disk(wav, dst);
     // timing report: the raw values are divided by 1000 and labelled "ms"
     const int64_t finished_us = ggml_time_us();
     const int64_t load_us = bark_get_load_time(c);
     const int64_t eval_us = bark_get_eval_time(c);
     printf("\n\n");
     printf("%s:     load time = %8.2f ms\n", __func__, load_us / 1000.0f);
     printf("%s:     eval time = %8.2f ms\n", __func__, eval_us / 1000.0f);
     printf("%s:    total time = %8.2f ms\n", __func__, (finished_us - started_us) / 1000.0f);
     return 0;
 }
 // Releases the bark context created by load_model, if any. Always returns 0.
 int unload() {
     if (c != nullptr) {
         bark_free(c);
         // forget the freed context so a second unload() cannot free it again
         c = nullptr;
     }
     // the function is declared int: flowing off the end without a value is undefined behaviour
     return 0;
 }
--- a/backend/go/bark/gobark.go
+++ b/backend/go/bark/gobark.go
@@ -1,52 +0,0 @@
 package main
 // #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
 // #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
 // #include <gobark.h>
 // #include <stdlib.h>
 import "C"
 import (
 	"fmt"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // Bark is the gRPC backend wrapper around the C functions declared in gobark.h.
 type Bark struct {
 	base.SingleThread
 	// threads is taken from ModelOptions.Threads in Load and passed to every TTS call.
 	threads int
 }
 // Load remembers the requested thread count and asks the C side to load the
 // model at opts.ModelFile. It returns an error when load_model reports a
 // non-zero status.
 func (sd *Bark) Load(opts *pb.ModelOptions) error {
 	sd.threads = int(opts.Threads)
 	modelFile := C.CString(opts.ModelFile)
 	defer C.free(unsafe.Pointer(modelFile))
 	if ret := C.load_model(modelFile); ret != 0 {
 		// This is a load failure, not an inference failure: say so and name the file.
 		return fmt.Errorf("could not load model %q", opts.ModelFile)
 	}
 	return nil
 }
 // TTS converts opts.Text and opts.Dst to C strings and calls the C tts
 // function with the thread count stored by Load. A non-zero status from C is
 // reported as an error.
 func (sd *Bark) TTS(opts *pb.TTSRequest) error {
 	text := C.CString(opts.Text)
 	defer C.free(unsafe.Pointer(text))
 	output := C.CString(opts.Dst)
 	defer C.free(unsafe.Pointer(output))
 	if rc := C.tts(text, C.int(sd.threads), output); rc != 0 {
 		return fmt.Errorf("inference failed")
 	}
 	return nil
 }
--- a/backend/go/bark/gobark.h
+++ b/backend/go/bark/gobark.h
@@ -1,8 +0,0 @@
 /* C-linkage entry points implemented in gobark.cpp and called from Go through cgo. */
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Loads the bark model at `model`. Returns 0 on success, 1 on failure. */
 int load_model(char *model);
 /* Generates audio for `text` using `threads` threads and writes it to `dst`.
    Returns 0 on success, 1 on failure. */
 int tts(char *text,int  threads, char *dst );
 #ifdef __cplusplus
 }
 #endif
--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -1,135 +0,0 @@
 INCLUDE_PATH := $(abspath ./)
 LIBRARY_PATH := $(abspath ./)
 AR?=ar
 CMAKE_ARGS?=
 BUILD_TYPE?=
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # C++ sources are compiled as C++17 (see -std=c++17 below)
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
 GOCMD?=go
 CGO_LDFLAGS?=
 # Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
 CGO_LDFLAGS_SYCL=
 GO_TAGS?=
 LD_FLAGS?=
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 # If build type is cublas, then we add -DSD_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
 	CMAKE_ARGS+=-DSD_CUDA=ON
 # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
 	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
 else ifeq ($(BUILD_TYPE),hipblas)
 	CMAKE_ARGS+=-DSD_HIPBLAS=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DSD_METAL=OFF
 	else
 		CMAKE_ARGS+=-DSD_METAL=ON
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		TARGET+=--target ggml-metal
 	endif
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
 	CMAKE_ARGS+=-DGGML_SYCL=ON \
 		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DSD_SYCL=ON \
 		-DGGML_SYCL_F16=ON
 	CC=icx
 	CXX=icpx
 	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
 	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
 	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
 	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
 endif
 ifeq ($(BUILD_TYPE),sycl_f32)
 	CMAKE_ARGS+=-DGGML_SYCL=ON \
 		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DSD_SYCL=ON
 	CC=icx
 	CXX=icpx
 	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
 	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
 	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
 	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
 endif
 # warnings
 # CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
 GGML_ARCHIVE_DIR := build/ggml/src/
 ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
 # Name of the single merged library
 COMBINED_LIB := libggmlall.a
 # Rule to merge all the .a files into one
 $(COMBINED_LIB): $(ALL_ARCHIVES)
 	@echo "Merging all .a into $(COMBINED_LIB)"
 	rm -f $@
 	mkdir -p merge-tmp
 	for a in $(ALL_ARCHIVES); do \
 		( cd merge-tmp && ar x ../$$a ); \
 	done
 	( cd merge-tmp && ar rcs ../$@ *.o )
 	# Ensure we have a proper index
 	ranlib $@
 	# Clean up
 	rm -rf merge-tmp
 build/libstable-diffusion.a:
 	@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
 	mkdir -p build && \
 	cd build && \
 	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
 	cmake --build . --config Release"
 else
 	mkdir -p build && \
 	cd build && \
 	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
 	cmake --build . --config Release
 endif
 	$(MAKE) $(COMBINED_LIB)
 gosd.o:
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
 else
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
 endif
 libsd.a: gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o
 stablediffusion-ggml:
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
 	CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
 ifneq ($(UPX),)
 	$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
 endif
 clean:
 	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
--- a/backend/go/image/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/image/stablediffusion-ggml/gosd.cpp
@@ -1,231 +0,0 @@
 #include <stdio.h>
 #include <string.h>
 #include <time.h>
 #include <iostream>
 #include <random>
 #include <string>
 #include <vector>
 #include "gosd.h"
 // #include "preprocessing.hpp"
 #include "flux.hpp"
 #include "stable-diffusion.h"
 #define STB_IMAGE_IMPLEMENTATION
 #define STB_IMAGE_STATIC
 #include "stb_image.h"
 #define STB_IMAGE_WRITE_IMPLEMENTATION
 #define STB_IMAGE_WRITE_STATIC
 #include "stb_image_write.h"
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
 #define STB_IMAGE_RESIZE_STATIC
 #include "stb_image_resize.h"
 // Names of the sampler methods, in the same order as enum sample_method in stable-diffusion.h.
 // load_model compares the "sampler" option against entries 0..N_SAMPLE_METHODS-1
 // and uses the matching index as the sample_method_t value, so this table
 // must have at least N_SAMPLE_METHODS entries.
 const char* sample_method_str[] = {
    "euler_a",
    "euler",
    "heun",
    "dpm2",
    "dpm++2s_a",
    "dpm++2m",
    "dpm++2mv2",
    "ipndm",
    "ipndm_v",
    "lcm",
    "ddim_trailing",
    "tcd",
 };
 // Names of the sigma schedule overrides, in the same order as sample_schedule in stable-diffusion.h.
 // load_model compares the "scheduler" option against entries 0..N_SCHEDULES-1
 // and uses the matching index as the schedule_t value, so this table must
 // have at least N_SCHEDULES entries.
 const char* schedule_str[] = {
    "default",
    "discrete",
    "karras",
    "exponential",
    "ays",
    "gits",
 };
 sd_ctx_t* sd_c;
 sample_method_t sample_method;
 // Creates the stable-diffusion context and stores it in the file-level
 // `sd_c`; the sampler selected through the options is stored in the
 // file-level `sample_method` for later use by gen_image.
 //
 //   model    path of the model file
 //   options  NULL-terminated array of "name:value" strings (a bare "name" is
 //            read as "name:true"). May be NULL. Entries are tokenized in
 //            place with strtok, so they must be writable.
 //            Recognized names: clip_l_path, clip_g_path, t5xxl_path,
 //            vae_path, scheduler, sampler.
 //   threads  thread count forwarded to new_sd_ctx
 //   diff     when 1, `model` is forwarded in the stableDiffusionModel
 //            argument of new_sd_ctx and the first (model) argument is empty
 //
 // Returns 0 on success and 1 when new_sd_ctx returns NULL.
 int load_model(char *model, char* options[], int threads, int diff) {
     fprintf (stderr, "Loading model!\n");
     // const char*: these point either at string literals or at caller-owned buffers
     const char *model_path = model;
     const char *stableDiffusionModel = "";
     if (diff == 1 ) {
         stableDiffusionModel = model;
         model_path = "";
     }
     // decode options. Options are in form optname:optvalue, or if booleans only optname.
     const char *clip_l_path  = "";
     const char *clip_g_path  = "";
     const char *t5xxl_path  = "";
     const char *vae_path  = "";
     const char *scheduler = "";
     const char *sampler = "";
     // A NULL array means "no options"; otherwise walk it up to the NULL terminator
     for (int i = 0; options != NULL && options[i] != NULL; i++) {
         char *optname = strtok(options[i], ":");
         if (optname == NULL) {
             // empty option string: strtok found no token, nothing to decode
             continue;
         }
         const char *optval = strtok(NULL, ":");
         if (optval == NULL) {
             optval = "true";
         }
         if (!strcmp(optname, "clip_l_path")) {
             clip_l_path = optval;
         }
         if (!strcmp(optname, "clip_g_path")) {
             clip_g_path = optval;
         }
         if (!strcmp(optname, "t5xxl_path")) {
             t5xxl_path = optval;
         }
         if (!strcmp(optname, "vae_path")) {
             vae_path = optval;
         }
         if (!strcmp(optname, "scheduler")) {
             scheduler = optval;
         }
         if (!strcmp(optname, "sampler")) {
             sampler = optval;
         }
     }
     // map the sampler name to its enum index, falling back to EULER_A
     int sample_method_found = -1;
     for (int m = 0; m < N_SAMPLE_METHODS; m++) {
         if (!strcmp(sampler, sample_method_str[m])) {
             sample_method_found = m;
         }
     }
     if (sample_method_found == -1) {
         fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
         sample_method_found = EULER_A;
     }
     sample_method = (sample_method_t)sample_method_found;
     // map the scheduler name to its enum index, falling back to DEFAULT
     int schedule_found = -1;
     for (int d = 0; d < N_SCHEDULES; d++) {
         if (!strcmp(scheduler, schedule_str[d])) {
             schedule_found = d;
             fprintf (stderr, "Found scheduler: %s\n", scheduler);
         }
     }
     if (schedule_found == -1) {
         fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
         schedule_found = DEFAULT;
     }
     schedule_t schedule = (schedule_t)schedule_found;
     fprintf (stderr, "Creating context\n");
     sd_ctx_t* sd_ctx = new_sd_ctx(model_path,
                                   clip_l_path,
                                   clip_g_path,
                                   t5xxl_path,
                                   stableDiffusionModel,
                                   vae_path,
                                   "",
                                   "",
                                   "",
                                   "",
                                   "",
                                   false,
                                   false,
                                   false,
                                   threads,
                                   SD_TYPE_COUNT,
                                   STD_DEFAULT_RNG,
                                   schedule,
                                   false,
                                   false,
                                   false,
                                   false);
     if (sd_ctx == NULL) {
         fprintf (stderr, "failed loading model (generic error)\n");
         return 1;
     }
     fprintf (stderr, "Created context: OK\n");
     sd_c = sd_ctx;
     return 0;
 }
 // Runs txt2img on the context created by load_model and writes the first
 // resulting image to `dst` as a PNG.
 //
 //   text, negativeText  positive and negative prompts
 //   width, height       requested image size
 //   steps, seed         forwarded to txt2img
 //   dst                 path of the PNG file to write
 //   cfg_scale           forwarded to txt2img
 //
 // Returns 0 on success, 1 when txt2img yields no result or no pixel data, or
 // when the PNG cannot be written.
 int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) {
     sd_image_t* results;
     std::vector<int> skip_layers = {7, 8, 9};
     fprintf (stderr, "Generating image\n");
     results = txt2img(sd_c,
                       text,
                       negativeText,
                       -1, // clip_skip
                       cfg_scale, // cfg_scale
                       3.5f,
                       0, // eta
                       width,
                       height,
                       sample_method,
                       steps,
                       seed,
                       1,
                       NULL,
                       0.9f,
                       20.f,
                       false,
                       "",
                       skip_layers.data(),
                       skip_layers.size(),
                       0,
                       0.01,
                       0.2);
     if (results == NULL) {
         fprintf (stderr, "NO results\n");
         return 1;
     }
     if (results[0].data == NULL) {
         fprintf (stderr, "Results with no data\n");
         // the result array itself was still allocated: release it before bailing out
         free(results);
         return 1;
     }
     fprintf (stderr, "Writing PNG\n");
     fprintf (stderr, "DST: %s\n", dst);
     fprintf (stderr, "Width: %d\n", results[0].width);
     fprintf (stderr, "Height: %d\n", results[0].height);
     fprintf (stderr, "Channel: %d\n", results[0].channel);
     fprintf (stderr, "Data: %p\n", results[0].data);
     // stbi_write_png returns 0 on failure: do not claim success in that case
     int status = 0;
     if (stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
                        results[0].data, 0, NULL) == 0) {
         fprintf (stderr, "Failed writing image to '%s'\n", dst);
         status = 1;
     } else {
         fprintf (stderr, "Saved resulting image to '%s'\n", dst);
     }
     // release the pixel buffer and the result array on both outcomes
     free(results[0].data);
     results[0].data = NULL;
     free(results);
     fprintf (stderr, "gen_image is done\n");
     return status;
 }
 // Releases the stable-diffusion context created by load_model, if any.
 // Always returns 0.
 int unload() {
     if (sd_c != NULL) {
         free_sd_ctx(sd_c);
         // forget the freed context so a second unload() cannot free it again
         sd_c = NULL;
     }
     // the function is declared int: flowing off the end without a value is undefined behaviour
     return 0;
 }
--- a/backend/go/image/stablediffusion-ggml/gosd.go
+++ b/backend/go/image/stablediffusion-ggml/gosd.go
@@ -1,96 +0,0 @@
 package main
 // #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
 // #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
 // #include <gosd.h>
 // #include <stdlib.h>
 import "C"
 import (
 	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
 	"unsafe"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/utils"
 )
 // SDGGML is the gRPC backend wrapper around the C functions declared in gosd.h.
 type SDGGML struct {
 	base.SingleThread
 	// threads is taken from ModelOptions.Threads in Load.
 	threads      int
 	// sampleMethod is not read or written by the methods visible in this file.
 	sampleMethod string
 	// cfgScale is taken from ModelOptions.CFGScale in Load and passed to gen_image.
 	cfgScale     float32
 }
 // Load prepares the option list for the C side and calls load_model.
 //
 // The option "diffusion_model" is consumed here and turned into the
 // diffusionModel flag. Options of the form "name:value" whose text contains
 // "path" have their value joined onto opts.ModelPath and are dropped when
 // utils.VerifyPath rejects the result. Every other option is forwarded as is.
 //
 // It returns an error when load_model reports a non-zero status.
 func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
 	sd.threads = int(opts.Threads)
 	sd.cfgScale = opts.CFGScale
 	modelFile := C.CString(opts.ModelFile)
 	defer C.free(unsafe.Pointer(modelFile))
 	var diffusionModel int
 	var oo []string
 	for _, op := range opts.Options {
 		if op == "diffusion_model" {
 			diffusionModel = 1
 			continue
 		}
 		// If it's an option path, we resolve absolute path from the model path
 		if strings.Contains(op, ":") && strings.Contains(op, "path") {
 			data := strings.Split(op, ":")
 			data[1] = filepath.Join(opts.ModelPath, data[1])
 			if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
 				oo = append(oo, strings.Join(data, ":"))
 			}
 		} else {
 			oo = append(oo, op)
 		}
 	}
 	fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
 	// Build the C array from the options actually forwarded, plus one slot for
 	// the NULL terminator: load_model walks the array until it finds NULL.
 	count := len(oo)
 	size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
 	options := (**C.char)(C.malloc(C.size_t(count+1) * size))
 	// load_model only reads the array during the call, so it can be released on return.
 	defer C.free(unsafe.Pointer(options))
 	view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[: count+1 : count+1]
 	for i, x := range oo {
 		// NOTE(review): these strings are not freed because load_model hands pointers
 		// into them to new_sd_ctx; free them if new_sd_ctx is confirmed to copy.
 		view[i] = C.CString(x)
 	}
 	view[count] = nil
 	ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel))
 	if ret != 0 {
 		return fmt.Errorf("could not load model")
 	}
 	return nil
 }
 // GenerateImage converts the prompts and the destination path to C strings
 // and calls gen_image with the request's size, step and seed values plus the
 // cfg scale stored by Load. A non-zero status from C is reported as an error.
 func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
 	prompt := C.CString(opts.PositivePrompt)
 	defer C.free(unsafe.Pointer(prompt))
 	output := C.CString(opts.Dst)
 	defer C.free(unsafe.Pointer(output))
 	negPrompt := C.CString(opts.NegativePrompt)
 	defer C.free(unsafe.Pointer(negPrompt))
 	width, height := C.int(opts.Width), C.int(opts.Height)
 	steps, seed := C.int(opts.Step), C.int(opts.Seed)
 	if rc := C.gen_image(prompt, negPrompt, width, height, steps, seed, output, C.float(sd.cfgScale)); rc != 0 {
 		return fmt.Errorf("inference failed")
 	}
 	return nil
 }
--- a/backend/go/image/stablediffusion-ggml/gosd.h
+++ b/backend/go/image/stablediffusion-ggml/gosd.h
@@ -1,8 +0,0 @@
 /* C-linkage entry points implemented in gosd.cpp and called from Go through cgo. */
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Creates the stable-diffusion context for `model`. `options` is an array of
    "name:value" strings that the implementation walks until it finds NULL.
    Returns 0 on success, 1 on failure. */
 int load_model(char *model, char* options[], int threads, int diffusionModel);
 /* Generates an image for the given prompts and writes it as a PNG to `dst`.
    Returns 0 on success, 1 on failure. */
 int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale);
 #ifdef __cplusplus
 }
 #endif
--- a/backend/go/image/stablediffusion/main.go
+++ b/backend/go/image/stablediffusion/main.go
@@ -1,6 +1,7 @@
 package main
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
@@ -14,7 +15,7 @@ var (
 func main() {
 	flag.Parse()
-	if err := grpc.StartServer(*addr, &Bark{}); err != nil {
+	if err := grpc.StartServer(*addr, &Image{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/image/stablediffusion/stablediffusion.go
+++ b/backend/go/image/stablediffusion/stablediffusion.go
@@ -0,0 +1,33 @@
 package main
 // This is a wrapper to satisfy the gRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/stablediffusion"
 )
 // Image is the gRPC backend wrapper around pkg/stablediffusion.
 type Image struct {
 	base.SingleThread
 	// stablediffusion is assigned by Load and used by GenerateImage.
 	stablediffusion *stablediffusion.StableDiffusion
 }
 // Load creates the stable-diffusion instance for opts.ModelFile and stores it
 // on the receiver. The error from stablediffusion.New is returned unchanged.
 func (image *Image) Load(opts *pb.ModelOptions) error {
 	// ModelFile is expected to be a directory holding the model files.
 	sd, err := stablediffusion.New(opts.ModelFile)
 	image.stablediffusion = sd
 	return err
 }
 // GenerateImage forwards the request to the stable-diffusion instance created
 // by Load. Numeric fields are converted to int and passed in the order
 // height, width, mode, step, seed, followed by the two prompts and the
 // destination path.
 func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
 	height, width := int(opts.Height), int(opts.Width)
 	mode, step, seed := int(opts.Mode), int(opts.Step), int(opts.Seed)
 	return image.stablediffusion.GenerateImage(
 		height, width, mode, step, seed,
 		opts.PositivePrompt, opts.NegativePrompt, opts.Dst)
 }
--- a/backend/go/image/stablediffusion-ggml/main.go
+++ b/backend/go/image/stablediffusion-ggml/main.go
@@ -1,6 +1,7 @@
 package main
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
@@ -14,7 +15,7 @@ var (
 func main() {
 	flag.Parse()
-	if err := grpc.StartServer(*addr, &SDGGML{}); err != nil {
+	if err := grpc.StartServer(*addr, &Image{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/image/tinydream/tinydream.go
+++ b/backend/go/image/tinydream/tinydream.go
@@ -0,0 +1,32 @@
 package main
 // This is a wrapper to satisfy the gRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/tinydream"
 )
 // Image is the gRPC backend wrapper around pkg/tinydream.
 type Image struct {
 	base.SingleThread
 	// tinydream is assigned by Load and used by GenerateImage.
 	tinydream *tinydream.TinyDream
 }
 // Load creates the tinydream instance for opts.ModelFile and stores it on the
 // receiver. The error from tinydream.New is returned unchanged.
 func (image *Image) Load(opts *pb.ModelOptions) error {
 	// ModelFile is expected to be a directory holding the model files.
 	td, err := tinydream.New(opts.ModelFile)
 	image.tinydream = td
 	return err
 }
 // GenerateImage forwards the request to the tinydream instance created by
 // Load. Numeric fields are converted to int and passed in the order height,
 // width, step, seed, followed by the two prompts and the destination path.
 func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
 	height, width := int(opts.Height), int(opts.Width)
 	step, seed := int(opts.Step), int(opts.Seed)
 	return image.tinydream.GenerateImage(
 		height, width, step, seed,
 		opts.PositivePrompt, opts.NegativePrompt, opts.Dst)
 }
--- a/backend/go/llm/bert/bert.go
+++ b/backend/go/llm/bert/bert.go
@@ -0,0 +1,34 @@
 package main
 // This is a wrapper to satisfy the gRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	bert "github.com/go-skynet/go-bert.cpp"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // Embeddings is the gRPC backend wrapper around go-bert.cpp.
 type Embeddings struct {
 	base.SingleThread
 	// bert is assigned by Load and used by the Embeddings method.
 	bert *bert.Bert
 }
 // Load creates the bert model for opts.ModelFile and stores it on the
 // receiver. The error from bert.New is returned unchanged.
 func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
 	var err error
 	llm.bert, err = bert.New(opts.ModelFile)
 	return err
 }
 // Embeddings computes an embedding vector with the model created by Load.
 // When opts.EmbeddingTokens is non-empty the tokens are embedded directly;
 // otherwise the text in opts.Embeddings is used. opts.Threads is forwarded in
 // both cases.
 func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
 	threads := bert.SetThreads(int(opts.Threads))
 	if len(opts.EmbeddingTokens) == 0 {
 		return llm.bert.Embeddings(opts.Embeddings, threads)
 	}
 	// the binding takes []int, the request carries a narrower integer type
 	ids := make([]int, len(opts.EmbeddingTokens))
 	for i, tok := range opts.EmbeddingTokens {
 		ids[i] = int(tok)
 	}
 	return llm.bert.TokenEmbeddings(ids, threads)
 }
--- a/backend/go/llm/bert/main.go
+++ b/backend/go/llm/bert/main.go
@@ -0,0 +1,21 @@
 package main
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 // main parses the command-line flags and starts the gRPC server for the
 // Embeddings backend on *addr. It panics if StartServer returns an error.
 func main() {
 	flag.Parse()
 	err := grpc.StartServer(*addr, &Embeddings{})
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/llm/llama-ggml/llama.go
+++ b/backend/go/llm/llama-ggml/llama.go
@@ -0,0 +1,204 @@
 package main
 // This is a wrapper to satisfy the gRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
 	"github.com/go-skynet/go-llama.cpp"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // LLM is the gRPC backend wrapper around go-llama.cpp.
 type LLM struct {
 	base.SingleThread
 	// llama is assigned by Load and used by Predict, PredictStream and Embeddings.
 	llama *llama.LLama
 }
 // Load translates the gRPC model options into go-llama.cpp model options and
 // loads opts.ModelFile. The value returned by llama.New is stored on the
 // receiver and its error is returned unchanged.
 func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	// RoPE values used when the request leaves the fields at zero.
 	freqBase, freqScale := float32(10000), float32(1)
 	if opts.RopeFreqBase != 0 {
 		freqBase = opts.RopeFreqBase
 	}
 	if opts.RopeFreqScale != 0 {
 		freqScale = opts.RopeFreqScale
 	}
 	llamaOpts := []llama.ModelOption{
 		llama.WithRopeFreqBase(freqBase),
 		llama.WithRopeFreqScale(freqScale),
 	}
 	// Optional settings are only forwarded when the request sets them.
 	if opts.NGQA != 0 {
 		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
 	}
 	if opts.RMSNormEps != 0 {
 		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
 	}
 	if opts.ContextSize != 0 {
 		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
 	}
 	if opts.F16Memory {
 		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
 	}
 	if opts.Embeddings {
 		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
 	}
 	if opts.NGPULayers != 0 {
 		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
 	}
 	// Batch size falls back to 512 when the request does not specify one.
 	batch := 512
 	if opts.NBatch != 0 {
 		batch = int(opts.NBatch)
 	}
 	llamaOpts = append(llamaOpts,
 		llama.SetMMap(opts.MMap),
 		llama.SetMainGPU(opts.MainGPU),
 		llama.SetTensorSplit(opts.TensorSplit),
 		llama.SetNBatch(batch),
 	)
 	if opts.NUMA {
 		llamaOpts = append(llamaOpts, llama.EnableNUMA)
 	}
 	if opts.LowVRAM {
 		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
 	}
 	loaded, err := llama.New(opts.ModelFile, llamaOpts...)
 	llm.llama = loaded
 	return err
 }
 // buildPredictOptions translates the gRPC predict options into the
 // go-llama.cpp option list shared by Predict, PredictStream and Embeddings.
 // Options are appended in a fixed order; zero-valued optional fields are
 // skipped so the binding keeps its own defaults for them.
 func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
 	// RoPE values used when the request leaves the fields at zero.
 	freqBase, freqScale := float32(10000), float32(1)
 	if opts.RopeFreqBase != 0 {
 		freqBase = opts.RopeFreqBase
 	}
 	if opts.RopeFreqScale != 0 {
 		freqScale = opts.RopeFreqScale
 	}
 	predictOptions := []llama.PredictOption{
 		llama.SetTemperature(opts.Temperature),
 		llama.SetTopP(opts.TopP),
 		llama.SetTopK(int(opts.TopK)),
 		llama.SetTokens(int(opts.Tokens)),
 		llama.SetThreads(int(opts.Threads)),
 		llama.WithGrammar(opts.Grammar),
 		llama.SetRopeFreqBase(freqBase),
 		llama.SetRopeFreqScale(freqScale),
 		llama.SetNegativePromptScale(opts.NegativePromptScale),
 		llama.SetNegativePrompt(opts.NegativePrompt),
 	}
 	add := func(o ...llama.PredictOption) {
 		predictOptions = append(predictOptions, o...)
 	}
 	// Prompt cache settings.
 	if opts.PromptCacheAll {
 		add(llama.EnablePromptCacheAll)
 	}
 	if opts.PromptCacheRO {
 		add(llama.EnablePromptCacheRO)
 	}
 	// Expected absolute path
 	if opts.PromptCachePath != "" {
 		add(llama.SetPathPromptCache(opts.PromptCachePath))
 	}
 	// Mirostat sampling settings.
 	if opts.Mirostat != 0 {
 		add(llama.SetMirostat(int(opts.Mirostat)))
 	}
 	if opts.MirostatETA != 0 {
 		add(llama.SetMirostatETA(opts.MirostatETA))
 	}
 	if opts.MirostatTAU != 0 {
 		add(llama.SetMirostatTAU(opts.MirostatTAU))
 	}
 	if opts.Debug {
 		add(llama.Debug)
 	}
 	add(llama.SetStopWords(opts.StopPrompts...))
 	// NOTE(review): PresencePenalty is forwarded through SetPenalty - confirm this is the intended mapping.
 	if opts.PresencePenalty != 0 {
 		add(llama.SetPenalty(opts.PresencePenalty))
 	}
 	if opts.NKeep != 0 {
 		add(llama.SetNKeep(int(opts.NKeep)))
 	}
 	if opts.Batch != 0 {
 		add(llama.SetBatch(int(opts.Batch)))
 	}
 	if opts.F16KV {
 		add(llama.EnableF16KV)
 	}
 	if opts.IgnoreEOS {
 		add(llama.IgnoreEOS)
 	}
 	if opts.Seed != 0 {
 		add(llama.SetSeed(int(opts.Seed)))
 	}
 	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
 	// Always forwarded, whatever their value.
 	add(
 		llama.SetFrequencyPenalty(opts.FrequencyPenalty),
 		llama.SetMlock(opts.MLock),
 		llama.SetMemoryMap(opts.MMap),
 		llama.SetPredictionMainGPU(opts.MainGPU),
 		llama.SetPredictionTensorSplit(opts.TensorSplit),
 		llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ),
 		llama.SetTypicalP(opts.TypicalP),
 	)
 	return predictOptions
 }
 // Predict runs a blocking completion for opts.Prompt with the options derived
 // from the request, returning the binding's result and error unchanged.
 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	predictOptions := buildPredictOptions(opts)
 	return llm.llama.Predict(opts.Prompt, predictOptions...)
 }
 // PredictStream starts a completion for opts.Prompt in a new goroutine and
 // returns nil immediately. Each generated token is sent on results; the
 // channel is closed once the prediction returns. A prediction error is only
 // printed, it is not propagated to the caller.
 func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	// Returning true asks the binding to keep generating.
 	onToken := func(token string) bool {
 		results <- token
 		return true
 	}
 	predictOptions := append(buildPredictOptions(opts), llama.SetTokenCallback(onToken))
 	go func() {
 		if _, err := llm.llama.Predict(opts.Prompt, predictOptions...); err != nil {
 			fmt.Println("err: ", err)
 		}
 		close(results)
 	}()
 	return nil
 }
 // Embeddings computes an embedding vector with the model created by Load.
 // When opts.EmbeddingTokens is non-empty the tokens are embedded directly;
 // otherwise the text in opts.Embeddings is used.
 func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
 	predictOptions := buildPredictOptions(opts)
 	if len(opts.EmbeddingTokens) == 0 {
 		return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
 	}
 	// the binding takes []int, the request carries a narrower integer type
 	ids := make([]int, len(opts.EmbeddingTokens))
 	for i, tok := range opts.EmbeddingTokens {
 		ids[i] = int(tok)
 	}
 	return llm.llama.TokenEmbeddings(ids, predictOptions...)
 }
--- a/backend/go/llm/llama-ggml/main.go
+++ b/backend/go/llm/llama-ggml/main.go
@@ -0,0 +1,19 @@
 package main
 import (
 	"flag"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 // main parses the command-line flags and starts the gRPC server for the LLM
 // backend on *addr. It panics if StartServer returns an error.
 func main() {
 	flag.Parse()
 	err := grpc.StartServer(*addr, &LLM{})
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/vad/silero/main.go
+++ b/backend/go/vad/silero/main.go
@@ -15,7 +15,7 @@ var (
 func main() {
 	flag.Parse()
-	if err := grpc.StartServer(*addr, &VAD{}); err != nil {
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/llm/rwkv/rwkv.go
+++ b/backend/go/llm/rwkv/rwkv.go
@@ -0,0 +1,95 @@
 package main
 // This is a wrapper to satisfy the gRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
 	"path/filepath"
 	"github.com/donomii/go-rwkv.cpp"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 const tokenizerSuffix = ".tokenizer.json"
 // LLM is the gRPC backend wrapper around go-rwkv.cpp.
 type LLM struct {
 	base.SingleThread
 	// rwkv is assigned by Load and used by Predict, PredictStream and TokenizeString.
 	rwkv *rwkv.RwkvState
 }
 // Load loads the rwkv model at opts.ModelFile together with its tokenizer.
 // The tokenizer file name comes from opts.Tokenizer or, when that is empty,
 // is the model's base name followed by tokenizerSuffix; it is looked up in
 // the model's directory. An error is returned when LoadFiles yields nil.
 func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	tokenizerFile := opts.Tokenizer
 	if tokenizerFile == "" {
 		tokenizerFile = filepath.Base(opts.ModelFile) + tokenizerSuffix
 	}
 	tokenizerPath := filepath.Join(filepath.Dir(opts.ModelFile), tokenizerFile)
 	if model := rwkv.LoadFiles(opts.ModelFile, tokenizerPath, uint32(opts.GetThreads())); model != nil {
 		llm.rwkv = model
 		return nil
 	}
 	return fmt.Errorf("rwkv could not load model")
 }
 // Predict feeds opts.Prompt to the model and returns the generated response.
 // Generation stops at the first entry of opts.StopPrompts, or at a newline
 // when no stop prompt is given. An error from ProcessInput is returned with
 // an empty string.
 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	stop := "\n"
 	if len(opts.StopPrompts) > 0 {
 		stop = opts.StopPrompts[0]
 	}
 	err := llm.rwkv.ProcessInput(opts.Prompt)
 	if err != nil {
 		return "", err
 	}
 	// nil callback: the whole response is returned at once
 	return llm.rwkv.GenerateResponse(int(opts.Tokens), stop, float32(opts.Temperature), float32(opts.TopP), nil), nil
 }
 // PredictStream starts generation for opts.Prompt in a new goroutine and
 // returns nil immediately. Each generated piece is sent on results, and the
 // channel is closed when the goroutine exits. Generation stops at the first
 // entry of opts.StopPrompts, or at a newline when no stop prompt is given.
 // A ProcessInput error is only printed, it is not propagated to the caller.
 func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	go func() {
 		// Close on every exit path, including the early return below; otherwise
 		// the consumer of results would block forever when the input is rejected.
 		defer close(results)
 		stopWord := "\n"
 		if len(opts.StopPrompts) > 0 {
 			stopWord = opts.StopPrompts[0]
 		}
 		if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
 			fmt.Println("Error processing input: ", err)
 			return
 		}
 		llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), func(s string) bool {
 			results <- s
 			// returning true asks the binding to keep generating
 			return true
 		})
 	}()
 	return nil
 }
 // TokenizeString encodes opts.Prompt with the model's tokenizer and returns
 // the token IDs as int32 values together with their count. An encoding error
 // is returned with an empty response.
 func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
 	encoded, err := llm.rwkv.Tokenizer.Encode(opts.Prompt)
 	if err != nil {
 		return pb.TokenizationResponse{}, err
 	}
 	ids := make([]int32, 0, len(encoded))
 	for _, tok := range encoded {
 		ids = append(ids, int32(tok.ID))
 	}
 	return pb.TokenizationResponse{
 		Length: int32(len(encoded)),
 		Tokens: ids,
 	}, nil
 }
--- a/backend/go/stores/store.go
+++ b/backend/go/stores/store.go
@@ -311,16 +311,12 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
 }
 func isNormalized(k []float32) bool {
-	var sum float64
+	var sum float32
 	for _, v := range k {
-		v64 := float64(v)
+		sum += v
 		sum += v64*v64
 	}
-	s := math.Sqrt(sum)
+	return sum == 1.0
 	return s >= 0.99 && s <= 1.01
 }
 // TODO: This we could replace with handwritten SIMD code
@@ -332,7 +328,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
 		dot += k1[i] * k2[i]
 	}
-	assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
+	assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
 	// 2.0 * (1.0 - dot) would be the Euclidean distance
 	return dot
@@ -422,7 +418,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
 	sim := float32(dot / (mag1 * math.Sqrt(mag2)))
-	assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
+	assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
 	return sim
 }
--- a/backend/go/transcribe/whisper/whisper.go
+++ b/backend/go/transcribe/whisper/whisper.go
@@ -74,7 +74,7 @@ func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.Transcript
 		context.SetTranslate(true)
 	}
-	if err := context.Process(data, nil, nil, nil); err != nil {
+	if err := context.Process(data, nil, nil); err != nil {
 		return pb.TranscriptResult{}, err
 	}
--- a/backend/go/vad/silero/vad.go
+++ b/backend/go/vad/silero/vad.go
@@ -1,54 +0,0 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/streamer45/silero-vad-go/speech"
 )
 // VAD is the silero voice-activity-detection backend. It embeds
 // base.SingleThread and holds the speech detector created by Load.
 type VAD struct {
 	base.SingleThread
 	detector *speech.Detector // set by Load; used by the VAD method
 }
 // Load creates the silero speech detector for the model at opts.ModelFile and
 // stores it on vad. The detector is configured with a sample rate of 16000, a
 // threshold of 0.5 and zero for both MinSilenceDurationMs and SpeechPadMs.
 // An error from speech.NewDetector is wrapped and returned.
 func (vad *VAD) Load(opts *pb.ModelOptions) error {
 	cfg := speech.DetectorConfig{
 		ModelPath:  opts.ModelFile,
 		SampleRate: 16000,
 		//WindowSize:           1024,
 		Threshold:            0.5,
 		MinSilenceDurationMs: 0,
 		SpeechPadMs:          0,
 	}
 	detector, err := speech.NewDetector(cfg)
 	if err != nil {
 		return fmt.Errorf("create silero detector: %w", err)
 	}
 	vad.detector = detector
 	return nil
 }
 // VAD runs the detector over req.Audio and returns one segment per detected
 // speech span, carrying its start and end converted to float32. A detector
 // error is wrapped and returned together with an empty response.
 func (vad *VAD) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
 	segments, err := vad.detector.Detect(req.Audio)
 	if err != nil {
 		return pb.VADResponse{}, fmt.Errorf("detect: %w", err)
 	}
 	// The result holds exactly one entry per detected segment, so size the
 	// slice once instead of growing it on every append. It stays non-nil
 	// even when nothing was detected.
 	vadSegments := make([]*pb.VADSegment, 0, len(segments))
 	for _, s := range segments {
 		vadSegments = append(vadSegments, &pb.VADSegment{
 			Start: float32(s.SpeechStartAt),
 			End:   float32(s.SpeechEndAt),
 		})
 	}
 	return pb.VADResponse{
 		Segments: vadSegments,
 	}, nil
 }
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -1,9 +1,6 @@
-.DEFAULT_GOAL := install
+.PHONY: autogptq
-
+autogptq: protogen
 .PHONY: install
 install:
 	bash install.sh
 	$(MAKE) protogen
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
@@ -13,7 +10,7 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
-	bash protogen.sh
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 .PHONY: clean
 clean: protogen-clean
--- a/backend/python/autogptq/README.md
+++ b/backend/python/autogptq/README.md
@@ -0,0 +1,5 @@
 # Creating a separate environment for the autogptq project
 ```
 make autogptq
 ```
--- a/backend/python/autogptq/backend.py
+++ b/backend/python/autogptq/backend.py
@@ -0,0 +1,153 @@
 #!/usr/bin/env python3
 from concurrent import futures
 import argparse
 import signal
 import sys
 import os
 import time
 import base64
 import grpc
 import backend_pb2
 import backend_pb2_grpc
 from auto_gptq import AutoGPTQForCausalLM
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers import TextGenerationPipeline
 # Sleep interval used by serve() to keep the main thread parked while the server runs
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # Size of the gRPC thread pool: taken from the PYTHON_GRPC_MAX_WORKERS environment
 # variable when it is set, otherwise 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    gRPC servicer that serves text generation from an AutoGPTQ or Qwen-VL model.

    LoadModel stores the model, the tokenizer and the model name on the
    instance; Predict and PredictStream use them to answer requests.
    """
    # Default until LoadModel succeeds, so recompile_vl_prompt can always read it.
    model_name = ""
    def Health(self, request, context):
        """Return a Reply whose message is the bytes b"OK"."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        Load the tokenizer and model named by request.Model.

        The model path is request.Model joined onto the MODELS_PATH environment
        variable (default './'). Models whose name contains "qwen-vl" are loaded
        with AutoModelForCausalLM; every other model is loaded with
        AutoGPTQForCausalLM.from_quantized on request.Device (default "cuda:0").

        Returns a Result with success=True, or success=False and the exception
        text when anything raises while loading.
        """
        try:
            device = "cuda:0"
            if request.Device != "":
                device = request.Device
            # support loading local model files
            model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
            # support model `Qwen/Qwen-VL-Chat-Int4`
            if "qwen-vl" in request.Model.lower():
                model_name = "Qwen-VL-Chat"
                model = AutoModelForCausalLM.from_pretrained(model_path, 
                    trust_remote_code=request.TrustRemoteCode,
                    device_map="auto").eval()
            else:
                # Record a name for non Qwen-VL models too; recompile_vl_prompt
                # reads self.model_name for every request.
                model_name = request.Model
                model = AutoGPTQForCausalLM.from_quantized(model_path,
                    model_basename=request.ModelBaseName,
                    use_safetensors=True,
                    trust_remote_code=request.TrustRemoteCode,
                    device=device,
                    use_triton=request.UseTriton,
                    quantize_config=None)
            # Publish all three together, only after loading succeeded.
            self.model = model
            self.tokenizer = tokenizer
            self.model_name = model_name
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def Predict(self, request, context):
        """
        Generate text for request.Prompt and return it UTF-8 encoded.

        Zero-valued request fields fall back to defaults: Penalty 1.0,
        Tokens 512, TopP 0.95. Temperature is passed through unchanged. The
        prompt is removed from the generated text when it appears in it.
        Image files written for the prompt are deleted before returning, also
        when generation raises.
        """
        penalty = 1.0
        if request.Penalty != 0.0:
            penalty = request.Penalty
        tokens = 512
        if request.Tokens != 0:
            tokens = request.Tokens
        top_p = 0.95
        if request.TopP != 0.0:
            top_p = request.TopP
        compiled_prompt, image_paths = self.recompile_vl_prompt(request)
        print(f"Prompt: {compiled_prompt}", file=sys.stderr)
        try:
            # Implement Predict RPC
            pipeline = TextGenerationPipeline(
                model=self.model, 
                tokenizer=self.tokenizer,
                max_new_tokens=tokens,
                temperature=request.Temperature,
                top_p=top_p,
                repetition_penalty=penalty,
                )
            t = pipeline(compiled_prompt)[0]["generated_text"]
            print(f"generated_text: {t}", file=sys.stderr)
            if compiled_prompt in t:
                t = t.replace(compiled_prompt, "")
        finally:
            # house keeping. Remove the image files written for this request,
            # even when generation failed, so they do not pile up.
            for img_path in image_paths:
                try:
                    os.remove(img_path)
                except Exception as e:
                    print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
        # NOTE(review): Health answers with backend_pb2.Reply for a bytes payload
        # while this uses backend_pb2.Result - confirm against backend.proto.
        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
    def PredictStream(self, request, context):
        """
        Streaming variant of Predict.

        Incremental streaming is not implemented yet: the complete Predict
        result is yielded as the single item of the stream.
        """
        # A response-streaming handler has to give gRPC an iterable of messages;
        # returning the bare message object would not be iterable.
        yield self.Predict(request, context)
    def recompile_vl_prompt(self, request):
        """
        Build the prompt for the loaded model and write request images to disk.

        For Qwen-VL models each entry of request.Images (base64) is decoded into
        its own uniquely named .jpg file under /tmp, and the "[img-N]"
        placeholder for that image is replaced in the prompt by
        "<img>PATH</img>,". For any other model the prompt is returned unchanged
        and no files are written.

        Returns a (prompt, image_paths) tuple; the caller deletes the files.
        """
        import tempfile
        prompt = request.Prompt
        image_paths = []
        if "qwen-vl" in self.model_name.lower():
            for i, img in enumerate(request.Images):
                # A timestamp is not unique enough: two images handled within the
                # same millisecond would overwrite each other. Let tempfile pick
                # an unused name and create the file exclusively.
                with tempfile.NamedTemporaryFile(prefix="vl-", suffix=".jpg", dir="/tmp", delete=False) as f:
                    f.write(base64.b64decode(img))
                    img_path = f.name
                image_paths.append(img_path)
                prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
        return (prompt, image_paths)
 def serve(address):
    """
    Start the gRPC backend server on `address` and block until interrupted.

    The server runs on a thread pool of MAX_WORKERS workers and listens on an
    insecure port. SIGINT and SIGTERM stop the server and exit the process
    with status 0.
    """
    executor = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    server = grpc.server(executor)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    def _shutdown(signum, frame):
        # Stop the server, then leave the process
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)
    # Route both SIGINT and SIGTERM to the same shutdown handler
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _shutdown)
    # Keep the main thread alive; the server works on its own threads
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
 if __name__ == "__main__":
    # Command-line entry point: serve on --addr (localhost:50051 when omitted)
    cli = argparse.ArgumentParser(description="Run the gRPC server.")
    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    serve(cli.parse_args().addr)
--- a/backend/python/faster-whisper/install.sh
+++ b/backend/python/faster-whisper/install.sh
--- a/backend/python/autogptq/requirements-cublas11.txt
+++ b/backend/python/autogptq/requirements-cublas11.txt
@@ -1,3 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.4.1+cu118
+torch
 transformers
--- a/backend/python/autogptq/requirements-cublas12.txt
+++ b/backend/python/autogptq/requirements-cublas12.txt
@@ -0,0 +1 @@
 torch
--- a/backend/python/faster-whisper/requirements-hipblas.txt
+++ b/backend/python/faster-whisper/requirements-hipblas.txt
@@ -1,3 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
+torch
 faster-whisper
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -0,0 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch
 torch
 optimum[openvino]
 setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -0,0 +1,6 @@
 accelerate
 auto-gptq==0.7.1
 grpcio==1.66.2
 protobuf
 certifi
 transformers
--- a/backend/python/faster-whisper/run.sh
+++ b/backend/python/faster-whisper/run.sh
--- a/backend/python/faster-whisper/test.sh
+++ b/backend/python/faster-whisper/test.sh
--- a/backend/python/bark/backend.py
+++ b/backend/python/bark/backend.py
@@ -61,12 +61,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/bark/requirements-cpu.txt
+++ b/backend/python/bark/requirements-cpu.txt
@@ -1,4 +1,4 @@
 transformers
 accelerate
-torch==2.4.1
+torch
-torchaudio==2.4.1
+torchaudio
--- a/backend/python/bark/requirements-cublas11.txt
+++ b/backend/python/bark/requirements-cublas11.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.4.1+cu118
+torch
-torchaudio==2.4.1+cu118
+torchaudio
 transformers
 accelerate
--- a/backend/python/bark/requirements-cublas12.txt
+++ b/backend/python/bark/requirements-cublas12.txt
@@ -1,4 +1,4 @@
-torch==2.4.1
+torch
-torchaudio==2.4.1
+torchaudio
 transformers
 accelerate
--- a/backend/python/bark/requirements-hipblas.txt
+++ b/backend/python/bark/requirements-hipblas.txt
@@ -1,5 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.4.1+rocm6.0
+torch
-torchaudio==2.4.1+rocm6.0
+torchaudio
 transformers
 accelerate
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -1,9 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.72.0
+grpcio==1.66.2
 protobuf
 certifi
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -17,9 +17,6 @@
 # LIMIT_TARGETS="cublas12"
 # source $(dirname $0)/../common/libbackend.sh
 #
 PYTHON_VERSION="3.10"
 function init() {
    # Name of the backend (directory name)
    BACKEND_NAME=${PWD##*/}
@@ -91,7 +88,7 @@ function getBuildProfile() {
 # always result in an activated virtual environment
 function ensureVenv() {
    if [ ! -d "${EDIR}/venv" ]; then
-        uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+        uv venv ${EDIR}/venv
        echo "virtualenv created"
    fi
--- a/backend/python/common/template/Makefile
+++ b/backend/python/common/template/Makefile
@@ -1,9 +1,8 @@
 .DEFAULT_GOAL := install
 .PHONY: install
-install:
+install: protogen
 	bash install.sh
 	$(MAKE) protogen
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
@@ -13,7 +12,7 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
-	bash protogen.sh
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 .PHONY: clean
 clean: protogen-clean
--- a/backend/python/common/template/protogen.sh
+++ b/backend/python/common/template/protogen.sh
@@ -1,6 +0,0 @@
 #!/bin/bash
 # Generates the Python gRPC code for backend.proto into the current directory.
 set -e
 # Quote the script path so sourcing still works from a directory whose name contains spaces.
 source "$(dirname "$0")/../common/libbackend.sh"
 python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/common/template/requirements-intel.txt
+++ b/backend/python/common/template/requirements-intel.txt
@@ -1,5 +1,4 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,2 @@
-grpcio==1.72.0
+grpcio==1.66.2
-protobuf
+protobuf
 grpcio-tools
--- a/backend/python/coqui/backend.py
+++ b/backend/python/coqui/backend.py
@@ -86,12 +86,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/coqui/requirements-cpu.txt
+++ b/backend/python/coqui/requirements-cpu.txt
@@ -1,4 +1,3 @@
-transformers==4.48.3
+transformers
 accelerate
-torch==2.4.1
+torch
 coqui-tts
--- a/backend/python/coqui/requirements-cublas11.txt
+++ b/backend/python/coqui/requirements-cublas11.txt
@@ -1,6 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.4.1+cu118
+torch
-torchaudio==2.4.1+cu118
+torchaudio
-transformers==4.48.3
+transformers
-accelerate
+accelerate
 coqui-tts
--- a/backend/python/coqui/requirements-cublas12.txt
+++ b/backend/python/coqui/requirements-cublas12.txt
@@ -1,5 +1,4 @@
-torch==2.4.1
+torch
-torchaudio==2.4.1
+torchaudio
-transformers==4.48.3
+transformers
-accelerate
+accelerate
 coqui-tts
--- a/backend/python/coqui/requirements-hipblas.txt
+++ b/backend/python/coqui/requirements-hipblas.txt
@@ -1,6 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.4.1+rocm6.0
+torch
-torchaudio==2.4.1+rocm6.0
+torchaudio
-transformers==4.48.3
+transformers
-accelerate
+accelerate
 coqui-tts
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -1,10 +1,8 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
+intel-extension-for-pytorch
-torch==2.3.1+cxx11.abi
+torch
-torchaudio==2.3.1+cxx11.abi
+torchaudio
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
-setuptools
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
-transformers==4.48.3
+transformers
-accelerate
+accelerate
 coqui-tts
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.72.0
+coqui-tts
 grpcio==1.66.2
 protobuf
-certifi
+certifi
 packaging==24.1
--- a/Show More
+++ b/Show More
`@@ -1,2 +1 @@`
	`*.sh text eol=lf`	`*.sh text eol=lf`
	`backend/cpp/llama/*.hpp linguist-vendored`