chore(deps): bump llama.cpp to '10f2e81809bbb69ecfe64fc8b4686285f84b0c07'

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-25 01:02:05 -04:00 · 2025-03-12 09:12:59 +01:00
180 changed files with 2252 additions and 7306 deletions
--- a/.env
+++ b/.env
@@ -29,9 +29,6 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true
 # Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
 # LOCALAI_FORCE_BACKEND_SHUTDOWN=true
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -76,7 +73,7 @@
 ### Define a list of GRPC Servers for llama-cpp workers to distribute the load
 # https://github.com/ggerganov/llama.cpp/pull/6829
-# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
 # LLAMACPP_GRPC_SERVERS=""
 ### Enable to run parallel requests
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -29,6 +29,10 @@ updates:
    schedule:
      # Check for updates to GitHub Actions every week
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/autogptq"
    schedule:
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/bark"
    schedule:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,7 +12,7 @@ jobs:
          - repository: "ggml-org/llama.cpp"
            variable: "CPPLLAMA_VERSION"
            branch: "master"
-          - repository: "ggml-org/whisper.cpp"
+          - repository: "ggerganov/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
          - repository: "PABannier/bark.cpp"
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -14,7 +14,7 @@ jobs:
    steps:
      - name: Dependabot metadata
        id: metadata
-        uses: dependabot/fetch-metadata@v2.4.0
+        uses: dependabot/fetch-metadata@v2.3.0
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          skip-commit-verification: true
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -42,7 +42,7 @@ jobs:
            script: |
                sudo rm -rf local-ai/ || true
      - name: copy file via ssh
-        uses: appleboy/scp-action@v1.0.0
+        uses: appleboy/scp-action@v0.1.7
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -15,7 +15,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - base-image: intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -33,7 +33,6 @@ jobs:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
      max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
      fail-fast: false
      matrix:
        include:
          # This is basically covered by the AIO test
@@ -57,35 +56,26 @@ jobs:
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
+          # - build-type: 'hipblas'
-            platforms: 'linux/amd64'
+          #   platforms: 'linux/amd64'
-            tag-latest: 'false'
+          #   tag-latest: 'false'
-            tag-suffix: '-hipblas'
+          #   tag-suffix: '-hipblas'
-            ffmpeg: 'false'
+          #   ffmpeg: 'false'
-            image-type: 'extras'
+          #   image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
+          #   base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
+          #   grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
+          #   runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
+          #   makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'sycl_f16'
+          # - build-type: 'sycl_f16'
-            platforms: 'linux/amd64'
+          #   platforms: 'linux/amd64'
-            tag-latest: 'false'
+          #   tag-latest: 'false'
-            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+          #   base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
-            grpc-base-image: "ubuntu:22.04"
+          #   grpc-base-image: "ubuntu:22.04"
-            tag-suffix: 'sycl-f16-ffmpeg'
+          #   tag-suffix: 'sycl-f16-ffmpeg'
-            ffmpeg: 'true'
+          #   ffmpeg: 'true'
-            image-type: 'extras'
+          #   image-type: 'extras'
-            runs-on: 'arc-runner-set'
+          #   runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
+          #   makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-vulkan-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
  # core-image-build:
  #   uses: ./.github/workflows/image_build.yml
  #   with:
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -45,13 +45,13 @@ jobs:
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: '-hipblas-extras'
+            tag-suffix: '-hipblas-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
-            latest-image: 'latest-gpu-hipblas-extras'
+            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
@@ -59,13 +59,32 @@ jobs:
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas'
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-            latest-image: 'latest-gpu-hipblas'
+          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-core'
            ffmpeg: 'false'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -95,58 +114,110 @@ jobs:
      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
      matrix:
        include:
          # Extra images
          - build-type: ''
            #platforms: 'linux/amd64,linux/arm64'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: ''
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11-extras'
+            tag-suffix: '-cublas-cuda11'
-            ffmpeg: 'true'
+            ffmpeg: ''
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            aio: "-aio-gpu-nvidia-cuda-11"
            latest-image: 'latest-gpu-nvidia-cuda-11-extras'
            latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-extras'
+            tag-suffix: '-cublas-cuda12'
            ffmpeg: ''
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-cublas-cuda11-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            aio: "-aio-gpu-nvidia-cuda-11"
            latest-image: 'latest-gpu-nvidia-cuda-11'
            latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-cublas-cuda12-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            aio: "-aio-gpu-nvidia-cuda-12"
-            latest-image: 'latest-gpu-nvidia-cuda-12-extras'
+            latest-image: 'latest-gpu-nvidia-cuda-12'
            latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: ''
            #platforms: 'linux/amd64,linux/arm64'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: ''
            ffmpeg: ''
            image-type: 'extras'
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f16-extras'
+            tag-suffix: '-sycl-f16-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            aio: "-aio-gpu-intel-f16"
-            latest-image: 'latest-gpu-intel-f16-extras'
+            latest-image: 'latest-gpu-intel-f16'
            latest-image-aio: 'latest-aio-gpu-intel-f16'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
-            tag-latest: 'false'
+            tag-latest: 'auto'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f32-extras'
+            tag-suffix: '-sycl-f32-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            runs-on: 'arc-runner-set'
            aio: "-aio-gpu-intel-f32"
-            latest-image: 'latest-gpu-intel-f32-extras'
+            latest-image: 'latest-gpu-intel-f32'
            latest-image-aio: 'latest-aio-gpu-intel-f32'
            makeflags: "--jobs=3 --output-sync=target"
          # Core images
@@ -155,23 +226,41 @@ jobs:
            tag-latest: 'false'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f16'
+            tag-suffix: '-sycl-f16-core'
-            ffmpeg: 'true'
+            ffmpeg: 'false'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
            latest-image: 'latest-gpu-intel-f16'
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
-            tag-suffix: '-sycl-f32'
+            tag-suffix: '-sycl-f32-core'
            ffmpeg: 'false'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f16-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f32'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
            grpc-base-image: "ubuntu:22.04"
            tag-suffix: '-sycl-f32-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
            latest-image: 'latest-gpu-intel-f32'
  core-image-build:
    uses: ./.github/workflows/image_build.yml
@@ -204,7 +293,7 @@ jobs:
          - build-type: ''
            platforms: 'linux/amd64,linux/arm64'
            tag-latest: 'auto'
-            tag-suffix: ''
+            tag-suffix: '-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "ubuntu:22.04"
@@ -219,38 +308,60 @@ jobs:
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11'
+            tag-suffix: '-cublas-cuda11-core'
-            ffmpeg: 'true'
+            ffmpeg: ''
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
            latest-image: 'latest-gpu-nvidia-cuda-12'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12'
+            tag-suffix: '-cublas-cuda12-core'
            ffmpeg: ''
            image-type: 'core'
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda11-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
            latest-image: 'latest-gpu-nvidia-cuda-12'
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-vulkan'
+            tag-suffix: '-vulkan-ffmpeg-core'
            latest-image: 'latest-vulkan-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
            latest-image: 'latest-gpu-vulkan'
  gh-runner:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -283,8 +394,8 @@ jobs:
            cuda-minor-version: "0"
            platforms: 'linux/arm64'
            tag-latest: 'false'
-            tag-suffix: '-nvidia-l4t-arm64'
+            tag-suffix: '-nvidia-l4t-arm64-core'
-            latest-image: 'latest-nvidia-l4t-arm64'
+            latest-image: 'latest-nvidia-l4t-arm64-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -8,7 +8,7 @@ jobs:
  notify-discord:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: gemma-3-12b-it
+        MODEL_NAME: hermes-2-theta-llama-3-8b
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
@@ -16,7 +16,7 @@ jobs:
        fetch-depth: 0 # needed to checkout all branches for this Action to work
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
        # Check the PR diff using the current branch and the base branch of the PR
    - uses: GrantBirki/git-diff-action@v2.8.0
      id: git-diff-action
@@ -79,7 +79,7 @@ jobs:
        args: ${{ steps.summarize.outputs.message }}
    - name: Setup tmate session if fails
      if: ${{ failure() }}
-      uses: mxschmitt/action-tmate@v3.22
+      uses: mxschmitt/action-tmate@v3.19
      with:
        detached: true
        connect-timeout-seconds: 180
@@ -87,7 +87,7 @@ jobs:
  notify-twitter:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
-        MODEL_NAME: gemma-3-12b-it
+        MODEL_NAME: hermes-2-theta-llama-3-8b
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
@@ -161,7 +161,7 @@ jobs:
        TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
    - name: Setup tmate session if fails
      if: ${{ failure() }}
-      uses: mxschmitt/action-tmate@v3.22
+      uses: mxschmitt/action-tmate@v3.19
      with:
        detached: true
        connect-timeout-seconds: 180
--- a/.github/workflows/notify-releases.yaml
+++ b/.github/workflows/notify-releases.yaml
@@ -14,7 +14,7 @@ jobs:
    steps:
    - uses: mudler/localai-github-action@v1
      with:
-        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
    - name: Summarize
      id: summarize
      run: |
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -36,7 +36,6 @@ jobs:
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
          sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
          make install-go-tools
      - name: Install CUDA Dependencies
        run: |
          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
@@ -124,7 +123,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -152,7 +151,6 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
          make install-go-tools
      - name: Intel Dependencies
        run: |
          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
@@ -234,7 +232,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -255,7 +253,8 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc
-          make install-go-tools
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
      - name: Build
        id: build
        run: |
@@ -276,7 +275,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -296,7 +295,8 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc libomp llvm
-          make install-go-tools
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build
        id: build
        run: |
@@ -317,7 +317,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.4
+        uses: securego/gosec@v2.22.0
        with:
          # we let the report content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -78,26 +78,6 @@ jobs:
          make --jobs=5 --output-sync=target -C backend/python/diffusers
          make --jobs=5 --output-sync=target -C backend/python/diffusers test
  #tests-vllm:
  #  runs-on: ubuntu-latest
  #  steps:
  #    - name: Clone
  #      uses: actions/checkout@v4
  #      with:
  #        submodules: true
  #    - name: Dependencies
  #      run: |
  #        sudo apt-get update
  #        sudo apt-get install -y build-essential ffmpeg
  #        sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #        sudo apt-get install -y libopencv-dev
  #        # Install UV
  #        curl -LsSf https://astral.sh/uv/install.sh | sh
  #        pip install --user --no-cache-dir grpcio-tools==1.64.1
  #    - name: Test vllm backend
  #      run: |
  #        make --jobs=5 --output-sync=target -C backend/python/vllm
  #        make --jobs=5 --output-sync=target -C backend/python/vllm test
  # tests-transformers-musicgen:
  #   runs-on: ubuntu-latest
  #   steps:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -71,7 +71,7 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
-          sudo apt-get install -y libgmock-dev clang
+          sudo apt-get install -y libgmock-dev
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
@@ -96,7 +96,6 @@ jobs:
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install github.com/GeertJohan/go.rice/rice@latest
          # The python3-grpc-tools package in 22.04 is too old
          pip install --user grpcio-tools
@@ -131,7 +130,7 @@ jobs:
          PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -184,7 +183,6 @@ jobs:
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install github.com/GeertJohan/go.rice/rice@latest
          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Build images
        run: |
@@ -196,7 +194,7 @@ jobs:
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -224,7 +222,6 @@ jobs:
        run: |
          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
          pip install --user --no-cache-dir grpcio-tools
          go install github.com/GeertJohan/go.rice/rice@latest
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
@@ -235,7 +232,7 @@ jobs:
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.22
+        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/18
+++ b/18
@@ -15,7 +15,7 @@ ARG TARGETARCH
 ARG TARGETVARIANT
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
@@ -24,7 +24,6 @@ RUN apt-get update && \
        ca-certificates \
        curl libssl-dev \
        git \
        git-lfs \
        unzip upx-ucl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
@@ -46,10 +45,9 @@ EOT
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
-# Install grpc compilers and rice
+# Install grpc compilers
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
-    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af && \
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
    go install github.com/GeertJohan/go.rice/rice@latest
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -301,9 +299,10 @@ COPY .git .
 RUN make prepare
 ## Build the binary
-## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
+## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
-## Otherwise just run the normal build
+## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
-RUN if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
+## (both will use CUDA or hipblas for the actual computation)
 RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
    else \
        make build; \
@@ -431,6 +430,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/vllm \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/autogptq \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/bark \
    ; fi && \
--- a/102
+++ b/102
@@ -6,11 +6,11 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-CPPLLAMA_VERSION?=e5c834f718a32b7584f142799bbf508fddb9021c
+CPPLLAMA_VERSION?=10f2e81809bbb69ecfe64fc8b4686285f84b0c07
 # whisper.cpp version
-WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
+WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=e41bc5c61ae66af6be2bd7011769bb821a83e8ae
+WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
 # go-piper version
 PIPER_REPO?=https://github.com/mudler/go-piper
@@ -21,11 +21,8 @@ BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
 BARKCPP_VERSION?=v1.0.0
 # stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/richiejp/stable-diffusion.cpp
+STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=53e3b17eb3d0b5760ced06a1f98320b68b34aaae
+STABLEDIFFUSION_GGML_VERSION?=19d876ee300a055629926ff836489901f734f2b7
 # ONEAPI variables for SYCL
 export ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -33,12 +30,8 @@ ONNX_OS?=linux
 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
-export CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
+export CMAKE_ARGS?=
 export WHISPER_CMAKE_ARGS?=-DBUILD_SHARED_LIBS=OFF
 export BACKEND_LIBS?=
 export WHISPER_DIR=$(abspath ./sources/whisper.cpp)
 export WHISPER_INCLUDE_PATH=$(WHISPER_DIR)/include:$(WHISPER_DIR)/ggml/include
 export WHISPER_LIBRARY_PATH=$(WHISPER_DIR)/build/src/:$(WHISPER_DIR)/build/ggml/src
 CGO_LDFLAGS?=
 CGO_LDFLAGS_WHISPER?=
@@ -88,7 +81,6 @@ endif
 # IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
 ifeq ($(NATIVE),false)
 	CMAKE_ARGS+=-DGGML_NATIVE=OFF
 	WHISPER_CMAKE_ARGS+=-DGGML_NATIVE=OFF
 endif
 # Detect if we are running on arm64
@@ -116,31 +108,13 @@ ifeq ($(OS),Darwin)
 	# disable metal if on Darwin and any other value is explicitly passed.
 	else ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 		WHISPER_CMAKE_ARGS+=-DGGML_METAL=OFF
 		export GGML_NO_ACCELERATE=1
 		export GGML_NO_METAL=1
 		GO_LDFLAGS_WHISPER+=-lggml-blas
 		export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 	endif
 	ifeq ($(BUILD_TYPE),metal)
 #			-lcblas 	removed: it seems to always be listed as a duplicate flag.
 		CGO_LDFLAGS += -framework Accelerate
 		CGO_LDFLAGS_WHISPER+=-lggml-metal -lggml-blas
 		CMAKE_ARGS+=-DGGML_METAL=ON
 		CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		CMAKE_ARGS+=-DGGML_OPENMP=OFF
 		WHISPER_CMAKE_ARGS+=-DGGML_METAL=ON
 		WHISPER_CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
 		WHISPER_CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_EXAMPLES=OFF
 		WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_TESTS=OFF
 		WHISPER_CMAKE_ARGS+=-DWHISPER_BUILD_SERVER=OFF
 		WHISPER_CMAKE_ARGS+=-DGGML_OPENMP=OFF
 		export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-metal/:$(WHISPER_DIR)/build/ggml/src/ggml-blas
 	else
 		CGO_LDFLAGS_WHISPER+=-lggml-blas
 		export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-blas
 	endif
 else
 CGO_LDFLAGS_WHISPER+=-lgomp
@@ -152,29 +126,21 @@ ifeq ($(BUILD_TYPE),openblas)
 endif
 ifeq ($(BUILD_TYPE),cublas)
-	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH) -L$(CUDA_LIBPATH)/stubs/ -lcuda
+	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export GGML_CUDA=1
-	CMAKE_ARGS+=-DGGML_CUDA=ON
+	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
 	WHISPER_CMAKE_ARGS+=-DGGML_CUDA=ON
 	CGO_LDFLAGS_WHISPER+=-lcufft -lggml-cuda
 	export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-cuda/
 endif
 ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=1
 	WHISPER_CMAKE_ARGS+=-DGGML_VULKAN=1
 	CGO_LDFLAGS_WHISPER+=-lggml-vulkan -lvulkan
 	export WHISPER_LIBRARY_PATH:=$(WHISPER_LIBRARY_PATH):$(WHISPER_DIR)/build/ggml/src/ggml-vulkan/
 endif
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	export GGML_SYCL=1
 	CMAKE_ARGS+=-DGGML_SYCL=ON
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
 	export GGML_SYCL_F16=1
 	CMAKE_ARGS+=-DGGML_SYCL_F16=ON
 endif
 ifeq ($(BUILD_TYPE),hipblas)
@@ -185,7 +151,7 @@ ifeq ($(BUILD_TYPE),hipblas)
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	export STABLE_BUILD_TYPE=
 	export GGML_HIP=1
-	GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
+	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
 	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
@@ -294,7 +260,11 @@ backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
 	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
 backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-	$(MAKE) -C backend/go/image/stablediffusion-ggml CGO_LDFLAGS="$(CGO_LDFLAGS)" stablediffusion-ggml
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/stablediffusion-ggml
 endif
 sources/onnxruntime:
 	mkdir -p sources/onnxruntime
@@ -320,9 +290,8 @@ sources/whisper.cpp:
 	git checkout $(WHISPER_CPP_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
-sources/whisper.cpp/build/src/libwhisper.a: sources/whisper.cpp
+sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
-	cd sources/whisper.cpp && cmake $(WHISPER_CMAKE_ARGS) . -B ./build
+	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
 	cd sources/whisper.cpp/build && cmake --build . --config Release
 get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
@@ -372,14 +341,8 @@ clean-tests:
 clean-dc: clean
 	cp -r /build/backend-assets /workspace/backend-assets
 ## Install Go tools
 install-go-tools:
 	go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
 	go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
 	go install github.com/GeertJohan/go.rice/rice@latest
 ## Build:
-build: prepare backend-assets grpcs install-go-tools ## Build the project
+build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
 	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
@@ -389,9 +352,7 @@ ifneq ($(BACKEND_LIBS),)
 	$(MAKE) backend-assets/lib
 	cp -f $(BACKEND_LIBS) backend-assets/lib/
 endif
 	rm -rf $(BINARY_NAME) || true
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
 	rice append --exec $(BINARY_NAME)
 build-minimal:
 	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
@@ -463,7 +424,6 @@ prepare-test: grpcs
 	cp -rf backend-assets core/http
 	cp tests/models_fixtures/* test-models
 ## Test targets
 test: prepare test-models/testmodel.ggml grpcs
 	@echo 'Running tests'
 	export GO_TAGS="tts debug"
@@ -538,7 +498,7 @@ protogen: protogen-go protogen-python
 protogen-clean: protogen-go-clean protogen-python-clean
 .PHONY: protogen-go
-protogen-go: install-go-tools
+protogen-go:
 	mkdir -p pkg/grpc/proto
 	protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
    backend/backend.proto
@@ -549,10 +509,18 @@ protogen-go-clean:
 	$(RM) bin/*
 .PHONY: protogen-python
-protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
 .PHONY: protogen-python-clean
-protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
 .PHONY: autogptq-protogen
 autogptq-protogen:
 	$(MAKE) -C backend/python/autogptq protogen
 .PHONY: autogptq-protogen-clean
 autogptq-protogen-clean:
 	$(MAKE) -C backend/python/autogptq protogen-clean
 .PHONY: bark-protogen
 bark-protogen:
@@ -629,6 +597,7 @@ vllm-protogen-clean:
 ## GRPC
 # Note: it is duplicated in the Dockerfile
 prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/autogptq
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
@@ -642,12 +611,10 @@ prepare-extra-conda-environments: protogen-python
 prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/diffusers
 	$(MAKE) -C backend/python/vllm
 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
 	$(MAKE) -C backend/python/diffusers test
 	$(MAKE) -C backend/python/vllm test
 backend-assets:
 	mkdir -p backend-assets
@@ -789,8 +756,8 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/silero-vad
 endif
-backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/build/src/libwhisper.a backend-assets/grpc
+backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="${WHISPER_INCLUDE_PATH}" LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" LD_LIBRARY_PATH="${WHISPER_LIBRARY_PATH}" \
+	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/whisper
@@ -842,8 +809,7 @@ docker-aio-all:
 docker-image-intel:
 	docker build \
-		--progress plain \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
 		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu24.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -851,7 +817,7 @@ docker-image-intel:
 docker-image-intel-xpu:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.1.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 <h1 align="center">
  <br>
-  <img height="300" src="./core/http/static/logo.png"> <br>
+  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
    LocalAI
 <br>
 </h1>
@@ -30,7 +31,7 @@
 <p align="center">
 <a href="https://twitter.com/LocalAI_API" target="blank">
-<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/>
+<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
 </a>
 <a href="https://discord.gg/uJAeKSAGDy" target="blank">
 <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
@@ -43,89 +44,32 @@
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) 
 [![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/localaiofficial_bot)
 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio, and more, locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
-
+![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
 ## 📚🆕 Local Stack Family
 🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
 <table>
  <tr>
    <td width="50%" valign="top">
      <a href="https://github.com/mudler/LocalAGI">
        <img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
      </a>
    </td>
    <td width="50%" valign="top">
      <h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
      <p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
    </td>
  </tr>
  <tr>
    <td width="50%" valign="top">
      <a href="https://github.com/mudler/LocalRecall">
        <img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
      </a>
    </td>
    <td width="50%" valign="top">
      <h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
      <p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
    </td>
  </tr>
 </table>
 ## Screenshots
 | Talk Interface | Generate Audio |
 | --- | --- |
 | ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
 | Models Overview | Generate Images |
 | --- | --- |
 | ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
 | Chat Interface | Home |
 | --- | --- |
 | ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
 | Login | Swarm |
 | --- | --- |
 |![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
 ## 💻 Quickstart
 Run the installer script:
 ```bash
 # Basic installation
 curl https://localai.io/install.sh | sh
 ```
 For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).
 Or run with docker:
 ### CPU only image:
 ```bash
 # CPU only image:
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
-```
+
-### Nvidia GPU:
+# Nvidia GPU:
 ```bash
 docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
-```
+
-### CPU and GPU image (bigger size):
+# CPU and GPU image (bigger size):
 ```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
-```
+
-### AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
+# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
 ```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 ```
@@ -144,13 +88,10 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```
-For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
+[💻 Getting started](https://localai.io/basics/getting_started/index.html)
 ## 📰 Latest project news
 - Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
 - Apr 2025: WebUI overhaul, AIO images updates
 - Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
 - Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
 - Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
 - Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
@@ -164,6 +105,19 @@ For more information, see [💻 Getting started](https://localai.io/basics/getti
 Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 ## 🔥🔥 Hot topics (looking for help):
 - Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
 - Realtime API https://github.com/mudler/LocalAI/issues/3714
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
 - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
 - Assistant API: https://github.com/mudler/LocalAI/issues/1273
 - Vulkan: https://github.com/mudler/LocalAI/issues/1647
 - Anthropic API: https://github.com/mudler/LocalAI/issues/1808
 If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
 ## 🚀 [Features](https://localai.io/features/)
 - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
@@ -177,10 +131,12 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
 - [Agentic capabilities](https://github.com/mudler/LocalAGI)
 - 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!
 ## 💻 Usage
 Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
 ### 🔗 Community and integrations
--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -1,7 +1,7 @@
 embeddings: true
 name: text-embedding-ada-002
 embeddings: true
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
 usage: |
    You can test this model with curl like this:
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -1,57 +1,101 @@
 context_size: 8192
 f16: true
 function:
  grammar:
    no_mixed_free_string: true
    schema_type: llama3.1 # or JSON is supported too (json)
  response_regex:
  - <function=(?P<name>\w+)>(?P<arguments>.*)</function>
 mmap: true
 name: gpt-4
 mmap: true
 parameters:
-  model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 context_size: 8192
 stopwords:
- <|im_end|>
+- "<|im_end|>"
- <dummy32000>
+- "<dummy32000>"
- <|eot_id|>
+- "</tool_call>"
- <|end_of_text|>
+- "<|eot_id|>"
 - "<|end_of_text|>"
 function:
  # disable injecting the "answer" tool
  disable_no_action: true
  grammar:
    # This allows the grammar to also return messages
    mixed_mode: true
    # Suffix to add to the grammar
    #prefix: '<tool_call>\n'
    # Force parallel calls in the grammar
    # parallel_calls: true
  return_name_in_function_response: true
  # Without grammar uncomment the lines below
  # Warning: this is relying only on the capability of the
  # LLM model to generate the correct function call.
  json_regex_match: 
   - "(?s)<tool_call>(.*?)</tool_call>"
   - "(?s)<tool_call>(.*?)"
  replace_llm_results:
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
  replace_function_results: 
  # Replace everything that is not JSON array or object
  # 
  - key: '(?s)^[^{\[]*'
    value: ""
  - key: '(?s)[^}\]]*$'
    value: ""
  - key: "'([^']*?)'"
    value: "_DQUOTE_${1}_DQUOTE_"
  - key: '\\"'
    value: "__TEMP_QUOTE__"
  - key: "\'"
    value: "'"
  - key: "_DQUOTE_"
    value: '"'
  - key: "__TEMP_QUOTE__"
    value: '"'
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
 template:
  chat: |
-    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+    {{.Input -}}
-    You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
+    <|im_start|>assistant
    {{.Input }}
    <|start_header_id|>assistant<|end_header_id|>
  chat_message: |
-    <|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall }}
-    {{ else if eq .RoleName "tool" -}}
+    <tool_call>
-    The Function was executed and the response was:
+    {{- else if eq .RoleName "tool" }}
-    {{ end -}}
+    <tool_response>
-    {{ if .Content -}}
+    {{- end }}
-    {{.Content -}}
+    {{- if .Content}}
-    {{ else if .FunctionCall -}}
+    {{.Content }}
-    {{ range .FunctionCall }}
+    {{- end }}
-    [{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
+    {{- if .FunctionCall}}
-    {{ end }}
+    {{toJson .FunctionCall}}
-    {{ end -}}
+    {{- end }}
-    <|eot_id|>
+    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
-    <|start_header_id|>system<|end_header_id|>
+    <|im_start|>system
-    You are an expert in composing functions. You are given a question and a set of possible functions.
+    You are a function calling AI model.
-    Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
+    Here are the available tools:
-    If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
+    <tools>
-    If you decide to invoke any of the function(s), you MUST put it in the format as follows:
+    {{range .Functions}}
-    [func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
-    You SHOULD NOT include any other text in the response.
+    {{end}}
-    Here is a list of functions in JSON format that you can invoke.
+    </tools>
-    {{toJson .Functions}}
+    You should call the tools provided to you sequentially
-    <|eot_id|><|start_header_id|>user<|end_header_id|>
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
-    {{.Input}}
+    <scratchpad>
-    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+    {step-by-step reasoning and plan in bullet points}
-
+    </scratchpad>
-download_files:
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+    <tool_call>
-  sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
+    {"arguments": <args-dict>, "name": <function-name>}
-  uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -1,49 +1,31 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: bakllava-mmproj.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
+  model: bakllava.gguf
-stopwords:
+
 - <|im_end|>
 - <dummy32000>
 - </s>
 - <|endoftext|>
 template:
  chat: |
-    {{.Input -}}
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+    ASSISTANT:
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
+- filename: bakllava.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+  uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: bakllava-mmproj.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
+  uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/gpu-8g/embeddings.yaml
+++ b/aio/gpu-8g/embeddings.yaml
@@ -1,7 +1,7 @@
 embeddings: true
 name: text-embedding-ada-002
 backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2
 usage: |
    You can test this model with curl like this:
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -1,53 +1,101 @@
 context_size: 4096
 f16: true
 function:
  capture_llm_results:
  - (?s)<Thought>(.*?)</Thought>
  grammar:
    properties_order: name,arguments
  json_regex_match:
  - (?s)<Output>(.*?)</Output>
  replace_llm_results:
  - key: (?s)<Thought>(.*?)</Thought>
    value: ""
 mmap: true
 name: gpt-4
 mmap: true
 parameters:
-  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 context_size: 8192
 stopwords:
- <|im_end|>
+- "<|im_end|>"
- <dummy32000>
+- "<dummy32000>"
- </s>
+- "</tool_call>"
 - "<|eot_id|>"
 - "<|end_of_text|>"
 function:
  # disable injecting the "answer" tool
  disable_no_action: true
  grammar:
    # This allows the grammar to also return messages
    mixed_mode: true
    # Suffix to add to the grammar
    #prefix: '<tool_call>\n'
    # Force parallel calls in the grammar
    # parallel_calls: true
  return_name_in_function_response: true
  # Without grammar uncomment the lines below
  # Warning: this is relying only on the capability of the
  # LLM model to generate the correct function call.
  json_regex_match: 
   - "(?s)<tool_call>(.*?)</tool_call>"
   - "(?s)<tool_call>(.*?)"
  replace_llm_results:
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
  replace_function_results: 
  # Replace everything that is not JSON array or object
  # 
  - key: '(?s)^[^{\[]*'
    value: ""
  - key: '(?s)[^}\]]*$'
    value: ""
  - key: "'([^']*?)'"
    value: "_DQUOTE_${1}_DQUOTE_"
  - key: '\\"'
    value: "__TEMP_QUOTE__"
  - key: "\'"
    value: "'"
  - key: "_DQUOTE_"
    value: '"'
  - key: "__TEMP_QUOTE__"
    value: '"'
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{ .RoleName }}
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall }}
-    Function call:
+    <tool_call>
-    {{ else if eq .RoleName "tool" -}}
+    {{- else if eq .RoleName "tool" }}
-    Function response:
+    <tool_response>
-    {{ end -}}
+    {{- end }}
-    {{ if .Content -}}
+    {{- if .Content}}
    {{.Content }}
-    {{ end -}}
+    {{- end }}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
+    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
    <|im_start|>system
-    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    You are a function calling AI model.
    Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    <|im_end|>
+    </tools>
    You should call the tools provided to you sequentially
    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
    <scratchpad>
    {step-by-step reasoning and plan in bullet points}
    </scratchpad>
    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
    <tool_call>
    {"arguments": <args-dict>, "name": <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
 - filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
  sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
  uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -1,49 +1,35 @@
 backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
-stopwords:
+  temperature: 0.2
- <|im_end|>
+  top_k: 40
- <dummy32000>
+  top_p: 0.95
- </s>
+  seed: -1
- <|endoftext|>
+
 template:
  chat: |
-    {{.Input -}}
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+    ASSISTANT:
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/intel/embeddings.yaml
+++ b/aio/intel/embeddings.yaml
@@ -1,7 +1,7 @@
 embeddings: true
 name: text-embedding-ada-002
 backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2
 usage: |
    You can test this model with curl like this:
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -1,53 +1,103 @@
 context_size: 4096
 f16: true
 function:
  capture_llm_results:
  - (?s)<Thought>(.*?)</Thought>
  grammar:
    properties_order: name,arguments
  json_regex_match:
  - (?s)<Output>(.*?)</Output>
  replace_llm_results:
  - key: (?s)<Thought>(.*?)</Thought>
    value: ""
 mmap: true
 name: gpt-4
 mmap: false
 context_size: 8192
 f16: false
 parameters:
-  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
 stopwords:
- <|im_end|>
+- "<|im_end|>"
- <dummy32000>
+- "<dummy32000>"
- </s>
+- "</tool_call>"
 - "<|eot_id|>"
 - "<|end_of_text|>"
 function:
  # disable injecting the "answer" tool
  disable_no_action: true
  grammar:
    # This allows the grammar to also return messages
    mixed_mode: true
    # Prefix to add to the grammar
    #prefix: '<tool_call>\n'
    # Force parallel calls in the grammar
    # parallel_calls: true
  return_name_in_function_response: true
  # Without grammar uncomment the lines below
  # Warning: this is relying only on the capability of the
  # LLM model to generate the correct function call.
  json_regex_match: 
   - "(?s)<tool_call>(.*?)</tool_call>"
   - "(?s)<tool_call>(.*?)"
  replace_llm_results:
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
  replace_function_results: 
  # Replace everything that is not JSON array or object
  # 
  - key: '(?s)^[^{\[]*'
    value: ""
  - key: '(?s)[^}\]]*$'
    value: ""
  - key: "'([^']*?)'"
    value: "_DQUOTE_${1}_DQUOTE_"
  - key: '\\"'
    value: "__TEMP_QUOTE__"
  - key: "\'"
    value: "'"
  - key: "_DQUOTE_"
    value: '"'
  - key: "__TEMP_QUOTE__"
    value: '"'
  # Drop the scratchpad content from responses
  - key: "(?s)<scratchpad>.*</scratchpad>"
    value: ""
 template:
  chat: |
    {{.Input -}}
    <|im_start|>assistant
  chat_message: |
-    <|im_start|>{{ .RoleName }}
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall }}
-    Function call:
+    <tool_call>
-    {{ else if eq .RoleName "tool" -}}
+    {{- else if eq .RoleName "tool" }}
-    Function response:
+    <tool_response>
-    {{ end -}}
+    {{- end }}
-    {{ if .Content -}}
+    {{- if .Content}}
    {{.Content }}
-    {{ end -}}
+    {{- end }}
-    {{ if .FunctionCall -}}
+    {{- if .FunctionCall}}
    {{toJson .FunctionCall}}
-    {{ end -}}<|im_end|>
+    {{- end }}
    {{- if .FunctionCall }}
    </tool_call>
    {{- else if eq .RoleName "tool" }}
    </tool_response>
    {{- end }}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+  function: |-
    <|im_start|>system
-    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    You are a function calling AI model.
    Here are the available tools:
    <tools>
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
-    <|im_end|>
+    </tools>
    You should call the tools provided to you sequentially
    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
    <scratchpad>
    {step-by-step reasoning and plan in bullet points}
    </scratchpad>
    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
    <tool_call>
    {"arguments": <args-dict>, "name": <function-name>}
    </tool_call><|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
 - filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -1,50 +1,35 @@
 backend: llama-cpp
 context_size: 4096
-f16: true
+mmap: false
-mmap: true
+f16: false
 mmproj: minicpm-v-2_6-mmproj-f16.gguf
 name: gpt-4o
 roles:
  user: "USER:"
  assistant: "ASSISTANT:"
  system: "SYSTEM:"
 mmproj: llava-v1.6-7b-mmproj-f16.gguf
 parameters:
-  model: minicpm-v-2_6-Q4_K_M.gguf
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
-stopwords:
+  temperature: 0.2
- <|im_end|>
+  top_k: 40
- <dummy32000>
+  top_p: 0.95
- </s>
+  seed: -1
- <|endoftext|>
+
 template:
  chat: |
-    {{.Input -}}
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
    <|im_start|>assistant
  chat_message: |
    <|im_start|>{{ .RoleName }}
    {{ if .FunctionCall -}}
    Function call:
    {{ else if eq .RoleName "tool" -}}
    Function response:
    {{ end -}}
    {{ if .Content -}}
    {{.Content }}
    {{ end -}}
    {{ if .FunctionCall -}}
    {{toJson .FunctionCall}}
    {{ end -}}<|im_end|>
  completion: |
    {{.Input}}
-  function: |
+    ASSISTANT:
    <|im_start|>system
    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
    {{range .Functions}}
    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
    {{end}}
    For each function call return a json object with function name and arguments
    <|im_end|>
    {{.Input -}}
    <|im_start|>assistant
 download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
-  sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
-  uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
+
-  sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
+usage: |
    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/assets.go
+++ b/assets.go
@@ -1,15 +1,6 @@
 package main
-import (
+import "embed"
 	rice "github.com/GeertJohan/go.rice"
 )
-var backendAssets *rice.Box
+//go:embed backend-assets/*
-
+var backendAssets embed.FS
 func init() {
 	var err error
 	backendAssets, err = rice.FindBox("backend-assets")
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -14,7 +14,6 @@ service Backend {
  rpc PredictStream(PredictOptions) returns (stream Reply) {}
  rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
@@ -191,7 +190,11 @@ message ModelOptions {
  int32 NGQA = 20;
  string ModelFile = 21;
-
+  // AutoGPTQ
  string Device = 22;
  bool UseTriton = 23;
  string ModelBaseName = 24;
  bool UseFastTokenizer = 25;
  // Diffusers
  string PipelineType = 26;
@@ -302,19 +305,6 @@ message GenerateImageRequest {
  int32 CLIPSkip = 11;
 }
 // GenerateVideoRequest is the request message accepted by the GenerateVideo RPC.
 message GenerateVideoRequest {
  string prompt = 1;
  string start_image = 2;  // Path or base64 encoded image for the start frame
  string end_image = 3;    // Path or base64 encoded image for the end frame
  int32 width = 4;
  int32 height = 5;
  int32 num_frames = 6;    // Number of frames to generate
  int32 fps = 7;          // Frames per second
  int32 seed = 8;
  float cfg_scale = 9;    // Classifier-free guidance scale
  string dst = 10;        // Output path for the generated video
 }
 message TTSRequest {
  string text = 1;
  string model = 2;
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -1,17 +1,17 @@
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is a hack for now, but it should be fixed in the future.
-# set(TARGET myclip)
+set(TARGET myclip)
-# add_library(${TARGET} clip.cpp clip.h clip-impl.h llava.cpp llava.h)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
-# install(TARGETS ${TARGET} LIBRARY)
+install(TARGETS ${TARGET} LIBRARY)
-# target_include_directories(myclip PUBLIC .)
+target_include_directories(myclip PUBLIC .)
-# target_include_directories(myclip PUBLIC ../..)
+target_include_directories(myclip PUBLIC ../..)
-# target_include_directories(myclip PUBLIC ../../common)
+target_include_directories(myclip PUBLIC ../../common)
-# target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
-# target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
-# if (NOT MSVC)
+if (NOT MSVC)
-#     target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
-# endif()
+endif()
 # END CLIP hack
@@ -75,11 +75,7 @@ add_library(hw_grpc_proto
  ${hw_proto_hdrs} )
 add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
-
+target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
 target_include_directories(${TARGET} PRIVATE ../llava)
 target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
 target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
  absl::flags_parse
  gRPC::${_REFLECTION}
  gRPC::${_GRPC_GRPCPP}
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -8,7 +8,7 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
@@ -36,18 +36,11 @@ else ifeq ($(OS),Darwin)
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DCMAKE_CXX_FLAGS="-fsycl" \
 		-DGGML_SYCL_F16=ON
 endif
 ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 		-DCMAKE_C_COMPILER=icx \
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DCMAKE_CXX_FLAGS="-fsycl"
 endif
 llama.cpp:
@@ -59,8 +52,8 @@ llama.cpp:
 	git checkout -b build $(LLAMA_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
-llama.cpp/tools/grpc-server: llama.cpp
+llama.cpp/examples/grpc-server: llama.cpp
-	mkdir -p llama.cpp/tools/grpc-server
+	mkdir -p llama.cpp/examples/grpc-server
 	bash prepare.sh
 rebuild:
@@ -70,13 +63,13 @@ rebuild:
 purge:
 	rm -rf llama.cpp/build
-	rm -rf llama.cpp/tools/grpc-server
+	rm -rf llama.cpp/examples/grpc-server
 	rm -rf grpc-server
 clean: purge
 	rm -rf llama.cpp
-grpc-server: llama.cpp llama.cpp/tools/grpc-server
+grpc-server: llama.cpp llama.cpp/examples/grpc-server
 	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -11,7 +11,8 @@
 #include <memory>
 #include <string>
 #include <getopt.h>
-#include "mtmd.h"
+#include "clip.h"
 #include "llava.h"
 #include "log.h"
 #include "stb_image.h"
 #include "common.h"
@@ -51,7 +52,7 @@ struct server_params
 {
    std::string hostname = "127.0.0.1";
    std::vector<std::string> api_keys;
-    std::string public_path = "tools/server/public";
+    std::string public_path = "examples/server/public";
    std::string chat_template = "";
    int32_t port = 8080;
    int32_t read_timeout = 600;
@@ -209,8 +210,6 @@ struct llama_client_slot
    int32_t num_prompt_tokens_processed = 0;
    json prompt;
    json data;
    std::string generated_text;
    llama_token sampled;
    std::vector<llama_token> cache_tokens;
@@ -240,7 +239,7 @@ struct llama_client_slot
    int32_t n_past_se = 0; // self-extend
    // multimodal
-    mtmd_context * mctx = nullptr;
+    std::vector<slot_image> images;
    // stats
    size_t sent_count = 0;
@@ -271,6 +270,17 @@ struct llama_client_slot
        n_past_se              = 0;
        generated_token_probs.clear();
        for (slot_image & img : images)
        {
            free(img.image_embedding);
            if (img.img_data) {
                clip_image_u8_free(img.img_data);
            }
            img.prefix_prompt = "";
        }
        images.clear();
    }
    bool has_budget(common_params &global_params) {
@@ -446,9 +456,6 @@ struct llama_server_context
    llama_context *ctx = nullptr;
    const llama_vocab * vocab = nullptr;
    // multimodal
    mtmd_context * mctx = nullptr;
    clip_ctx *clp_ctx = nullptr;
    common_params params;
@@ -460,7 +467,6 @@ struct llama_server_context
    bool all_slots_are_idle = false;
    bool add_bos_token      = true;
    bool has_eos_token      = true;
    bool has_gpu = false;
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_triggers;
@@ -487,10 +493,6 @@ struct llama_server_context
    ~llama_server_context()
    {
        if (mctx) {
            mtmd_free(mctx);
            mctx = nullptr;
        }
        if (ctx)
        {
            llama_free(ctx);
@@ -506,17 +508,12 @@ struct llama_server_context
    bool load_model(const common_params &params_)
    {
        params = params_;
-        if (!params.mmproj.path.empty()) {
+        if (!params.mmproj.empty()) {
            multimodal = true;
            LOG_INFO("Multi Modal Mode Enabled", {});
-            mtmd_context_params mparams = mtmd_context_params_default();
+            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
-            mparams.use_gpu       = has_gpu;
+            if(clp_ctx == nullptr) {
-            mparams.print_timings = false;
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
            mparams.n_threads     = params.cpuparams.n_threads;
            mparams.verbosity     = GGML_LOG_LEVEL_INFO;
            mctx = mtmd_init_from_file(params.mmproj.path.c_str(), model, mparams);
            if (mctx == nullptr) {
                LOG_ERR("failed to load multimodal model, '%s'\n", params.mmproj.path.c_str());
                return false;
            }
@@ -530,7 +527,7 @@ struct llama_server_context
        ctx = common_init.context.release();
        if (model == nullptr)
        {
-            LOG_ERR("unable to load model: %s", params.model.path.c_str());
+            LOG_ERR("unable to load model: %s", params.model.c_str());
            return false;
        }
@@ -578,8 +575,6 @@ struct llama_server_context
            slot.id = i;
            slot.n_ctx = n_ctx_slot;
            slot.n_predict = params.n_predict;
            slot.mctx = mctx;
            //slot.cache_tokens.has_mtmd = mctx != nullptr;
            LOG_INFO("new slot", {
                {"slot_id",    slot.id},
@@ -617,61 +612,54 @@ struct llama_server_context
        batch = llama_batch_init(n_ctx, 0, params.n_parallel);
    }
-    std::vector<server_tokens> tokenize(json &data, const json & json_prompt, bool add_bos) const
+    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
    {
-        mtmd::bitmaps bitmaps;
+        // TODO: currently, we tokenize using special tokens by default
-        std::vector<server_tokens> inputs;
+        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
        //       but it's better compared to completely ignoring ChatML and other chat templates
        const bool TMP_FORCE_SPECIAL = true;
-        if (mctx != nullptr)
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
        // or the first element of the json_prompt array is a string.
        std::vector<llama_token> prompt_tokens;
        if (json_prompt.is_array())
        {
-            const auto &images_data = data.find("image_data");
+            bool first = true;
-            if (images_data != data.end() && images_data->is_array())
+            for (const auto& p : json_prompt)
            {
-                for (const auto &img : *images_data)
+                if (p.is_string())
                {
-                    const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
+                    auto s = p.template get<std::string>();
-
+                    std::vector<llama_token> p;
-                    mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(image_buffer.data(), image_buffer.size()));
+                    if (first)
-                    if (!bmp.ptr) {
+                    {
-                            throw std::runtime_error("Failed to load image");
+                        p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                        first = false;
                    }
-                    // calculate bitmap hash (for KV caching)
+                    else
-                    std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3);
+                    {
-                    bmp.set_id(hash.c_str());
+                        p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
-                    bitmaps.entries.push_back(std::move(bmp));
+                    }
                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
                }
                else
                {
                    if (first)
                    {
                        first = false;
                    }
                    prompt_tokens.push_back(p.template get<llama_token>());
                }
            }
-
+        }
-            // multimodal
+        else
-            std::string prompt_str = json_prompt.template get<std::string>();
+        {
-            mtmd_input_text inp_txt = {
+            auto s = json_prompt.template get<std::string>();
-                prompt_str.c_str(),
+            prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                /* add_special */   true,
                /* parse_special */ true,
            };
            mtmd::input_chunks chunks(mtmd_input_chunks_init());
            auto bitmaps_c_ptr = bitmaps.c_ptr();
            int32_t tokenized = mtmd_tokenize(mctx,
                                                chunks.ptr.get(),
                                                &inp_txt,
                                                bitmaps_c_ptr.data(),
                                                bitmaps_c_ptr.size());
            if (tokenized != 0) {
                throw std::runtime_error("Failed to tokenize prompt");
            }
            server_tokens tmp(chunks, true);
            inputs.push_back(std::move(tmp));
        } else {
            // non-multimodal version
            auto tokenized_prompts = tokenize_input_prompts(vocab, json_prompt, true, true);
            for (auto & p : tokenized_prompts) {
                auto tmp = server_tokens(p, mctx != nullptr);
                inputs.push_back(std::move(tmp));
            }
        }
-        return inputs;
+        return prompt_tokens;
    }
    llama_client_slot* get_slot(int id) {
@@ -724,8 +712,6 @@ struct llama_server_context
        slot->sparams.grammar_triggers = grammar_triggers;
        slot->sparams.grammar_lazy = grammar_lazy;
        slot->data = data;
        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
            // Might be better to reject the request with a 400 ?
            LOG_WARNING("Max tokens to predict exceeds server configuration", {
@@ -767,7 +753,43 @@ struct llama_server_context
        if (json_value(data, "ignore_eos", false) && has_eos_token) {
                slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
        }
-        
+        /*
        slot->sparams.penalty_prompt_tokens.clear();
        slot->sparams.use_penalty_prompt_tokens = false;
        const auto &penalty_prompt = data.find("penalty_prompt");
        if (penalty_prompt != data.end())
        {
            if (penalty_prompt->is_string())
            {
                const auto penalty_prompt_string = penalty_prompt->get<std::string>();
                auto penalty_tokens = llama_tokenize(model, penalty_prompt_string, false);
                slot->sparams.penalty_prompt_tokens.swap(penalty_tokens);
                if (slot->params.n_predict > 0)
                {
                    slot->sparams.penalty_prompt_tokens.reserve(slot->sparams.penalty_prompt_tokens.size() + slot->params.n_predict);
                }
                slot->sparams.use_penalty_prompt_tokens = true;
            }
            else if (penalty_prompt->is_array())
            {
                const auto n_tokens = penalty_prompt->size();
                slot->sparams.penalty_prompt_tokens.reserve(n_tokens + std::max(0, slot->params.n_predict));
                const int n_vocab = llama_n_vocab(model);
                for (const auto &penalty_token : *penalty_prompt)
                {
                    if (penalty_token.is_number_integer())
                    {
                        const auto tok = penalty_token.get<llama_token>();
                        if (tok >= 0 && tok < n_vocab)
                        {
                            slot->sparams.penalty_prompt_tokens.push_back(tok);
                        }
                    }
                }
                slot->sparams.use_penalty_prompt_tokens = true;
            }
        }
      */
        slot->sparams.logit_bias.clear();
        const auto &logit_bias = data.find("logit_bias");
@@ -843,6 +865,79 @@ struct llama_server_context
        }
        if (multimodal)
        {
            const auto &images_data = data.find("image_data");
            if (images_data != data.end() && images_data->is_array())
            {
                for (const auto &img : *images_data)
                {
                    const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
                    slot_image img_sl;
                    img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
                        LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", 
                             __func__,
                             slot->id,
                             img_sl.id
                        );
                        return false;
                    }
                    LOG_VERBOSE("image loaded", {
                        {"slot_id",   slot->id},
                        {"img_sl_id", img_sl.id}
                    });
                    img_sl.request_encode_image = true;
                    slot->images.push_back(img_sl);
                }
                // process prompt
                // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
                if (slot->images.size() > 0 && !slot->prompt.is_array())
                {
                    std::string prompt = slot->prompt.get<std::string>();
                    size_t pos = 0, begin_prefix = 0;
                    std::string pattern = "[img-";
                    while ((pos = prompt.find(pattern, pos)) != std::string::npos) {
                        size_t end_prefix = pos;
                        pos += pattern.length();
                        size_t end_pos = prompt.find(']', pos);
                        if (end_pos != std::string::npos)
                        {
                            std::string image_id = prompt.substr(pos, end_pos - pos);
                            try
                            {
                                int img_id = std::stoi(image_id);
                                bool found = false;
                                for (slot_image &img : slot->images)
                                {
                                    if (img.id == img_id) {
                                        found = true;
                                        img.prefix_prompt = prompt.substr(begin_prefix, end_prefix - begin_prefix);
                                        begin_prefix = end_pos + 1;
                                        break;
                                    }
                                }
                                if (!found) {
                                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
                                    slot->images.clear();
                                    return false;
                                }
                            } catch (const std::invalid_argument& e) {
                                LOG("Invalid image number id in prompt\n");
                                slot->images.clear();
                                return false;
                            }
                        }
                    }
                    slot->prompt = "";
                    slot->params.input_suffix = prompt.substr(begin_prefix);
                    slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
                }
            }
        }
        if (slot->ctx_sampling != nullptr)
        {
@@ -1090,6 +1185,26 @@ struct llama_server_context
        return slot.has_next_token; // continue
    }
    // Encodes every image of `slot` that is still flagged with request_encode_image.
    // Each pending image is handed to llava_image_embed_make_with_clip_img together
    // with clp_ctx and the configured CPU thread count; that call receives pointers
    // to img.image_embedding and img.image_tokens as its outputs.
    // Returns false as soon as one image fails to encode, and also when the slot
    // holds no images at all; returns true otherwise.
    bool process_images(llama_client_slot &slot) const
    {
        for (slot_image &img : slot.images)
        {
            if (!img.request_encode_image)
            {
                // encoding is not (or no longer) requested for this image
                continue;
            }
            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
                LOG("Error processing the given image");
                return false;
            }
            // clear the flag so the same image is not encoded again on a later call
            img.request_encode_image = false;
        }
        // "true" means the slot has at least one image
        return slot.images.size() > 0;
    }
    void send_error(task_server& task, const std::string &error)
    {
        LOG("task %i - error: %s\n", task.id, error.c_str());
@@ -1332,6 +1447,74 @@ struct llama_server_context
        }
    }
    // Feeds all images of `slot` to the model, interleaved with their text.
    // For every image, in order:
    //   1. the tokens currently queued in `batch` are decoded in chunks of at most n_batch,
    //   2. the image embedding is decoded in chunks of at most n_batch positions,
    //      advancing slot.n_past by the number of positions evaluated,
    //   3. `batch` is cleared and refilled with the tokenized prefix_prompt of the next
    //      image or, after the last image, with slot.params.input_suffix.
    // Returns false if any llama_decode call reports a failure, true otherwise.
    // On return, `batch` holds the tokens queued in step 3 of the last iteration; they
    // are not decoded here.
    bool ingest_images(llama_client_slot &slot, int n_batch)
    {
        int image_idx = 0;
        while (image_idx < (int) slot.images.size())
        {
            slot_image &img = slot.images[image_idx];
            // process prefix prompt
            for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch)
            {
                // window of at most n_batch tokens starting at offset i of `batch`
                const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
                // NOTE(review): the nullptr below presumably fills the embeddings field of
                // llama_batch (token ids are used instead) - confirm against the struct layout
                llama_batch batch_view = {
                    n_tokens,
                    batch.token    + i,
                    nullptr,
                    batch.pos      + i,
                    batch.n_seq_id + i,
                    batch.seq_id   + i,
                    batch.logits   + i,
                };
                if (llama_decode(ctx, batch_view))
                {
                    LOG("%s : failed to eval\n", __func__);
                    return false;
                }
            }
            // process image with llm
            for (int i = 0; i < img.image_tokens; i += n_batch)
            {
                // number of image positions evaluated in this step, capped at n_batch
                int n_eval = img.image_tokens - i;
                if (n_eval > n_batch)
                {
                    n_eval = n_batch;
                }
                const int n_embd = llama_model_n_embd(model);
                // start of the i-th position inside the embedding buffer (n_embd floats per position)
                float * embd = img.image_embedding + i * n_embd;
                llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, slot.n_past, 0);
                if (llama_decode(ctx, llava_batch.batch))
                {
                    LOG("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
            }
            image_idx++;
            common_batch_clear(batch);
            // append prefix of next image
            const auto json_prompt = (image_idx >= (int) slot.images.size()) ?
                slot.params.input_suffix : // no more images, then process suffix prompt
                (json)(slot.images[image_idx].prefix_prompt);
            // tokenized with add_bos = false
            std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
            for (int i = 0; i < (int) append_tokens.size(); ++i)
            {
                // queue each token at position system_tokens.size() + slot.n_past
                common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
                slot.n_past += 1;
            }
        }
        return true;
    }
    void request_cancel(int task_id)
    {
        task_server task;
@@ -1546,7 +1729,7 @@ struct llama_server_context
        {
            for (auto & slot : slots)
            {
-                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
+                const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
                // empty prompt passed -> release the slot and send empty response
                // note: infill mode allows empty prompt
@@ -1563,7 +1746,7 @@ struct llama_server_context
                {
                    slot.state = PROCESSING;
                    slot.command = NONE;
-                    std::vector<server_tokens> prompt_tokens;
+                    std::vector<llama_token> prompt_tokens;
                    slot.t_start_process_prompt = ggml_time_us();
                    slot.t_start_genereration = 0;
@@ -1575,41 +1758,24 @@ struct llama_server_context
                            params.input_suffix.erase(0, 1);
                            suff_rm_leading_spc = false;
                        }
-                        auto prefix_tokens = tokenize(slot.data, slot.params.input_prefix, false);
+                        auto prefix_tokens = tokenize(slot.params.input_prefix, false);
-                        auto suffix_tokens = tokenize(slot.data, slot.params.input_suffix, false);
+                        auto suffix_tokens = tokenize(slot.params.input_suffix, false);
                        const int space_token = 29871; // TODO: this should not be hardcoded
-                        if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0][0] == space_token) {
+                        if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
                            suffix_tokens.erase(suffix_tokens.begin());
                        }
-                        // Create llama_tokens vectors for the special tokens
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_fim_pre(vocab));
-                        llama_tokens fim_pre_tokens;
+                        prefix_tokens.insert(prefix_tokens.begin(), llama_vocab_bos(vocab)); // always add BOS
-                        fim_pre_tokens.push_back(llama_vocab_fim_pre(vocab));
+                        prefix_tokens.insert(prefix_tokens.end(),   llama_vocab_fim_suf(vocab));
-                        llama_tokens bos_tokens;
+                        prefix_tokens.insert(prefix_tokens.end(),   suffix_tokens.begin(), suffix_tokens.end());
-                        bos_tokens.push_back(llama_vocab_bos(vocab));
+                        prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
                        llama_tokens fim_suf_tokens;
                        fim_suf_tokens.push_back(llama_vocab_fim_suf(vocab));
                        llama_tokens fim_mid_tokens;
                        fim_mid_tokens.push_back(llama_vocab_fim_mid(vocab));
                        // Create server_tokens objects
                        server_tokens fim_pre_token(fim_pre_tokens, mctx != nullptr);
                        server_tokens bos_token(bos_tokens, mctx != nullptr);
                        server_tokens fim_suf_token(fim_suf_tokens, mctx != nullptr);
                        server_tokens fim_mid_token(fim_mid_tokens, mctx != nullptr);
                        // Insert tokens in the correct order
                        prefix_tokens.insert(prefix_tokens.begin(), fim_pre_token);
                        prefix_tokens.insert(prefix_tokens.begin(), bos_token); // always add BOS
                        prefix_tokens.insert(prefix_tokens.end(), fim_suf_token);
                        prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
                        prefix_tokens.push_back(fim_mid_token);
                        prompt_tokens = prefix_tokens;
                    }
                    else
                    {
-                        prompt_tokens = tokenize(slot.data, slot.prompt, system_prompt.empty() && add_bos_token);  // add BOS if there isn't system prompt
+                        prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token);  // add BOS if there isn't system prompt
                    }
                    slot.num_prompt_tokens = prompt_tokens.size();
@@ -1637,12 +1803,7 @@ struct llama_server_context
                            {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
                        });
                        slot.truncated = true;
-                        
+                        prompt_tokens = new_tokens;
                        // Convert new_tokens to server_tokens
                        std::vector<server_tokens> new_prompt_tokens;
                        server_tokens new_server_tokens(new_tokens, mctx != nullptr);
                        new_prompt_tokens.push_back(std::move(new_server_tokens));
                        prompt_tokens = std::move(new_prompt_tokens);
                        slot.num_prompt_tokens = prompt_tokens.size();
                        GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
@@ -1662,17 +1823,10 @@ struct llama_server_context
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {
-                            // Convert server_tokens to llama_token for sampling
+                            common_sampler_accept(slot.ctx_sampling, token, false);
                            llama_token tok = token[0];  // Get first token
                            common_sampler_accept(slot.ctx_sampling, tok, false);
                        }
-                        // Convert server_tokens to llama_tokens for comparison
+                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
                        std::vector<llama_token> prompt_llama_tokens;
                        for (const auto &token : prompt_tokens) {
                            prompt_llama_tokens.push_back(token[0]);
                        }
                        slot.n_past = common_part(slot.cache_tokens, prompt_llama_tokens);
                        // the last token of the cache is not in the KV cache until the next call to llama_decode
                        // (it was sampled, pushed into the "cache_tokens", but not yet put in the context)
@@ -1710,12 +1864,7 @@ struct llama_server_context
                        });
                    }
-                    // Convert server_tokens to llama_tokens for cache
+                    slot.cache_tokens = prompt_tokens;
                    std::vector<llama_token> cache_llama_tokens;
                    for (const auto &token : prompt_tokens) {
                        cache_llama_tokens.push_back(token[0]);
                    }
                    slot.cache_tokens = cache_llama_tokens;
                    if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
                    {
@@ -1739,36 +1888,18 @@ struct llama_server_context
                    });
                    llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
                    // process the prefix of first image
                    std::vector<server_tokens> prefix_tokens = prompt_tokens;
                    int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
                    // check if we should process the image
                    if (slot.n_past < slot.n_prompt_tokens
                            && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
                        // process the image
                        int32_t new_n_past;
                        int32_t res = prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
                        int32_t n_pos = new_n_past - slot.n_past;
                        if (res != 0) {
                            slot.release();
                            LOG_ERR("failed to process image, res = %d\n", res);
                            continue;
                        }
                        slot.n_past                    += n_pos;
                       // slot.n_prompt_tokens_processed += n_pos;
                    }
                    LOG_VERBOSE("prompt ingested", {
                                                    {"n_past",  slot.n_past},
                                                    {"cached",  tokens_to_str(ctx, slot.cache_tokens.cbegin(), slot.cache_tokens.cbegin() + slot.n_past)},
                                                    {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
                                                });
                    const bool has_images = process_images(slot);
                    // process the prefix of first image
                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
                    int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
                    int32_t ga_i = slot.ga_i;
                    int32_t ga_n = slot.ga_n;
@@ -1788,6 +1919,19 @@ struct llama_server_context
                        slot_npast++;
                    }
                    if (has_images && !ingest_images(slot, n_batch))
                    {
                        LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d", 
                            __func__,
                            slot.id,
                            slot.task_id
                        );
                        // FIXME @phymbert: to be properly tested
                        //  early returning without changing the slot state will block the slot for ever
                        // no one at the moment is checking the return value
                        return false;
                    }
                    // extract the logits only for the last token
                    if (batch.n_tokens > 0)
                    {
@@ -1974,11 +2118,7 @@ static void append_to_generated_text_from_generated_token_probs(llama_server_con
 }
 std::function<void(int)> shutdown_handler;
-
+inline void signal_handler(int signal) { shutdown_handler(signal); }
 inline void signal_handler(int signal) {
    exit(1);
 }
 /////////////////////////////////
 ////////////////////////////////
@@ -2016,6 +2156,26 @@ static void start_llama_server() {
 json parse_options(bool streaming, const backend::PredictOptions* predict, llama_server_context &llama)
 {
    // This is for example a slot data from the json data
    //     slot->params.stream           = json_value(data, "stream",            false);
    //     slot->params.cache_prompt     = json_value(data, "cache_prompt",      false);
    //     slot->params.n_predict        = json_value(data, "n_predict",         default_params.n_predict);
    //     slot->sparams.top_k           = json_value(data, "top_k",             default_sparams.top_k);
    //     slot->sparams.top_p           = json_value(data, "top_p",             default_sparams.top_p);
    //     slot->sparams.typical_p       = json_value(data, "typical_p",         default_sparams.typical_p);
    //     slot->sparams.temp            = json_value(data, "temperature",       default_sparams.temp);
    //     slot->sparams.penalty_last_n  = json_value(data, "repeat_last_n",     default_sparams.penalty_last_n);
    //     slot->sparams.penalty_repeat  = json_value(data, "repeat_penalty",    default_sparams.penalty_repeat);
    //     slot->sparams.penalty_freq    = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
    //     slot->sparams.penalty_present = json_value(data, "presence_penalty",  default_sparams.penalty_present);
    //     slot->sparams.mirostat        = json_value(data, "mirostat",          default_sparams.mirostat);
    //     slot->sparams.mirostat_tau    = json_value(data, "mirostat_tau",      default_sparams.mirostat_tau);
    //     slot->sparams.mirostat_eta    = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
    //     slot->params.n_keep           = json_value(data, "n_keep",            slot->params.n_keep);
    //     slot->params.seed             = json_value(data, "seed",              default_params.seed);
    //     slot->sparams.grammar         = json_value(data, "grammar",           default_sparams.grammar);
    //     slot->sparams.n_probs         = json_value(data, "n_probs",           default_sparams.n_probs);
    // Create now a json data from the prediction options instead
    //
    json data;
@@ -2060,6 +2220,69 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    return data;
 }
 // static void parse_options_completion(bool streaming,const backend::PredictOptions* predict, llama_server_context &llama)
 // {
 //     // https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L673
 //     gpt_params default_params;
 //     llama.stream = streaming;
 //     llama.params.n_predict = predict->tokens() == 0 ? -1 : predict->tokens();
 //     llama.params.sparams.top_k = predict->topk();
 //     llama.params.sparams.top_p = predict->topp();
 //     llama.params.sparams.typical_p = predict->typicalp();
 //     llama.params.sparams.penalty_last_n = predict->repeat();
 //     llama.params.sparams.temp = predict->temperature();
 //     llama.params.sparams.penalty_repeat = predict->penalty();
 //     llama.params.sparams.penalty_present = predict->presencepenalty();
 //     llama.params.sparams.penalty_freq = predict->frequencypenalty();
 //     llama.params.sparams.mirostat = predict->mirostat();
 //     llama.params.sparams.mirostat_tau = predict->mirostattau();
 //     llama.params.sparams.mirostat_eta = predict->mirostateta();
 //     llama.params.n_keep = predict->nkeep();
 //     llama.params.seed = predict->seed();
 //     llama.params.sparams.grammar = predict->grammar();
 //     // llama.params.n_probs = predict->
 //     llama.params.prompt = predict->prompt();
 //     llama.params.sparams.logit_bias.clear();
 //     if (predict->ignoreeos())
 //     {
 //         llama.params.sparams.logit_bias[llama_token_eos(llama.model)] = -INFINITY;
 //     }
 //     // const auto &logit_bias = body.find("logit_bias");
 //     // if (logit_bias != body.end() && logit_bias->is_array())
 //     // {
 //     //     const int n_vocab = llama_n_vocab(llama.model);
 //     //     for (const auto &el : *logit_bias)
 //     //     {
 //     //         if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
 //     //         {
 //     //             llama_token tok = el[0].get<llama_token>();
 //     //             if (tok >= 0 && tok < n_vocab)
 //     //             {
 //     //                 if (el[1].is_number())
 //     //                 {
 //     //                     llama.params.logit_bias[tok] = el[1].get<float>();
 //     //                 }
 //     //                 else if (el[1].is_boolean() && !el[1].get<bool>())
 //     //                 {
 //     //                     llama.params.logit_bias[tok] = -INFINITY;
 //     //                 }
 //     //             }
 //     //         }
 //     //     }
 //     // }
 //     llama.params.antiprompt.clear();
 //     for (const std::string& stopPrompt : predict->stopprompts()) {
 //     if (!stopPrompt.empty())
 //             {
 //                 llama.params.antiprompt.push_back(stopPrompt);
 //             }
 //     }
 // }
 const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
@@ -2091,15 +2314,15 @@ static std::string get_all_kv_cache_types() {
 }
 static void params_parse(const backend::ModelOptions* request,
-                                common_params & params, llama_server_context &llama) {
+                                common_params & params) {
    // this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809
-    params.model.path = request->modelfile();
+    params.model = request->modelfile();
    if (!request->mmproj().empty()) {
    // get the directory of modelfile
-      std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
+      std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
-      params.mmproj.path = model_dir + "/"+ request->mmproj();
+      params.mmproj = model_dir + "/"+ request->mmproj();
    }
    //  params.model_alias ??
    params.model_alias =  request->modelfile();
@@ -2129,20 +2352,6 @@ static void params_parse(const backend::ModelOptions* request,
        add_rpc_devices(std::string(llama_grpc_servers));
    }
     // decode options. Options are in the form optname:optvalue, or just optname for booleans.
    for (int i = 0; i < request->options_size(); i++) {
        std::string opt = request->options(i);
        char *optname = strtok(&opt[0], ":");
        char *optval = strtok(NULL, ":");
        if (optval == NULL) {
            optval = "true";
        }
        if (!strcmp(optname, "gpu")) {
            llama.has_gpu = true;
        }
    }
    // TODO: Add yarn
    if (!request->tensorsplit().empty()) {
@@ -2174,7 +2383,7 @@ static void params_parse(const backend::ModelOptions* request,
        scale_factor = request->lorascale();
     }
     // get the directory of modelfile
-     std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
+     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
     params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
    }
    params.use_mlock = request->mlock();
@@ -2236,7 +2445,7 @@ public:
  grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
    // Implement LoadModel RPC
    common_params params;
-    params_parse(request, params, llama);
+    params_parse(request, params);
    llama_backend_init();
    llama_numa_init(params.numa);
@@ -2372,10 +2581,10 @@ public:
    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
         json data = parse_options(false, request, llama);
-         std::vector<server_tokens> tokens = llama.tokenize(data, data["prompt"],false);
+         std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
         for (int i=0 ; i< tokens.size(); i++){
-            response->add_tokens(tokens[i].llama_token);
+            response->add_tokens(tokens[i]);
         }
        return grpc::Status::OK;
@@ -2413,9 +2622,7 @@ void RunServer(const std::string& server_address) {
  ServerBuilder builder;
  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
  builder.RegisterService(&service);
-  builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
+
  builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
  builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
  std::unique_ptr<Server> server(builder.BuildAndStart());
  std::cout << "Server listening on " << server_address << std::endl;
  server->Wait();
@@ -2424,20 +2631,6 @@ void RunServer(const std::string& server_address) {
 int main(int argc, char** argv) {
  std::string server_address("localhost:50051");
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;
    sigint_action.sa_handler = signal_handler;
    sigemptyset (&sigint_action.sa_mask);
    sigint_action.sa_flags = 0;
    sigaction(SIGINT, &sigint_action, NULL);
    sigaction(SIGTERM, &sigint_action, NULL);
 #elif defined (_WIN32)
    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
    };
    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
  // Define long and short options
  struct option long_options[] = {
      {"addr", required_argument, nullptr, 'a'},
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -1,13 +1,13 @@
-diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
+diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 3cd0d2fa..6c5e811a 100644
+index 7f892beb..0517e529 100644
--- a/tools/mtmd/clip.cpp
+--- a/examples/llava/clip.cpp
-+++ b/tools/mtmd/clip.cpp
+++ b/examples/llava/clip.cpp
-@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
+@@ -2766,7 +2766,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
-                 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+                 int patch_offset = ctx->has_class_embedding ? 1 : 0;
                 int* patches_data = (int*)malloc(ggml_nbytes(patches));
                 for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + 1;
+-                    patches_data[i] = i + patch_offset;
-+                    patches_data[i] = i;
+                    patches_data[i] = i + 1;
                 }
                 ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
                 free(patches_data);
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 set -e
 ## Patches
 ## Apply patches from the `patches` directory
 for patch in $(ls patches); do
@@ -7,22 +9,21 @@ for patch in $(ls patches); do
    patch -d llama.cpp/ -p1 < patches/$patch
 done 
-cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
+cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
-cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
+cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
-cp -rfv json.hpp llama.cpp/tools/grpc-server/
+cp -rfv json.hpp llama.cpp/examples/grpc-server/
-cp -rfv utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv utils.hpp llama.cpp/examples/grpc-server/
-if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
+if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
    echo "grpc-server already added"
 else
-    echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
+    echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
 fi
 ## XXX: In some versions of CMake clip wasn't being built before llama.
 ## This is a hack for now, but it should be fixed in the future.
-# cp -rfv llama.cpp/tools/mtmd/clip.h llama.cpp/tools/grpc-server/clip.h
+cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
-# cp -rfv llama.cpp/tools/mtmd/clip-impl.h llama.cpp/tools/grpc-server/clip-impl.h
+cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
-# cp -rfv llama.cpp/tools/mtmd/llava.cpp llama.cpp/tools/grpc-server/llava.cpp
+echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
-# echo '#include "llama.h"' > llama.cpp/tools/grpc-server/llava.h
+cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
-# cat llama.cpp/tools/mtmd/llava.h >> llama.cpp/tools/grpc-server/llava.h
+cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
 # cp -rfv llama.cpp/tools/mtmd/clip.cpp llama.cpp/tools/grpc-server/clip.cpp
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -1,4 +1,4 @@
-// https://github.com/ggerganov/llama.cpp/blob/master/tools/server/utils.hpp
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
 #pragma once
@@ -11,7 +11,7 @@
 #include "json.hpp"
-#include "../mtmd/clip.h"
+#include "../llava/clip.h"
 using json = nlohmann::json;
@@ -480,431 +480,4 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
    }
    return ret;
 }
 //
 // tokenizer and input processing utils
 //
 // Returns true when `data` is a JSON array whose elements are all integers.
 // A non-array value yields false.
 static bool json_is_array_of_numbers(const json & data) {
    if (!data.is_array()) {
        return false;
    }
    for (const auto & elem : data) {
        // a single non-integer element disqualifies the whole array
        if (!elem.is_number_integer()) {
            return false;
        }
    }
    return true;
 }
 // Returns true when `data` is a JSON array that contains at least one string
 // AND at least one integer; false for anything else (including non-arrays).
 static bool json_is_array_of_mixed_numbers_strings(const json & data) {
    if (!data.is_array()) {
        return false;
    }
    bool has_text = false;
    bool has_int  = false;
    for (const auto & elem : data) {
        if (elem.is_string()) {
            has_text = true;
        }
        if (elem.is_number_integer()) {
            has_int = true;
        }
        // stop scanning as soon as both kinds have been observed
        if (has_text && has_int) {
            return true;
        }
    }
    return false;
 }
 // Get values by path ("key1/key2").
 //
 // paths: list of '/'-separated key paths to look up in `js`.
 // js:    document to read from; it is not modified.
 //
 // Returns a JSON object that maps every path that could be fully resolved to the
 // value found at that path. Paths that hit a missing key or a non-object on the
 // way are silently omitted.
 static json json_get_nested_values(const std::vector<std::string> & paths, const json & js) {
    json result = json::object();
    for (const std::string & path : paths) {
        // walk by pointer so that intermediate sub-documents are never deep-copied
        const json * node = &js;
        const auto keys = string_split<std::string>(path, /*separator*/ '/');
        for (const std::string & k : keys) {
            if (!node->is_object()) {
                node = nullptr;
                break;
            }
            // single lookup instead of contains() followed by operator[]
            const auto it = node->find(k);
            if (it == node->end()) {
                node = nullptr;
                break;
            }
            node = &(*it);
        }
        if (node != nullptr) {
            // only the final value is copied into the result
            result[path] = *node;
        }
    }
    return result;
 }
 /**
 * Tokenize a single prompt. This handles 2 cases:
 * - only string, example: "string"
 * - mixed string and tokens, example: [12, 34, "string", 56, 78]
 *
 * vocab:         vocabulary handed to common_tokenize
 * json_prompt:   a JSON string, or a JSON array of strings and token ids
 * add_special:   forwarded to common_tokenize for a plain string prompt; for an array
 *                it is only honoured when the very first element is a string
 * parse_special: forwarded to common_tokenize for every string piece
 *
 * Returns the concatenated token list, in input order.
 */
 static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
    // plain string prompt: tokenize it in one go
    if (!json_prompt.is_array()) {
        const auto text = json_prompt.template get<std::string>();
        return common_tokenize(vocab, text, add_special, parse_special);
    }
    llama_tokens prompt_tokens;
    bool first = true;
    for (const auto & piece : json_prompt) {
        if (piece.is_string()) {
            const auto text = piece.template get<std::string>();
            // `add_special` applies only if this string is the first element of the array
            const llama_tokens piece_tokens = common_tokenize(vocab, text, first && add_special, parse_special);
            prompt_tokens.insert(prompt_tokens.end(), piece_tokens.begin(), piece_tokens.end());
        } else {
            // raw token id, taken as-is
            prompt_tokens.push_back(piece.template get<llama_token>());
        }
        first = false;
    }
    return prompt_tokens;
 }
 /**
 * Split the incoming "prompt" value into one or more prompts and tokenize each of them.
 * A single prompt may be given as:
 * - "prompt": "string"
 * - "prompt": [12, 34, 56]
 * - "prompt": [12, 34, "string", 56, 78]
 * Several prompts (multi-tasks) may be given as an array of the above:
 * - "prompt": ["string1", "string2"]
 * - "prompt": ["string1", [12, 34, 56]]
 * - "prompt": [[12, 34, 56], [78, 90, 12]]
 * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
 *
 * Returns one token list per prompt. Throws std::runtime_error when the value
 * (or one of its elements) has an unsupported shape, or when no prompt was produced.
 */
 static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
    std::vector<llama_tokens> result;
    const bool single_text_or_mixed = json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt);
    if (single_text_or_mixed) {
        // one prompt: plain string, or strings interleaved with token ids
        result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
    } else if (json_is_array_of_numbers(json_prompt)) {
        // one prompt: already tokenized
        result.push_back(json_prompt.get<llama_tokens>());
    } else if (!json_prompt.is_array()) {
        throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts");
    } else {
        // several prompts: every element is handled like a single prompt
        result.reserve(json_prompt.size());
        for (const auto & entry : json_prompt) {
            const bool entry_text_or_mixed = entry.is_string() || json_is_array_of_mixed_numbers_strings(entry);
            if (entry_text_or_mixed) {
                result.push_back(tokenize_mixed(vocab, entry, add_special, parse_special));
                continue;
            }
            if (!json_is_array_of_numbers(entry)) {
                throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens");
            }
            // array of tokens
            result.push_back(entry.get<llama_tokens>());
        }
    }
    if (result.empty()) {
        throw std::runtime_error("\"prompt\" must not be empty");
    }
    return result;
 }
 //
 // utils for interacting with libmtmd
 // (may need to refactor in near future)
 //
 /**
 * server_tokens is a helper to manage the input tokens and image for the server.
 * it is made this way to simplify the logic of KV cache management.
 */
 struct server_tokens {
    bool has_mtmd = false;
 private: // disallow accessing these members directly, risking out-of-sync
    // map a **start** position in tokens to the image chunk
    std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image;
    // list of tokens
    // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token
    // a mtmd_input_chunk can occupy multiple tokens, one llama_token per **position**
    // important: for models using mrope, an image can contain multiple tokens but will use only one **position**
    llama_tokens tokens;
    // for ex. with input of 5 text tokens and 2 images:
    //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
    // pos  0   1   2   3   4   5      6      7      8      9
    // map_pos_to_image will contain: {5, img0}, {8, img1}
 public:
    server_tokens() = default;
    ~server_tokens() = default;
    // Prevent copying
    server_tokens(const server_tokens&) = delete;
    server_tokens& operator=(const server_tokens&) = delete;
    // Allow moving (usually implicitly generated if members are movable)
    server_tokens(server_tokens&&) = default;
    server_tokens& operator=(server_tokens&&) = default;
    // Allow accessing elements using [] operator
    llama_token operator[](size_t index) { return tokens[index]; }
    const llama_token& operator[](size_t index) const { return tokens[index]; }
    server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) : has_mtmd(has_mtmd) {
        for (size_t i = 0; i < mtmd_chunks.size(); ++i) {
            push_back(mtmd_chunks[i]);
        }
    }
    server_tokens(llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {}
    // for debugging
    std::string str() const {
        std::ostringstream oss;
        oss << "tokens: ";
        for (const auto & t : tokens) {
            if (t == LLAMA_TOKEN_NULL) {
                oss << "<embd> ";
            } else {
                oss << t << " ";
            }
        }
        oss << "\n";
        oss << "image pos: ";
        for (const auto & it : map_pos_to_image) {
            oss << it.first << ", ";
        }
        return oss.str();
    }
    const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const {
        auto it = map_pos_to_image.find(pos);
        if (it != map_pos_to_image.end()) {
            return it->second;
        } else {
            throw std::runtime_error("Chunk not found");
        }
    }
    void push_back(llama_token tok) {
        if (tok == LLAMA_TOKEN_NULL) {
            throw std::runtime_error("Invalid token");
        }
        tokens.emplace_back(tok);
    }
    // Append one multimodal chunk.
    //  - image chunk: asserts has_mtmd, appends n_pos LLAMA_TOKEN_NULL placeholders and
    //    stores a copy of the chunk in map_pos_to_image, keyed on the index of the
    //    first placeholder
    //  - text chunk: appends each token through push_back(llama_token)
    //  - any other type aborts
    // will create a copy of the chunk if it contains non-text data
    void push_back(const mtmd_input_chunk * chunk) {
        auto type = mtmd_input_chunk_get_type(chunk);
        if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
            GGML_ASSERT(has_mtmd);
            auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk);
            const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
            // position of the first placeholder; must be captured before appending
            llama_pos start_pos = tokens.size();
            for (int i = 0; i < n_pos; ++i) {
                tokens.emplace_back(LLAMA_TOKEN_NULL);
            }
            // keep our own copy so the stored chunk does not depend on the caller's object
            mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk));
            map_pos_to_image[start_pos] = std::move(new_chunk);
        } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
            size_t n_tokens; // filled in by mtmd_input_chunk_get_tokens_text()
            auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens);
            for (size_t i = 0; i < n_tokens; ++i) {
                push_back(text_tokens[i]);
            }
        } else {
            GGML_ABORT("Invalid chunk type");
        }
    }
    // for compatibility with context shift and prompt truncation
    // Appends all of `inp_tokens` to the end of the sequence (despite the name, nothing
    // is inserted in the middle). Asserts that multimodal support is disabled.
    void insert(const llama_tokens & inp_tokens) {
        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
        tokens.insert(tokens.end(), inp_tokens.begin(), inp_tokens.end());
    }
    // for compatibility with speculative decoding, ctx shift, slot save/load
    // Returns a const reference to the underlying token list. Asserts that multimodal
    // support is disabled, so the list handed out never contains image placeholders
    // added through the chunk path.
    const llama_tokens & get_text_tokens() const {
        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
        return tokens;
    }
    // for compatibility with speculative decoding
    // Overwrites the token stored at index `pos` with `id`. Asserts that multimodal
    // support is disabled; `pos` itself is not range-checked here.
    void set_token(llama_pos pos, llama_token id) {
        GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled
        tokens[pos] = id;
    }
    // Number of stored entries; image placeholders (LLAMA_TOKEN_NULL) count as entries.
    size_t size() const {
        return tokens.size();
    }
    // True when no token or image placeholder is stored.
    bool empty() const {
        return tokens.empty();
    }
    void clear() {
        tokens.clear();
    }
    // Shrink the sequence to its first `n` entries (asserts n <= current size).
    // With mtmd enabled this additionally:
    //  - calls find_chunk(n - 1) when the last kept entry is an image placeholder, which
    //    throws std::runtime_error unless a chunk is keyed on exactly that index
    //  - erases every image chunk whose start position is >= n
    // NOTE(review): as written the check only passes when index n - 1 is the first
    // position of an image chunk; confirm this is the intended meaning of the
    // allowed/disallowed markers in the diagram below.
    void resize(size_t n) {
        GGML_ASSERT(n <= tokens.size());
        if (has_mtmd) {
            // we throw an error if we try to remove a token in the middle of an image
            // for ex. with input of 5 text tokens and 2 images:
            //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1]
            // n  1   2   3   4   5   6      7      8      9      10
            // allowed to resize      ^                    ^
            // disallowed to resize          ^      ^             ^
            if (n > 0) {
                llama_token last_token = tokens[n - 1];
                // make sure we never remove tokens in the middle of an image
                if (last_token == LLAMA_TOKEN_NULL) {
                    find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
                }
            }
            // remove all image chunks that are not used anymore
            for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) {
                llama_pos pos = it->first;
                if (pos >= (llama_pos)n) {
                    // erase() returns the next valid iterator, so do not increment here
                    it = map_pos_to_image.erase(it);
                } else {
                    ++it;
                }
            }
        }
        tokens.resize(n);
    }
    std::string detokenize(const llama_context * ctx, bool special) const {
        llama_tokens text_tokens;
        text_tokens.reserve(tokens.size());
        for (const auto & t : tokens) {
            if (t != LLAMA_TOKEN_NULL) {
                text_tokens.push_back(t);
            }
        }
        return common_detokenize(ctx, text_tokens, special);
    }
    // Length of the longest common prefix of this sequence and `b`.
    // Text tokens are compared by value. When both sides hold an image placeholder at
    // the same index, the two image chunks are compared by id and number of positions:
    // on a match the whole image is skipped in one step, otherwise the prefix ends there.
    // find_chunk() throws if either side has no chunk keyed on that index.
    size_t get_common_prefix(const server_tokens & b) const {
        size_t max_idx = std::min(tokens.size(), b.tokens.size());
        for (size_t i = 0; i < max_idx; ++i) {
            auto & ai =   tokens[i];
            auto & bi = b.tokens[i];
            if (ai == LLAMA_TOKEN_NULL && bi == LLAMA_TOKEN_NULL) {
                GGML_ASSERT(has_mtmd);
                const auto & a_chunk =   find_chunk(i);
                const auto & b_chunk = b.find_chunk(i);
                GGML_ASSERT(a_chunk && b_chunk);
                const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get());
                const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get());
                std::string ai_id  = mtmd_image_tokens_get_id(a_img);
                std::string bi_id  = mtmd_image_tokens_get_id(b_img);
                size_t a_pos       = mtmd_image_tokens_get_n_pos(a_img);
                size_t b_pos       = mtmd_image_tokens_get_n_pos(b_img);
                if (ai_id == bi_id && a_pos == b_pos) {
                    // a_pos == 0 would make the subtraction below wrap around
                    GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen
                    i += a_pos - 1; // will be +1 by the for loop
                    continue;
                } else {
                    return i;
                }
            } else if (ai == bi) {
                continue;
            } else {
                // covers differing text tokens as well as placeholder-vs-text mismatches
                return i;
            }
        }
        return max_idx; // all tokens are equal
    }
    // make sure all text tokens are within the vocab range
    // Returns false when:
    //  - a text token is negative or >= the vocabulary size of the model behind `ctx`
    //  - an image placeholder has no chunk registered at its position
    //  - a registered image chunk reports zero positions
    // Returns true otherwise. Never throws: lookup failures are mapped to `false`.
    bool validate(const struct llama_context * ctx) const {
        const llama_model * model = llama_get_model(ctx);
        const llama_vocab * vocab = llama_model_get_vocab(model);
        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
        for (size_t i = 0; i < tokens.size(); ++i) {
            auto & t = tokens[i];
            if (t == LLAMA_TOKEN_NULL) {
                try {
                    const auto & chunk = find_chunk(i);
                    const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get());
                    size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens);
                    if (n_pos == 0) {
                        // `i += n_pos - 1` would wrap around and, together with the loop's
                        // ++i, leave `i` unchanged forever; treat it as invalid input
                        // (get_common_prefix() asserts the same invariant)
                        return false;
                    }
                    i += n_pos - 1; // will be +1 by the for loop
                } catch (const std::exception &) {
                    // find_chunk() throws when no chunk starts at this placeholder
                    return false;
                }
            } else if (t < 0 || t >= n_vocab) {
                return false;
            }
        }
        return true;
    }
    // encode and decode the image chunk
    // Looks up the chunk keyed on `n_past` and evaluates it with
    // mtmd_helper_eval_chunk_single(), using llama_n_batch(ctx) as the batch size.
    // Returns 0 on success and writes the position reported by the helper to `n_pos_out`;
    // on failure logs the status, sets `n_pos_out` back to `n_past` and returns the
    // helper's status. Throws std::runtime_error if no chunk is keyed on `n_past`.
    int32_t process_chunk(
                llama_context * ctx,
                mtmd_context * mctx,
                llama_pos n_past,
                int32_t seq_id,
                llama_pos & n_pos_out) {
        auto it = map_pos_to_image.find(n_past);
        if (it == map_pos_to_image.end()) {
            throw std::runtime_error("Chunk not found");
        }
     //   SRV_INF("%s\n", "processing image...");
        int32_t n_batch = llama_n_batch(ctx);
        // t0 is only consumed by the disabled timing log further down
        int64_t t0 = ggml_time_ms();
        llama_pos new_n_past = n_past;
        int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx,
            it->second.get(), // chunk
            n_past,
            seq_id,
            n_batch,
            true, // logits last
            &new_n_past);
        //SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0);
        if (result != 0) {
            LOG_ERR("mtmd_helper_eval failed with status %d", result);
            n_pos_out = n_past;
            return result;
        }
        n_pos_out = new_n_past;
        return 0;
    }
 };
 // 64-bit FNV-1a over `len` bytes starting at `data`, returned as a decimal string.
 static std::string fnv_hash(const uint8_t * data, size_t len) {
    uint64_t digest = 0xcbf29ce484222325ULL; // FNV offset basis
    const uint8_t * const end = data + len;
    for (const uint8_t * p = data; p != end; ++p) {
        // xor the byte in first, then multiply by the FNV prime
        digest = (digest ^ *p) * 0x100000001b3ULL;
    }
    return std::to_string(digest);
 }
--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -8,19 +8,12 @@ ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 # keep standard at C11 and C++17 (CXXFLAGS below passes -std=c++17)
 CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
 GOCMD?=go
 CGO_LDFLAGS?=
 # Avoid parent make file overwriting CGO_LDFLAGS which is needed for hipblas
 CGO_LDFLAGS_SYCL=
 GO_TAGS?=
 LD_FLAGS?=
 # Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DSD_CUDA=ON
+	CMAKE_ARGS+=-DGGML_CUDA=ON
 # If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
@@ -30,48 +23,29 @@ else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DSD_HIPBLAS=ON
+	CMAKE_ARGS+=-DGGML_HIP=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DSD_METAL=OFF
+		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
-		CMAKE_ARGS+=-DSD_METAL=ON
+		CMAKE_ARGS+=-DGGML_METAL=ON
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
 		TARGET+=--target ggml-metal
 	endif
 endif
-ifeq ($(BUILD_TYPE),sycl_f16)
+# ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
+# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
-		-DCMAKE_C_COMPILER=icx \
+# endif
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DSD_SYCL=ON \
 		-DGGML_SYCL_F16=ON
 	CC=icx
 	CXX=icpx
 	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
 	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
 	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
 	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
 endif
-ifeq ($(BUILD_TYPE),sycl_f32)
+# ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON \
+# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
-		-DCMAKE_C_COMPILER=icx \
+# endif
 		-DCMAKE_CXX_COMPILER=icpx \
 		-DSD_SYCL=ON
 	CC=icx
 	CXX=icpx
 	CGO_LDFLAGS_SYCL += -fsycl -L${DNNLROOT}/lib -ldnnl ${MKLROOT}/lib/intel64/libmkl_sycl.a -fiopenmp -fopenmp-targets=spir64 -lOpenCL
 	CGO_LDFLAGS_SYCL += $(shell pkg-config --libs mkl-static-lp64-gomp)
 	CGO_CXXFLAGS += -fiopenmp -fopenmp-targets=spir64
 	CGO_CXXFLAGS += $(shell pkg-config --cflags mkl-static-lp64-gomp )
 endif
 # warnings
-# CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
 # Find all .a archives in ARCHIVE_DIR
 # (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
@@ -112,24 +86,11 @@ endif
 	$(MAKE) $(COMBINED_LIB)
 gosd.o:
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	+bash -c "source $(ONEAPI_VARS); \
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c"
 else
 	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
 endif
 libsd.a: gosd.o
 	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
 	$(AR) rcs libsd.a gosd.o
 stablediffusion-ggml:
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_SYCL)" C_INCLUDE_PATH="$(INCLUDE_PATH)" LIBRARY_PATH="$(LIBRARY_PATH)" \
 	CC="$(CC)" CXX="$(CXX)" CGO_CXXFLAGS="$(CGO_CXXFLAGS)" \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o ../../../../backend-assets/grpc/stablediffusion-ggml ./
 ifneq ($(UPX),)
 	$(UPX) ../../../../backend-assets/grpc/stablediffusion-ggml
 endif
 clean:
 	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
--- a/backend/go/transcribe/whisper/whisper.go
+++ b/backend/go/transcribe/whisper/whisper.go
@@ -74,7 +74,7 @@ func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.Transcript
 		context.SetTranslate(true)
 	}
-	if err := context.Process(data, nil, nil, nil); err != nil {
+	if err := context.Process(data, nil, nil); err != nil {
 		return pb.TranscriptResult{}, err
 	}
--- a/backend/python/autogptq/Makefile
+++ b/backend/python/autogptq/Makefile
@@ -0,0 +1,17 @@
 .PHONY: autogptq
 autogptq: protogen
 	bash install.sh
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/autogptq/README.md
+++ b/backend/python/autogptq/README.md
@@ -0,0 +1,5 @@
 # Creating a separate environment for the autogptq project
 ```
 make autogptq
 ```
--- a/backend/python/autogptq/backend.py
+++ b/backend/python/autogptq/backend.py
@@ -0,0 +1,153 @@
 #!/usr/bin/env python3
 from concurrent import futures
 import argparse
 import signal
 import sys
 import os
 import time
 import base64
 import grpc
 import backend_pb2
 import backend_pb2_grpc
 from auto_gptq import AutoGPTQForCausalLM
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers import TextGenerationPipeline
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 # If MAX_WORKERS are specified in the environment use it, otherwise default to 1
 MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """gRPC servicer that loads a GPTQ-quantized (or Qwen-VL) model and serves text generation."""

    def Health(self, request, context):
        """Report liveness by replying with the bytes ``OK``."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """Load the tokenizer and model for ``request.Model`` and keep them on the servicer.

        Returns a ``Result`` with ``success=True`` on success, or ``success=False`` and
        the error text when loading raises.
        """
        try:
            device = "cuda:0"
            if request.Device != "":
                device = request.Device

            # Record a model name for every model: recompile_vl_prompt() reads it on each
            # request, and it used to be set on the Qwen-VL path only.
            self.model_name = request.Model

            # support loading local model files
            model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)

            # support model `Qwen/Qwen-VL-Chat-Int4`
            if "qwen-vl" in request.Model.lower():
                self.model_name = "Qwen-VL-Chat"
                model = AutoModelForCausalLM.from_pretrained(model_path,
                    trust_remote_code=request.TrustRemoteCode,
                    device_map="auto").eval()
            else:
                model = AutoGPTQForCausalLM.from_quantized(model_path,
                    model_basename=request.ModelBaseName,
                    use_safetensors=True,
                    trust_remote_code=request.TrustRemoteCode,
                    device=device,
                    use_triton=request.UseTriton,
                    quantize_config=None)
            self.model = model
            self.tokenizer = tokenizer
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def Predict(self, request, context):
        """Generate a completion for ``request.Prompt`` and return it without the prompt echo.

        Zero-valued ``Penalty``, ``Tokens`` and ``TopP`` fall back to 1.0, 512 and 0.95.
        Temporary image files written for the prompt are removed even if generation fails.
        """
        penalty = 1.0
        if request.Penalty != 0.0:
            penalty = request.Penalty
        tokens = 512
        if request.Tokens != 0:
            tokens = request.Tokens
        top_p = 0.95
        if request.TopP != 0.0:
            top_p = request.TopP

        compiled_prompt, image_paths = self.recompile_vl_prompt(request)
        try:
            print(f"Prompt: {compiled_prompt}", file=sys.stderr)
            pipeline = TextGenerationPipeline(
                model=self.model,
                tokenizer=self.tokenizer,
                max_new_tokens=tokens,
                temperature=request.Temperature,
                top_p=top_p,
                repetition_penalty=penalty,
                )
            t = pipeline(compiled_prompt)[0]["generated_text"]
            print(f"generated_text: {t}", file=sys.stderr)

            if compiled_prompt in t:
                t = t.replace(compiled_prompt, "")
        finally:
            # house keeping. Remove the image files from /tmp folder
            self._remove_files(image_paths)

        # NOTE(review): Health replies with Reply while this returns Result; confirm
        # which message type the Predict RPC declares in backend.proto.
        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))

    def PredictStream(self, request, context):
        """Streaming variant of Predict: yields the whole completion as a single message.

        True token streaming is not implemented yet. The result is yielded rather than
        returned because a response-streaming handler must produce an iterator.
        """
        yield self.Predict(request, context)

    def recompile_vl_prompt(self, request):
        """Return ``(prompt, image_paths)`` for the request.

        For Qwen-VL models each base64 image in ``request.Images`` is decoded into a
        uniquely named file under /tmp and the matching ``[img-N]`` marker in the prompt
        is replaced by ``<img>path</img>,``. For every other model the prompt is returned
        unchanged with an empty path list.
        """
        import tempfile  # local import: only needed on the Qwen-VL path

        prompt = request.Prompt
        image_paths = []
        if "qwen-vl" not in getattr(self, "model_name", "").lower():
            return (prompt, image_paths)

        try:
            for i, img in enumerate(request.Images):
                # mkstemp guarantees a unique name; millisecond timestamps could collide
                # when several images are written within the same millisecond.
                fd, img_path = tempfile.mkstemp(prefix="vl-", suffix=".jpg", dir="/tmp")
                image_paths.append(img_path)
                with os.fdopen(fd, "wb") as f:
                    f.write(base64.b64decode(img))
                prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
        except Exception:
            # do not leave partially written images behind
            self._remove_files(image_paths)
            raise
        return (prompt, image_paths)

    @staticmethod
    def _remove_files(paths):
        """Best-effort removal of temporary image files; failures are logged, not raised."""
        for img_path in paths:
            try:
                os.remove(img_path)
            except Exception as e:
                print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
 def serve(address):
    """Start the gRPC backend on ``address`` and block until the process is signalled."""
    worker_pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    grpc_server = grpc.server(worker_pool)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), grpc_server)
    grpc_server.add_insecure_port(address)
    grpc_server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    def signal_handler(sig, frame):
        """Stop the server immediately and exit the process."""
        print("Received termination signal. Shutting down...")
        grpc_server.stop(0)
        sys.exit(0)

    # Same handler for SIGINT and SIGTERM
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, signal_handler)

    # Keep the main thread alive; the server itself runs on the worker pool.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        grpc_server.stop(0)
 # Script entry point: parse --addr (default "localhost:50051") and serve on it.
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()
    # serve() only returns once the server has been stopped
    serve(args.addr)
--- a/backend/python/autogptq/install.sh
+++ b/backend/python/autogptq/install.sh
@@ -0,0 +1,14 @@
 #!/bin/bash
 # Install the Python requirements of the autogptq backend.
 set -e
 # Quoted so the script also works when its path contains spaces.
 source "$(dirname "$0")/../common/libbackend.sh"
 # This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
 # This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
 # We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
 # the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
 if [ "x${BUILD_PROFILE}" == "xintel" ]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 installRequirements
--- a/backend/python/autogptq/requirements-cublas11.txt
+++ b/backend/python/autogptq/requirements-cublas11.txt
@@ -0,0 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch==2.4.1+cu118
--- a/backend/python/autogptq/requirements-cublas12.txt
+++ b/backend/python/autogptq/requirements-cublas12.txt
@@ -0,0 +1 @@
 torch==2.4.1
--- a/backend/python/autogptq/requirements-hipblas.txt
+++ b/backend/python/autogptq/requirements-hipblas.txt
@@ -0,0 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch==2.4.1+rocm6.0
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -0,0 +1,6 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch==2.3.110+xpu
 torch==2.3.1+cxx11.abi
 oneccl_bind_pt==2.3.100+xpu
 optimum[openvino]
 setuptools
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -0,0 +1,6 @@
 accelerate
 auto-gptq==0.7.1
 grpcio==1.71.0
 protobuf
 certifi
 transformers
--- a/backend/python/autogptq/run.sh
+++ b/backend/python/autogptq/run.sh
@@ -0,0 +1,4 @@
 #!/bin/bash
 # Start the autogptq backend through the shared helper library.
 # Quoted so the script also works when its path contains spaces.
 source "$(dirname "$0")/../common/libbackend.sh"
 # "$@" keeps every argument intact; an unquoted $@ re-splits arguments on whitespace.
 startBackend "$@"
--- a/backend/python/autogptq/test.sh
+++ b/backend/python/autogptq/test.sh
@@ -0,0 +1,6 @@
 #!/bin/bash
 # Run the unit tests of the autogptq backend through the shared helper library.
 set -e
 # Quoted so the script also works when its path contains spaces.
 source "$(dirname "$0")/../common/libbackend.sh"
 runUnittests
--- a/backend/python/bark/backend.py
+++ b/backend/python/bark/backend.py
@@ -61,12 +61,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.72.0
+grpcio==1.71.0
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.72.0
+grpcio==1.71.0
 protobuf
 grpcio-tools
--- a/backend/python/coqui/backend.py
+++ b/backend/python/coqui/backend.py
@@ -86,12 +86,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.72.0
+grpcio==1.71.0
 protobuf
 certifi
 packaging==24.1
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -19,7 +19,7 @@ import grpc
 from diffusers import SanaPipeline, StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
    EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
-from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline, Lumina2Text2ImgPipeline
+from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image, export_to_video
 from compel import Compel, ReturnedEmbeddingsType
@@ -168,13 +168,9 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            # We are storing all the options in a dict so we can use it later when
            # generating the images
            for opt in options:
                if ":" not in opt:
                    continue
                key, value = opt.split(":")
                self.options[key] = value
            print(f"Options: {self.options}", file=sys.stderr)
            local = False
            modelFile = request.Model
@@ -291,12 +287,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                    if request.LowVRAM:
                        self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "Lumina2Text2ImgPipeline":
                self.pipe = Lumina2Text2ImgPipeline.from_pretrained(
                    request.Model,
                    torch_dtype=torch.bfloat16)
                if request.LowVRAM:
                    self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "SanaPipeline":
                self.pipe = SanaPipeline.from_pretrained(
                    request.Model,
@@ -526,12 +516,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.72.0
+grpcio==1.71.0
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/backend.py
+++ b/backend/python/exllama2/backend.py
@@ -105,12 +105,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.72.0
+grpcio==1.71.0
 protobuf
 certifi
 wheel
--- a/backend/python/faster-whisper/backend.py
+++ b/backend/python/faster-whisper/backend.py
@@ -62,12 +62,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.TranscriptResult(segments=resultSegments, text=text)
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/faster-whisper/requirements.txt
+++ b/backend/python/faster-whisper/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.72.0
+grpcio==1.71.0
 protobuf
 grpcio-tools
--- a/backend/python/kokoro/backend.py
+++ b/backend/python/kokoro/backend.py
@@ -99,12 +99,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.Result(success=True)
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/kokoro/requirements.txt
+++ b/backend/python/kokoro/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.72.0
+grpcio==1.71.0
 protobuf
 phonemizer
 scipy
--- a/backend/python/rerankers/backend.py
+++ b/backend/python/rerankers/backend.py
@@ -91,12 +91,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        return backend_pb2.RerankResult(usage=usage, results=results)
 def serve(address):
-    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.72.0
+grpcio==1.71.0
 protobuf
 certifi
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -559,12 +559,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 async def serve(address):
    # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    # Add the servicer to the server
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    # Bind the server to the address
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.72.0
+grpcio==1.71.0
 protobuf
 certifi
 setuptools
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -194,40 +194,27 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            await iterations.aclose()
    async def _predict(self, request, context, streaming=False):
        # Build the sampling parameters
        # NOTE: this must stay in sync with the vllm backend
        request_to_sampling_params = {
            "N": "n",
            "PresencePenalty": "presence_penalty",
            "FrequencyPenalty": "frequency_penalty",
            "RepetitionPenalty": "repetition_penalty",
            "Temperature": "temperature",
            "TopP": "top_p",
            "TopK": "top_k",
            "MinP": "min_p",
            "Seed": "seed",
            "StopPrompts": "stop",
            "StopTokenIds": "stop_token_ids",
            "BadWords": "bad_words",
            "IncludeStopStrInOutput": "include_stop_str_in_output",
            "IgnoreEOS": "ignore_eos",
            "Tokens": "max_tokens",
            "MinTokens": "min_tokens",
            "Logprobs": "logprobs",
            "PromptLogprobs": "prompt_logprobs",
            "SkipSpecialTokens": "skip_special_tokens",
            "SpacesBetweenSpecialTokens": "spaces_between_special_tokens",
            "TruncatePromptTokens": "truncate_prompt_tokens",
            "GuidedDecoding": "guided_decoding",
        }
        # Build sampling parameters
        sampling_params = SamplingParams(top_p=0.9, max_tokens=200)
-
+        if request.TopP != 0:
-        for request_field, param_field in request_to_sampling_params.items():
+            sampling_params.top_p = request.TopP
-            if hasattr(request, request_field):
+        if request.Tokens > 0:
-                value = getattr(request, request_field)
+            sampling_params.max_tokens = request.Tokens
-                if value not in (None, 0, [], False):
+        if request.Temperature != 0:
-                    setattr(sampling_params, param_field, value)
+            sampling_params.temperature = request.Temperature
        if request.TopK != 0:
            sampling_params.top_k = request.TopK
        if request.PresencePenalty != 0:
            sampling_params.presence_penalty = request.PresencePenalty
        if request.FrequencyPenalty != 0:
            sampling_params.frequency_penalty = request.FrequencyPenalty
        if request.StopPrompts:
            sampling_params.stop = request.StopPrompts
        if request.IgnoreEOS:
            sampling_params.ignore_eos = request.IgnoreEOS
        if request.Seed != 0:
            sampling_params.seed = request.Seed
        # Extract image paths and process images
        prompt = request.Prompt
@@ -333,12 +320,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
 async def serve(address):
    # Start asyncio gRPC server
-    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
+    server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
        options=[
            ('grpc.max_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_send_message_length', 50 * 1024 * 1024),  # 50MB
            ('grpc.max_receive_message_length', 50 * 1024 * 1024),  # 50MB
        ])
    # Add the servicer to the server
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    # Bind the server to the address
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.72.0
+grpcio==1.71.0
 protobuf
 certifi
 setuptools
--- a/backend/python/vllm/test.py
+++ b/backend/python/vllm/test.py
@@ -75,53 +75,6 @@ class TestBackendServicer(unittest.TestCase):
        finally:
            self.tearDown()
    def test_sampling_params(self):
        """
        This method tests if all sampling parameters are correctly processed
        NOTE: this does NOT test for correctness, just that we received a compatible response
        """
        try:
            # setUp/tearDown are invoked by hand here, wrapping the whole body
            # in try/finally so tearDown always runs.
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                # The model must load successfully before Predict is attempted.
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="facebook/opt-125m"))
                self.assertTrue(response.success)
                # Every field below is given a non-zero / non-empty / True value.
                req = backend_pb2.PredictOptions(
                    Prompt="The capital of France is",
                    TopP=0.8,
                    Tokens=50,
                    Temperature=0.7,
                    TopK=40,
                    PresencePenalty=0.1,
                    FrequencyPenalty=0.2,
                    RepetitionPenalty=1.1,
                    MinP=0.05,
                    Seed=42,
                    StopPrompts=["\n"],
                    StopTokenIds=[50256],
                    BadWords=["badword"],
                    IncludeStopStrInOutput=True,
                    IgnoreEOS=True,
                    MinTokens=5,
                    Logprobs=5,
                    PromptLogprobs=5,
                    SkipSpecialTokens=True,
                    SpacesBetweenSpecialTokens=True,
                    TruncatePromptTokens=10,
                    GuidedDecoding=True,
                    N=2,
                )
                resp = stub.Predict(req)
                # Only the presence of the fields is checked, not their content.
                self.assertIsNotNone(resp.message)
                self.assertIsNotNone(resp.logprobs)
        except Exception as err:
            # This also catches AssertionError raised by the assertions above,
            # so any failure is reported with the generic message below.
            print(err)
            self.fail("sampling params service failed")
        finally:
            self.tearDown()
    def test_embedding(self):
        """
        This method tests if the embeddings are generated successfully
--- a/core/application/application.go
+++ b/core/application/application.go
@@ -16,7 +16,7 @@ type Application struct {
 func newApplication(appConfig *config.ApplicationConfig) *Application {
 	return &Application{
 		backendLoader:      config.NewBackendConfigLoader(appConfig.ModelPath),
-		modelLoader:        model.NewModelLoader(appConfig.ModelPath, appConfig.SingleBackend),
+		modelLoader:        model.NewModelLoader(appConfig.ModelPath),
 		applicationConfig:  appConfig,
 		templatesEvaluator: templates.NewEvaluator(appConfig.ModelPath),
 	}
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -43,12 +43,18 @@ func New(opts ...config.AppOption) (*Application, error) {
 	if err != nil {
 		return nil, fmt.Errorf("unable to create ModelPath: %q", err)
 	}
-	if options.GeneratedContentDir != "" {
+	if options.ImageDir != "" {
-		err := os.MkdirAll(options.GeneratedContentDir, 0750)
+		err := os.MkdirAll(options.ImageDir, 0750)
 		if err != nil {
 			return nil, fmt.Errorf("unable to create ImageDir: %q", err)
 		}
 	}
 	if options.AudioDir != "" {
 		err := os.MkdirAll(options.AudioDir, 0750)
 		if err != nil {
 			return nil, fmt.Errorf("unable to create AudioDir: %q", err)
 		}
 	}
 	if options.UploadDir != "" {
 		err := os.MkdirAll(options.UploadDir, 0750)
 		if err != nil {
@@ -137,7 +143,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}()
 	}
-	if options.LoadToMemory != nil && !options.SingleBackend {
+	if options.LoadToMemory != nil {
 		for _, m := range options.LoadToMemory {
 			cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
 			if err != nil {
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -17,7 +17,6 @@ func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendCo
 	if err != nil {
 		return nil, err
 	}
 	defer loader.Close()
 	var fn func() ([]float32, error)
 	switch model := inferenceModel.(type) {
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -16,7 +16,6 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
 	if err != nil {
 		return nil, err
 	}
 	defer loader.Close()
 	fn := func() error {
 		_, err := inferenceModel.GenerateImage(
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -53,7 +53,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	if err != nil {
 		return nil, err
 	}
 	defer loader.Close()
 	var protoMessages []*proto.Message
 	// if we are using the tokenizer template, we need to convert the messages to proto messages
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -40,6 +40,10 @@ func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts ...
 	grpcOpts := grpcModelOpts(c)
 	defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
 	if so.SingleBackend {
 		defOpts = append(defOpts, model.WithSingleActiveBackend())
 	}
 	if so.ParallelBackendRequests {
 		defOpts = append(defOpts, model.EnableParallelRequests)
 	}
@@ -99,7 +103,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		mmap = *c.MMap
 	}
-	ctxSize := 4096
+	ctxSize := 1024
 	if c.ContextSize != nil {
 		ctxSize = *c.ContextSize
 	}
@@ -117,7 +121,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 	triggers := make([]*pb.GrammarTrigger, 0)
 	for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
 		triggers = append(triggers, &pb.GrammarTrigger{
-			Word: t.Word,
+			Word:    t.Word,
 		})
 	}
@@ -157,33 +161,38 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		DisableLogStatus:     c.DisableLogStatus,
 		DType:                c.DType,
 		// LimitMMPerPrompt vLLM
-		LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitImagePerPrompt:  int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
-		LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitVideoPerPrompt:  int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
-		LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
+		LimitAudioPerPrompt:  int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
-		MMProj:              c.MMProj,
+		MMProj:               c.MMProj,
-		FlashAttention:      c.FlashAttention,
+		FlashAttention:       c.FlashAttention,
-		CacheTypeKey:        c.CacheTypeK,
+		CacheTypeKey:         c.CacheTypeK,
-		CacheTypeValue:      c.CacheTypeV,
+		CacheTypeValue:       c.CacheTypeV,
-		NoKVOffload:         c.NoKVOffloading,
+		NoKVOffload:          c.NoKVOffloading,
-		YarnExtFactor:       c.YarnExtFactor,
+		YarnExtFactor:        c.YarnExtFactor,
-		YarnAttnFactor:      c.YarnAttnFactor,
+		YarnAttnFactor:       c.YarnAttnFactor,
-		YarnBetaFast:        c.YarnBetaFast,
+		YarnBetaFast:         c.YarnBetaFast,
-		YarnBetaSlow:        c.YarnBetaSlow,
+		YarnBetaSlow:         c.YarnBetaSlow,
-		NGQA:                c.NGQA,
+		NGQA:                 c.NGQA,
-		RMSNormEps:          c.RMSNormEps,
+		RMSNormEps:           c.RMSNormEps,
-		MLock:               mmlock,
+		MLock:                mmlock,
-		RopeFreqBase:        c.RopeFreqBase,
+		RopeFreqBase:         c.RopeFreqBase,
-		RopeScaling:         c.RopeScaling,
+		RopeScaling:          c.RopeScaling,
-		Type:                c.ModelType,
+		Type:                 c.ModelType,
-		RopeFreqScale:       c.RopeFreqScale,
+		RopeFreqScale:        c.RopeFreqScale,
-		NUMA:                c.NUMA,
+		NUMA:                 c.NUMA,
-		Embeddings:          embeddings,
+		Embeddings:           embeddings,
-		LowVRAM:             lowVRAM,
+		LowVRAM:              lowVRAM,
-		NGPULayers:          int32(nGPULayers),
+		NGPULayers:           int32(nGPULayers),
-		MMap:                mmap,
+		MMap:                 mmap,
-		MainGPU:             c.MainGPU,
+		MainGPU:              c.MainGPU,
-		Threads:             int32(*c.Threads),
+		Threads:              int32(*c.Threads),
-		TensorSplit:         c.TensorSplit,
+		TensorSplit:          c.TensorSplit,
 		// AutoGPTQ
 		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
 		Device:           c.AutoGPTQ.Device,
 		UseTriton:        c.AutoGPTQ.Triton,
 		UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
 		// RWKV
 		Tokenizer: c.Tokenizer,
 	}
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -12,10 +12,10 @@ import (
 func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
 	opts := ModelOptions(backendConfig, appConfig)
 	rerankModel, err := loader.Load(opts...)
 	if err != nil {
 		return nil, err
 	}
 	defer loader.Close()
 	if rerankModel == nil {
 		return nil, fmt.Errorf("could not load rerank model")
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -26,26 +26,21 @@ func SoundGeneration(
 	opts := ModelOptions(backendConfig, appConfig)
 	soundGenModel, err := loader.Load(opts...)
 	if err != nil {
 		return "", nil, err
 	}
 	defer loader.Close()
 	if soundGenModel == nil {
 		return "", nil, fmt.Errorf("could not load sound generation model")
 	}
-	if err := os.MkdirAll(appConfig.GeneratedContentDir, 0750); err != nil {
+	if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
 		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
 	}
-	audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio")
+	fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "sound_generation", ".wav")
-	if err := os.MkdirAll(audioDir, 0750); err != nil {
+	filePath := filepath.Join(appConfig.AudioDir, fileName)
 		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
 	}
 	fileName := utils.GenerateUniqueFileName(audioDir, "sound_generation", ".wav")
 	filePath := filepath.Join(audioDir, fileName)
 	res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
 		Text:        text,
--- a/core/backend/token_metrics.go
+++ b/core/backend/token_metrics.go
@@ -20,7 +20,6 @@ func TokenMetrics(
 	if err != nil {
 		return nil, err
 	}
 	defer loader.Close()
 	if model == nil {
 		return nil, fmt.Errorf("could not loadmodel model")
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -14,10 +14,10 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
 	opts := ModelOptions(backendConfig, appConfig)
 	inferenceModel, err = loader.Load(opts...)
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
 	defer loader.Close()
 	predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
 	predictOptions.Prompt = s
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -24,7 +24,6 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
 	if err != nil {
 		return nil, err
 	}
 	defer ml.Close()
 	if transcriptionModel == nil {
 		return nil, fmt.Errorf("could not load transcription model")
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -23,22 +23,21 @@ func ModelTTS(
 ) (string, *proto.Result, error) {
 	opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
 	ttsModel, err := loader.Load(opts...)
 	if err != nil {
 		return "", nil, err
 	}
 	defer loader.Close()
 	if ttsModel == nil {
 		return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
 	}
-	audioDir := filepath.Join(appConfig.GeneratedContentDir, "audio")
+	if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
 	if err := os.MkdirAll(audioDir, 0750); err != nil {
 		return "", nil, fmt.Errorf("failed creating audio directory: %s", err)
 	}
-	fileName := utils.GenerateUniqueFileName(audioDir, "tts", ".wav")
+	fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
-	filePath := filepath.Join(audioDir, fileName)
+	filePath := filepath.Join(appConfig.AudioDir, fileName)
 	// We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
 	// This should be addressed in a follow up PR soon.
--- a/core/backend/vad.go
+++ b/core/backend/vad.go
@@ -19,8 +19,6 @@ func VAD(request *schema.VADRequest,
 	if err != nil {
 		return nil, err
 	}
 	defer ml.Close()
 	req := proto.VADRequest{
 		Audio: request.Audio,
 	}
--- a/core/backend/video.go
+++ b/core/backend/video.go
@@ -1,36 +0,0 @@
 package backend
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	model "github.com/mudler/LocalAI/pkg/model"
 )
 // VideoGeneration loads the model described by backendConfig and returns a
 // closure that, when called, asks that model to generate a video.
 //
 // height, width, prompt, startImage, endImage and dst are copied unchanged
 // into the proto.GenerateVideoRequest sent to the backend (dst is presumably
 // the output location; confirm against the backend implementation).
 //
 // An error is returned immediately if the model cannot be loaded; otherwise
 // the returned function reports only the error from GenerateVideo.
 func VideoGeneration(height, width int32, prompt, startImage, endImage, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
 	opts := ModelOptions(backendConfig, appConfig)
 	inferenceModel, err := loader.Load(
 		opts...,
 	)
 	if err != nil {
 		return nil, err
 	}
 	// NOTE(review): this deferred Close runs when VideoGeneration returns,
 	// i.e. before the caller invokes the returned closure.
 	defer loader.Close()
 	fn := func() error {
 		// The backend's response value is discarded; only the error is kept.
 		_, err := inferenceModel.GenerateVideo(
 			// The call uses the context stored on appConfig.
 			appConfig.Context,
 			&proto.GenerateVideoRequest{
 				Height:     height,
 				Width:      width,
 				Prompt:     prompt,
 				StartImage: startImage,
 				EndImage:   endImage,
 				Dst:        dst,
 			})
 		return err
 	}
 	return fn, nil
 }
--- a/core/cli/context/context.go
+++ b/core/cli/context/context.go
@@ -1,13 +1,11 @@
 package cliContext
-import (
+import "embed"
 	rice "github.com/GeertJohan/go.rice"
 )
 type Context struct {
 	Debug    bool    `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
 	LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`
 	// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
-	BackendAssets *rice.Box `kong:"-"`
+	BackendAssets embed.FS `kong:"-"`
 }
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -21,7 +21,8 @@ type RunCMD struct {
 	ModelsPath                   string        `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 	BackendAssetsPath            string        `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
-	GeneratedContentPath         string        `env:"LOCALAI_GENERATED_CONTENT_PATH,GENERATED_CONTENT_PATH" type:"path" default:"/tmp/generated/content" help:"Location for generated content (e.g. images, audio, videos)" group:"storage"`
+	ImagePath                    string        `env:"LOCALAI_IMAGE_PATH,IMAGE_PATH" type:"path" default:"/tmp/generated/images" help:"Location for images generated by backends (e.g. stablediffusion)" group:"storage"`
 	AudioPath                    string        `env:"LOCALAI_AUDIO_PATH,AUDIO_PATH" type:"path" default:"/tmp/generated/audio" help:"Location for audio generated by backends (e.g. piper)" group:"storage"`
 	UploadPath                   string        `env:"LOCALAI_UPLOAD_PATH,UPLOAD_PATH" type:"path" default:"/tmp/localai/upload" help:"Path to store uploads from files api" group:"storage"`
 	ConfigPath                   string        `env:"LOCALAI_CONFIG_PATH,CONFIG_PATH" default:"/tmp/localai/config" group:"storage"`
 	LocalaiConfigDir             string        `env:"LOCALAI_CONFIG_DIR" type:"path" default:"${basepath}/configuration" help:"Directory for dynamic loading of certain configuration files (currently api_keys.json and external_backends.json)" group:"storage"`
@@ -37,7 +38,7 @@ type RunCMD struct {
 	F16         bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
 	Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
-	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" help:"Default context size for models" group:"performance"`
+	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`
 	Address                            string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
 	CORS                               bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
@@ -46,7 +47,7 @@ type RunCMD struct {
 	CSRF                               bool     `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
 	UploadLimit                        int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
 	APIKeys                            []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
-	DisableWebUI                       bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disables the web user interface. When set to true, the server will only expose API endpoints without serving the web interface" group:"api"`
+	DisableWebUI                       bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
 	DisablePredownloadScan             bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
 	OpaqueErrors                       bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
 	UseSubtleKeyComparison             bool     `env:"LOCALAI_SUBTLE_KEY_COMPARISON" default:"false" help:"If true, API Key validation comparisons will be performed using constant-time comparisons rather than simple equality. This trades off performance on each request for resiliancy against timing attacks." group:"hardening"`
@@ -80,7 +81,8 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithModelPath(r.ModelsPath),
 		config.WithContextSize(r.ContextSize),
 		config.WithDebug(zerolog.GlobalLevel() <= zerolog.DebugLevel),
-		config.WithGeneratedContentDir(r.GeneratedContentPath),
+		config.WithImageDir(r.ImagePath),
 		config.WithAudioDir(r.AudioPath),
 		config.WithUploadDir(r.UploadPath),
 		config.WithConfigsDir(r.ConfigPath),
 		config.WithDynamicConfigDir(r.LocalaiConfigDir),
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -70,11 +70,11 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 	opts := &config.ApplicationConfig{
 		ModelPath:            t.ModelsPath,
 		Context:              context.Background(),
-		GeneratedContentDir:  outputDir,
+		AudioDir:             outputDir,
 		AssetsDestination:    t.BackendAssetsPath,
 		ExternalGRPCBackends: externalBackends,
 	}
-	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
+	ml := model.NewModelLoader(opts.ModelPath)
 	defer func() {
 		err := ml.StopAllGRPC()
--- a/core/cli/transcript.go
+++ b/core/cli/transcript.go
@@ -32,7 +32,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 	}
 	cl := config.NewBackendConfigLoader(t.ModelsPath)
-	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
+	ml := model.NewModelLoader(opts.ModelPath)
 	if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
 		return err
 	}
--- a/core/cli/tts.go
+++ b/core/cli/tts.go
@@ -36,12 +36,12 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 	text := strings.Join(t.Text, " ")
 	opts := &config.ApplicationConfig{
-		ModelPath:           t.ModelsPath,
+		ModelPath:         t.ModelsPath,
-		Context:             context.Background(),
+		Context:           context.Background(),
-		GeneratedContentDir: outputDir,
+		AudioDir:          outputDir,
-		AssetsDestination:   t.BackendAssetsPath,
+		AssetsDestination: t.BackendAssetsPath,
 	}
-	ml := model.NewModelLoader(opts.ModelPath, opts.SingleBackend)
+	ml := model.NewModelLoader(opts.ModelPath)
 	defer func() {
 		err := ml.StopAllGRPC()
--- a/core/cli/util.go
+++ b/core/cli/util.go
@@ -7,11 +7,11 @@ import (
 	"github.com/rs/zerolog/log"
 	gguf "github.com/gpustack/gguf-parser-go"
 	cliContext "github.com/mudler/LocalAI/core/cli/context"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/pkg/downloader"
 	gguf "github.com/thxcode/gguf-parser-go"
 )
 type UtilCMD struct {
@@ -51,7 +51,7 @@ func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
 	log.Info().
 		Any("eosTokenID", f.Tokenizer().EOSTokenID).
 		Any("bosTokenID", f.Tokenizer().BOSTokenID).
-		Any("modelName", f.Metadata().Name).
+		Any("modelName", f.Model().Name).
 		Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0])
 	log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer")
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -2,11 +2,11 @@ package config
 import (
 	"context"
 	"embed"
 	"encoding/json"
 	"regexp"
 	"time"
 	rice "github.com/GeertJohan/go.rice"
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/rs/zerolog/log"
 )
@@ -19,21 +19,20 @@ type ApplicationConfig struct {
 	UploadLimitMB, Threads, ContextSize int
 	F16                                 bool
 	Debug                               bool
-	GeneratedContentDir                 string
+	ImageDir                            string
-
+	AudioDir                            string
-	ConfigsDir string
+	UploadDir                           string
-	UploadDir  string
+	ConfigsDir                          string
-
+	DynamicConfigsDir                   string
-	DynamicConfigsDir             string
+	DynamicConfigsDirPollInterval       time.Duration
-	DynamicConfigsDirPollInterval time.Duration
+	CORS                                bool
-	CORS                          bool
+	CSRF                                bool
-	CSRF                          bool
+	PreloadJSONModels                   string
-	PreloadJSONModels             string
+	PreloadModelsFromPath               string
-	PreloadModelsFromPath         string
+	CORSAllowOrigins                    string
-	CORSAllowOrigins              string
+	ApiKeys                             []string
-	ApiKeys                       []string
+	P2PToken                            string
-	P2PToken                      string
+	P2PNetworkID                        string
 	P2PNetworkID                  string
 	DisableWebUI                       bool
 	EnforcePredownloadScans            bool
@@ -47,7 +46,7 @@ type ApplicationConfig struct {
 	Galleries []Gallery
-	BackendAssets     *rice.Box
+	BackendAssets     embed.FS
 	AssetsDestination string
 	ExternalGRPCBackends map[string]string
@@ -198,7 +197,7 @@ func WithBackendAssetsOutput(out string) AppOption {
 	}
 }
-func WithBackendAssets(f *rice.Box) AppOption {
+func WithBackendAssets(f embed.FS) AppOption {
 	return func(o *ApplicationConfig) {
 		o.BackendAssets = f
 	}
@@ -280,9 +279,15 @@ func WithDebug(debug bool) AppOption {
 	}
 }
-func WithGeneratedContentDir(generatedContentDir string) AppOption {
+func WithAudioDir(audioDir string) AppOption {
 	return func(o *ApplicationConfig) {
-		o.GeneratedContentDir = generatedContentDir
+		o.AudioDir = audioDir
 	}
 }
 // WithImageDir returns an AppOption that sets ApplicationConfig.ImageDir to
 // imageDir.
 func WithImageDir(imageDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.ImageDir = imageDir
 	}
 }
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -50,6 +50,9 @@ type BackendConfig struct {
 	// LLM configs (GPT4ALL, Llama.cpp, ...)
 	LLMConfig `yaml:",inline"`
 	// AutoGPTQ specifics
 	AutoGPTQ AutoGPTQ `yaml:"autogptq"`
 	// Diffusers
 	Diffusers Diffusers `yaml:"diffusers"`
 	Step      int       `yaml:"step"`
@@ -173,6 +176,14 @@ type LimitMMPerPrompt struct {
 	LimitAudioPerPrompt int `yaml:"audio"`
 }
 // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend.
 // Its fields are forwarded to the backend through the gRPC model options.
 type AutoGPTQ struct {
 	// ModelBaseName is sent as ModelBaseName in the gRPC model options.
 	ModelBaseName    string `yaml:"model_base_name"`
 	// Device is sent as Device in the gRPC model options.
 	Device           string `yaml:"device"`
 	// Triton is sent as UseTriton in the gRPC model options.
 	Triton           bool   `yaml:"triton"`
 	// UseFastTokenizer is sent as UseFastTokenizer in the gRPC model options.
 	UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
 }
 // TemplateConfig is a struct that holds the configuration of the templating system
 type TemplateConfig struct {
 	// Chat is the template used in the chat completion endpoint
@@ -304,6 +315,9 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	defaultTFZ := 1.0
 	defaultZero := 0
 	// Try to offload all GPU layers (if GPU is found)
 	defaultHigh := 99999999
 	trueV := true
 	falseV := false
@@ -363,6 +377,9 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	if cfg.MirostatTAU == nil {
 		cfg.MirostatTAU = &defaultMirostatTAU
 	}
 	if cfg.NGPULayers == nil {
 		cfg.NGPULayers = &defaultHigh
 	}
 	if cfg.LowVRAM == nil {
 		cfg.LowVRAM = &falseV
@@ -372,6 +389,16 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Embeddings = &falseV
 	}
 	// Value passed by the top level are treated as default (no implicit defaults)
 	// defaults are set by the user
 	if ctx == 0 {
 		ctx = 1024
 	}
 	if cfg.ContextSize == nil {
 		cfg.ContextSize = &ctx
 	}
 	if threads == 0 {
 		// Threads can't be 0
 		threads = 4
@@ -393,7 +420,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 		cfg.Debug = &trueV
 	}
-	guessDefaultsFromFile(cfg, lo.modelPath, ctx)
+	guessDefaultsFromFile(cfg, lo.modelPath)
 }
 func (c *BackendConfig) Validate() bool {
@@ -430,19 +457,18 @@ func (c *BackendConfig) HasTemplate() bool {
 type BackendConfigUsecases int
 const (
-	FLAG_ANY              BackendConfigUsecases = 0b000000000000
+	FLAG_ANY              BackendConfigUsecases = 0b00000000000
-	FLAG_CHAT             BackendConfigUsecases = 0b000000000001
+	FLAG_CHAT             BackendConfigUsecases = 0b00000000001
-	FLAG_COMPLETION       BackendConfigUsecases = 0b000000000010
+	FLAG_COMPLETION       BackendConfigUsecases = 0b00000000010
-	FLAG_EDIT             BackendConfigUsecases = 0b000000000100
+	FLAG_EDIT             BackendConfigUsecases = 0b00000000100
-	FLAG_EMBEDDINGS       BackendConfigUsecases = 0b000000001000
+	FLAG_EMBEDDINGS       BackendConfigUsecases = 0b00000001000
-	FLAG_RERANK           BackendConfigUsecases = 0b000000010000
+	FLAG_RERANK           BackendConfigUsecases = 0b00000010000
-	FLAG_IMAGE            BackendConfigUsecases = 0b000000100000
+	FLAG_IMAGE            BackendConfigUsecases = 0b00000100000
-	FLAG_TRANSCRIPT       BackendConfigUsecases = 0b000001000000
+	FLAG_TRANSCRIPT       BackendConfigUsecases = 0b00001000000
-	FLAG_TTS              BackendConfigUsecases = 0b000010000000
+	FLAG_TTS              BackendConfigUsecases = 0b00010000000
-	FLAG_SOUND_GENERATION BackendConfigUsecases = 0b000100000000
+	FLAG_SOUND_GENERATION BackendConfigUsecases = 0b00100000000
-	FLAG_TOKENIZE         BackendConfigUsecases = 0b001000000000
+	FLAG_TOKENIZE         BackendConfigUsecases = 0b01000000000
-	FLAG_VAD              BackendConfigUsecases = 0b010000000000
+	FLAG_VAD              BackendConfigUsecases = 0b10000000000
 	FLAG_VIDEO            BackendConfigUsecases = 0b100000000000
 	// Common Subsets
 	FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
@@ -463,7 +489,6 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
 		"FLAG_TOKENIZE":         FLAG_TOKENIZE,
 		"FLAG_VAD":              FLAG_VAD,
 		"FLAG_LLM":              FLAG_LLM,
 		"FLAG_VIDEO":            FLAG_VIDEO,
 	}
 }
@@ -528,17 +553,6 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
 			return false
 		}
 	}
 	if (u & FLAG_VIDEO) == FLAG_VIDEO {
 		videoBackends := []string{"diffusers", "stablediffusion"}
 		if !slices.Contains(videoBackends, c.Backend) {
 			return false
 		}
 		if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
 			return false
 		}
 	}
 	if (u & FLAG_RERANK) == FLAG_RERANK {
 		if c.Backend != "rerankers" {
@@ -551,7 +565,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
 		}
 	}
 	if (u & FLAG_TTS) == FLAG_TTS {
-		ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"}
+		ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
 		if !slices.Contains(ttsBackends, c.Backend) {
 			return false
 		}
--- a/core/config/gguf.go
+++ b/core/config/gguf.go
@@ -1,296 +0,0 @@
 package config
 import (
 	"strings"
 	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/rs/zerolog/log"
 	gguf "github.com/gpustack/gguf-parser-go"
 )
 // familyType enumerates the model families for which built-in default
 // settings (templates, stop words, repeat penalty) are known.
 type familyType uint8
 // Known model families. Unknown is the zero value and is what identifyFamily
 // returns when a model cannot be classified.
 const (
 	Unknown familyType = iota
 	LLaMa3
 	CommandR
 	Phi3
 	ChatML
 	Mistral03
 	Gemma
 	DeepSeek2
 )
 // Fallback values used by guessGGUFFromFile when nothing better is available.
 const (
 	// defaultContextSize is used when the GGUF estimate reports no positive
 	// context size.
 	defaultContextSize = 1024
 	// defaultNGPULayers is deliberately huge: it is applied when no layer
 	// count was configured or estimated, on the assumption that every layer
 	// should be offloaded.
 	defaultNGPULayers  = 99999999
 )
 // settingsConfig bundles the defaults applied to a BackendConfig once the
 // model family has been identified.
 type settingsConfig struct {
 	// StopWords is applied only when the config defines no stop words.
 	StopWords      []string
 	// TemplateConfig replaces the config's template settings.
 	TemplateConfig TemplateConfig
 	// RepeatPenalty is applied only when the config's value is 0.
 	RepeatPenalty  float64
 }
 // default settings to adopt with a given model family
 var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
 	Gemma: {
 		RepeatPenalty: 1.0,
 		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
 		TemplateConfig: TemplateConfig{
 			Chat:        "{{.Input }}\n<start_of_turn>model\n",
 			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
 			Completion:  "{{.Input}}",
 		},
 	},
 	DeepSeek2: {
 		StopWords: []string{"<｜end▁of▁sentence｜>"},
 		TemplateConfig: TemplateConfig{
 			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
 {{ end -}}
 {{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<｜end▁of▁sentence｜>{{end}}
 {{if eq .RoleName "system" -}}{{.Content}}
 {{end -}}`,
 			Chat: "{{.Input -}}\nAssistant: ",
 		},
 	},
 	LLaMa3: {
 		StopWords: []string{"<|eot_id|>"},
 		TemplateConfig: TemplateConfig{
 			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
 			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
 		},
 	},
 	CommandR: {
 		TemplateConfig: TemplateConfig{
 			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
 			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
 You are a function calling AI model, you can call the following functions:
 ## Available Tools
 {{range .Functions}}
 - {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
 {{end}}
 When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
 <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
 			ChatMessage: `{{if eq .RoleName "user" -}}
 <|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
 {{- else if eq .RoleName "system" -}}
 <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
 {{- else if eq .RoleName "assistant" -}}
 <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
 {{- else if eq .RoleName "tool" -}}
 <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
 {{- else if .FunctionCall -}}
 <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
 {{- end -}}`,
 		},
 		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
 	},
 	Phi3: {
 		TemplateConfig: TemplateConfig{
 			Chat:        "{{.Input}}\n<|assistant|>",
 			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
 			Completion:  "{{.Input}}",
 		},
 		StopWords: []string{"<|end|>", "<|endoftext|>"},
 	},
 	ChatML: {
 		TemplateConfig: TemplateConfig{
 			Chat: "{{.Input -}}\n<|im_start|>assistant",
 			Functions: `<|im_start|>system
 You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
 {{range .Functions}}
 {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
 {{end}}
 For each function call return a json object with function name and arguments
 <|im_end|>
 {{.Input -}}
 <|im_start|>assistant`,
 			ChatMessage: `<|im_start|>{{ .RoleName }}
 {{ if .FunctionCall -}}
 Function call:
 {{ else if eq .RoleName "tool" -}}
 Function response:
 {{ end -}}
 {{ if .Content -}}
 {{.Content }}
 {{ end -}}
 {{ if .FunctionCall -}}
 {{toJson .FunctionCall}}
 {{ end -}}<|im_end|>`,
 		},
 		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
 	},
 	Mistral03: {
 		TemplateConfig: TemplateConfig{
 			Chat:      "{{.Input -}}",
 			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
 			ChatMessage: `{{if eq .RoleName "user" -}}
 [INST] {{.Content }} [/INST]
 {{- else if .FunctionCall -}}
 [TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
 {{- else if eq .RoleName "tool" -}}
 [TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
 {{- else -}}
 {{ .Content -}}
 {{ end -}}`,
 		},
 		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
 	},
 }
 // this maps well known template used in HF to model families defined above
 var knownTemplates = map[string]familyType{
 	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`:                              ChatML,
 	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
 }
 // guessGGUFFromFile fills unset fields of cfg using the parsed GGUF file f.
 //
 // In order, it:
 //   - sets cfg.ContextSize from the GGUF estimate (or defaultContextSize), but
 //     only when defaultCtx is 0 and cfg.ContextSize is nil;
 //   - sets cfg.Options to ["gpu"] when no options are configured and an
 //     nvidia or amd GPU is reported;
 //   - sets cfg.NGPULayers from the VRAM estimate, falling back to
 //     defaultNGPULayers;
 //   - unless a template is already configured, guesses the model name and
 //     applies the family defaults from defaultsSettings.
 //
 // Failures of the VRAM helpers are logged and otherwise ignored.
 func guessGGUFFromFile(cfg *BackendConfig, f *gguf.GGUFFile, defaultCtx int) {
 	// Context size is only guessed when neither the caller nor the config
 	// supplied one.
 	if defaultCtx == 0 && cfg.ContextSize == nil {
 		ctxSize := f.EstimateLLaMACppRun().ContextSize
 		if ctxSize > 0 {
 			cSize := int(ctxSize)
 			cfg.ContextSize = &cSize
 		} else {
 			defaultCtx = defaultContextSize
 			cfg.ContextSize = &defaultCtx
 		}
 	}
 	// GPU options
 	if cfg.Options == nil {
 		if xsysinfo.HasGPU("nvidia") || xsysinfo.HasGPU("amd") {
 			cfg.Options = []string{"gpu"}
 		}
 	}
 	// vram estimation
 	vram, err := xsysinfo.TotalAvailableVRAM()
 	if err != nil {
 		log.Error().Msgf("guessDefaultsFromFile(TotalAvailableVRAM): %s", err)
 	} else if vram > 0 {
 		estimate, err := xsysinfo.EstimateGGUFVRAMUsage(f, vram)
 		if err != nil {
 			log.Error().Msgf("guessDefaultsFromFile(EstimateGGUFVRAMUsage): %s", err)
 		} else {
 			if estimate.IsFullOffload {
 				log.Warn().Msgf("guessDefaultsFromFile: %s", "full offload is recommended")
 			}
 			if estimate.EstimatedVRAM > vram {
 				log.Warn().Msgf("guessDefaultsFromFile: %s", "estimated VRAM usage is greater than available VRAM")
 			}
 			// An explicitly configured layer count always wins over the estimate.
 			if cfg.NGPULayers == nil && estimate.EstimatedLayers > 0 {
 				log.Debug().Msgf("guessDefaultsFromFile: %d layers estimated", estimate.EstimatedLayers)
 				cfg.NGPULayers = &estimate.EstimatedLayers
 			}
 		}
 	}
 	if cfg.NGPULayers == nil {
 		// we assume we want to offload all layers
 		defaultHigh := defaultNGPULayers
 		cfg.NGPULayers = &defaultHigh
 	}
 	log.Debug().Any("NGPULayers", cfg.NGPULayers).Msgf("guessDefaultsFromFile: %s", "NGPULayers set")
 	// template estimations
 	if cfg.HasTemplate() {
 		// nothing to guess here
 		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
 		return
 	}
 	log.Debug().
 		Any("eosTokenID", f.Tokenizer().EOSTokenID).
 		Any("bosTokenID", f.Tokenizer().BOSTokenID).
 		Any("modelName", f.Metadata().Name).
 		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
 	// guess the name
 	if cfg.Name == "" {
 		cfg.Name = f.Metadata().Name
 	}
 	family := identifyFamily(f)
 	// NOTE(review): returning here means the raw chat_template fallback at the
 	// end of this function is never reached for unidentified models - confirm
 	// that this is intended.
 	if family == Unknown {
 		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
 		return
 	}
 	// identify template
 	settings, ok := defaultsSettings[family]
 	if ok {
 		cfg.TemplateConfig = settings.TemplateConfig
 		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
 		// Stop words and repeat penalty are only defaulted, never overridden.
 		if len(cfg.StopWords) == 0 {
 			cfg.StopWords = settings.StopWords
 		}
 		if cfg.RepeatPenalty == 0.0 {
 			cfg.RepeatPenalty = settings.RepeatPenalty
 		}
 	} else {
 		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
 	}
 	if cfg.HasTemplate() {
 		return
 	}
 	// No family template was applied: fall back to the raw jinja chat template
 	// embedded in the GGUF metadata, if present.
 	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
 	if found {
 		// try to use the jinja template
 		cfg.TemplateConfig.JinjaTemplate = true
 		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
 	}
 }
 // identifyFamily classifies a parsed GGUF file into one of the known model
 // families.
 //
 // An exact match of the embedded "tokenizer.chat_template" against
 // knownTemplates takes precedence. Otherwise the architecture name, the
 // BOS/EOS token IDs and the model name are used as heuristics. Unknown is
 // returned when nothing matches.
 func identifyFamily(f *gguf.GGUFFile) familyType {
 	// A well-known chat template is the strongest signal.
 	if kv, ok := f.Header.MetadataKV.Get("tokenizer.chat_template"); ok {
 		if tmpl := kv.ValueString(); tmpl != "" {
 			if fam, known := knownTemplates[tmpl]; known {
 				return fam
 			}
 		}
 	}

 	// Fall back to properties of the model itself.
 	arch := f.Architecture().Architecture
 	tok := f.Tokenizer()
 	isGemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Metadata().Name), "gemma")

 	if arch == "deepseek2" {
 		return DeepSeek2
 	}
 	if isGemma {
 		return Gemma
 	}
 	if arch == "llama" && tok.EOSTokenID == 128009 {
 		return LLaMa3
 	}
 	if arch == "command-r" && tok.EOSTokenID == 255001 {
 		return CommandR
 	}
 	if arch == "phi-3" {
 		return Phi3
 	}
 	// Yi models (llama arch, BOS 1 / EOS 2) use ChatML. Mistral 0.3 shares the
 	// same token IDs, so it is only recognised through knownTemplates above.
 	if arch == "qwen2" || (arch == "llama" && tok.BOSTokenID == 1 && tok.EOSTokenID == 2) {
 		return ChatML
 	}
 	return Unknown
 }
--- a/core/config/guesser.go
+++ b/core/config/guesser.go
@@ -3,12 +3,147 @@ package config
 import (
 	"os"
 	"path/filepath"
 	"strings"
 	gguf "github.com/gpustack/gguf-parser-go"
 	"github.com/rs/zerolog/log"
 	gguf "github.com/thxcode/gguf-parser-go"
 )
-func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int) {
+type familyType uint8
 // Known model families. Unknown is the zero value and is what identifyFamily
 // returns when a model cannot be classified.
 const (
 	Unknown familyType = iota
 	LLaMa3
 	CommandR
 	Phi3
 	ChatML
 	Mistral03
 	Gemma
 	DeepSeek2
 )
 // settingsConfig bundles the defaults applied to a BackendConfig once the
 // model family has been identified.
 type settingsConfig struct {
 	// StopWords is applied only when the config defines no stop words.
 	StopWords      []string
 	// TemplateConfig replaces the config's template settings.
 	TemplateConfig TemplateConfig
 	// RepeatPenalty is applied only when the config's value is 0.
 	RepeatPenalty  float64
 }
 // default settings to adopt with a given model family
 var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
 	Gemma: {
 		RepeatPenalty: 1.0,
 		StopWords:     []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
 		TemplateConfig: TemplateConfig{
 			Chat:        "{{.Input }}\n<start_of_turn>model\n",
 			ChatMessage: "<start_of_turn>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<end_of_turn>",
 			Completion:  "{{.Input}}",
 		},
 	},
 	DeepSeek2: {
 		StopWords: []string{"<｜end▁of▁sentence｜>"},
 		TemplateConfig: TemplateConfig{
 			ChatMessage: `{{if eq .RoleName "user" -}}User: {{.Content }}
 {{ end -}}
 {{if eq .RoleName "assistant" -}}Assistant: {{.Content}}<｜end▁of▁sentence｜>{{end}}
 {{if eq .RoleName "system" -}}{{.Content}}
 {{end -}}`,
 			Chat: "{{.Input -}}\nAssistant: ",
 		},
 	},
 	LLaMa3: {
 		StopWords: []string{"<|eot_id|>"},
 		TemplateConfig: TemplateConfig{
 			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
 			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
 		},
 	},
 	CommandR: {
 		TemplateConfig: TemplateConfig{
 			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
 			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
 You are a function calling AI model, you can call the following functions:
 ## Available Tools
 {{range .Functions}}
 - {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
 {{end}}
 When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
 <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
 			ChatMessage: `{{if eq .RoleName "user" -}}
 <|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
 {{- else if eq .RoleName "system" -}}
 <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
 {{- else if eq .RoleName "assistant" -}}
 <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
 {{- else if eq .RoleName "tool" -}}
 <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
 {{- else if .FunctionCall -}}
 <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
 {{- end -}}`,
 		},
 		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
 	},
 	Phi3: {
 		TemplateConfig: TemplateConfig{
 			Chat:        "{{.Input}}\n<|assistant|>",
 			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
 			Completion:  "{{.Input}}",
 		},
 		StopWords: []string{"<|end|>", "<|endoftext|>"},
 	},
 	ChatML: {
 		TemplateConfig: TemplateConfig{
 			Chat: "{{.Input -}}\n<|im_start|>assistant",
 			Functions: `<|im_start|>system
 You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
 {{range .Functions}}
 {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
 {{end}}
 For each function call return a json object with function name and arguments
 <|im_end|>
 {{.Input -}}
 <|im_start|>assistant`,
 			ChatMessage: `<|im_start|>{{ .RoleName }}
 {{ if .FunctionCall -}}
 Function call:
 {{ else if eq .RoleName "tool" -}}
 Function response:
 {{ end -}}
 {{ if .Content -}}
 {{.Content }}
 {{ end -}}
 {{ if .FunctionCall -}}
 {{toJson .FunctionCall}}
 {{ end -}}<|im_end|>`,
 		},
 		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
 	},
 	Mistral03: {
 		TemplateConfig: TemplateConfig{
 			Chat:      "{{.Input -}}",
 			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
 			ChatMessage: `{{if eq .RoleName "user" -}}
 [INST] {{.Content }} [/INST]
 {{- else if .FunctionCall -}}
 [TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
 {{- else if eq .RoleName "tool" -}}
 [TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
 {{- else -}}
 {{ .Content -}}
 {{ end -}}`,
 		},
 		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
 	},
 }
 // this maps well known template used in HF to model families defined above
 var knownTemplates = map[string]familyType{
 	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`:                              ChatML,
 	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
 }
 func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
 	if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
 		log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
 		return
@@ -19,20 +154,106 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string, defaultCtx int)
 		return
 	}
-	// We try to guess only if we don't have a template defined already
+	if cfg.HasTemplate() {
-	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
+		// nothing to guess here
-
+		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
 	// try to parse the gguf file
 	f, err := gguf.ParseGGUFFile(guessPath)
 	if err == nil {
 		guessGGUFFromFile(cfg, f, defaultCtx)
 		return
 	}
-	if cfg.ContextSize == nil {
+	// We try to guess only if we don't have a template defined already
-		if defaultCtx == 0 {
+	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
-			defaultCtx = defaultContextSize
+	f, err := gguf.ParseGGUFFile(guessPath)
 	if err != nil {
 		// Only valid for gguf files
 		log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
 		return
 	}
 	log.Debug().
 		Any("eosTokenID", f.Tokenizer().EOSTokenID).
 		Any("bosTokenID", f.Tokenizer().BOSTokenID).
 		Any("modelName", f.Model().Name).
 		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())
 	// guess the name
 	if cfg.Name == "" {
 		cfg.Name = f.Model().Name
 	}
 	family := identifyFamily(f)
 	if family == Unknown {
 		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
 		return
 	}
 	// identify template
 	settings, ok := defaultsSettings[family]
 	if ok {
 		cfg.TemplateConfig = settings.TemplateConfig
 		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
 		if len(cfg.StopWords) == 0 {
 			cfg.StopWords = settings.StopWords
 		}
-		cfg.ContextSize = &defaultCtx
+		if cfg.RepeatPenalty == 0.0 {
 			cfg.RepeatPenalty = settings.RepeatPenalty
 		}
 	} else {
 		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
 	}
 	if cfg.HasTemplate() {
 		return
 	}
 	// identify from well known templates first, otherwise use the raw jinja template
 	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
 	if found {
 		// try to use the jinja template
 		cfg.TemplateConfig.JinjaTemplate = true
 		cfg.TemplateConfig.ChatMessage = chatTemplate.ValueString()
 	}
 }
 // identifyFamily classifies a parsed GGUF file into one of the known model
 // families.
 //
 // An exact match of the embedded "tokenizer.chat_template" against
 // knownTemplates takes precedence. Otherwise the architecture name, the
 // BOS/EOS token IDs and the model name are used as heuristics. Unknown is
 // returned when nothing matches.
 func identifyFamily(f *gguf.GGUFFile) familyType {
 	// A well-known chat template is the strongest signal.
 	if kv, ok := f.Header.MetadataKV.Get("tokenizer.chat_template"); ok {
 		if tmpl := kv.ValueString(); tmpl != "" {
 			if fam, known := knownTemplates[tmpl]; known {
 				return fam
 			}
 		}
 	}

 	// Fall back to properties of the model itself.
 	arch := f.Architecture().Architecture
 	tok := f.Tokenizer()
 	isGemma := strings.HasPrefix(arch, "gemma") || strings.Contains(strings.ToLower(f.Model().Name), "gemma")

 	if arch == "deepseek2" {
 		return DeepSeek2
 	}
 	if isGemma {
 		return Gemma
 	}
 	if arch == "llama" && tok.EOSTokenID == 128009 {
 		return LLaMa3
 	}
 	if arch == "command-r" && tok.EOSTokenID == 255001 {
 		return CommandR
 	}
 	if arch == "phi-3" {
 		return Phi3
 	}
 	// Yi models (llama arch, BOS 1 / EOS 2) use ChatML. Mistral 0.3 shares the
 	// same token IDs, so it is only recognised through knownTemplates above.
 	if arch == "qwen2" || (arch == "llama" && tok.BOSTokenID == 1 && tok.EOSTokenID == 2) {
 		return ChatML
 	}
 	return Unknown
 }
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -5,8 +5,6 @@ import (
 	"errors"
 	"fmt"
 	"net/http"
 	"os"
 	"path/filepath"
 	"github.com/dave-gray101/v2keyauth"
 	"github.com/mudler/LocalAI/pkg/utils"
@@ -144,9 +142,9 @@ func API(application *application.Application) (*fiber.App, error) {
 	httpFS := http.FS(embedDirStatic)
 	router.Use(favicon.New(favicon.Config{
-		URL:        "/favicon.svg",
+		URL:        "/favicon.ico",
 		FileSystem: httpFS,
-		File:       "static/favicon.svg",
+		File:       "static/favicon.ico",
 	}))
 	router.Use("/static", filesystem.New(filesystem.Config{
@@ -155,19 +153,12 @@ func API(application *application.Application) (*fiber.App, error) {
 		Browse:     true,
 	}))
-	if application.ApplicationConfig().GeneratedContentDir != "" {
+	if application.ApplicationConfig().ImageDir != "" {
-		os.MkdirAll(application.ApplicationConfig().GeneratedContentDir, 0750)
+		router.Static("/generated-images", application.ApplicationConfig().ImageDir)
-		audioPath := filepath.Join(application.ApplicationConfig().GeneratedContentDir, "audio")
+	}
 		imagePath := filepath.Join(application.ApplicationConfig().GeneratedContentDir, "images")
 		videoPath := filepath.Join(application.ApplicationConfig().GeneratedContentDir, "videos")
-		os.MkdirAll(audioPath, 0750)
+	if application.ApplicationConfig().AudioDir != "" {
-		os.MkdirAll(imagePath, 0750)
+		router.Static("/generated-audio", application.ApplicationConfig().AudioDir)
 		os.MkdirAll(videoPath, 0750)
 		router.Static("/generated-audio", audioPath)
 		router.Static("/generated-images", imagePath)
 		router.Static("/generated-videos", videoPath)
 	}
 	// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -3,6 +3,7 @@ package http_test
 import (
 	"bytes"
 	"context"
 	"embed"
 	"encoding/json"
 	"fmt"
 	"io"
@@ -23,7 +24,6 @@ import (
 	. "github.com/onsi/gomega"
 	"gopkg.in/yaml.v3"
 	rice "github.com/GeertJohan/go.rice"
 	openaigo "github.com/otiai10/openaigo"
 	"github.com/sashabaranov/go-openai"
 	"github.com/sashabaranov/go-openai/jsonschema"
@@ -264,15 +264,8 @@ func getRequest(url string, header http.Header) (error, int, []byte) {
 const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml`
-var backendAssets *rice.Box
+//go:embed backend-assets/*
-
+var backendAssets embed.FS
 func init() {
 	var err error
 	backendAssets, err = rice.FindBox("backend-assets")
 	if err != nil {
 		panic(err)
 	}
 }
 var _ = Describe("API test", func() {
@@ -636,7 +629,8 @@ var _ = Describe("API test", func() {
 			application, err := application.New(
 				append(commonOpts,
 					config.WithContext(c),
-					config.WithGeneratedContentDir(tmpdir),
+					config.WithAudioDir(tmpdir),
 					config.WithImageDir(tmpdir),
 					config.WithGalleries(galleries),
 					config.WithModelPath(modelDir),
 					config.WithBackendAssets(backendAssets),
--- a/core/http/elements/gallery.go
+++ b/core/http/elements/gallery.go
@@ -122,15 +122,15 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 			"id":          modalName(m),
 			"tabindex":    "-1",
 			"aria-hidden": "true",
-			"class":       "hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-full max-h-full bg-gray-900/50",
+			"class":       "hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full",
 		},
 		elem.Div(
 			attrs.Props{
-				"class": "relative p-4 w-full max-w-2xl h-[90vh] mx-auto mt-[5vh]",
+				"class": "relative p-4 w-full max-w-2xl max-h-full",
 			},
 			elem.Div(
 				attrs.Props{
-					"class": "relative bg-white rounded-lg shadow dark:bg-gray-700 h-full flex flex-col",
+					"class": "relative p-4 w-full max-w-2xl max-h-full bg-white rounded-lg shadow dark:bg-gray-700",
 				},
 				// header
 				elem.Div(
@@ -164,13 +164,14 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 				// body
 				elem.Div(
 					attrs.Props{
-						"class": "p-4 md:p-5 space-y-4 overflow-y-auto flex-1 min-h-0",
+						"class": "p-4 md:p-5 space-y-4",
 					},
 					elem.Div(
 						attrs.Props{
 							"class": "flex justify-center items-center",
 						},
 						elem.Img(attrs.Props{
 							//	"class": "rounded-t-lg object-fit object-center h-96",
 							"class":   "lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3 entered loaded",
 							"src":     m.Icon,
 							"loading": "lazy",
@@ -231,6 +232,7 @@ func modelModal(m *gallery.GalleryModel) elem.Node {
 			),
 		),
 	)
 }
 func modelDescription(m *gallery.GalleryModel) elem.Node {
--- a/core/http/endpoints/localai/stores.go
+++ b/core/http/endpoints/localai/stores.go
@@ -21,7 +21,6 @@ func StoresSetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
 		if err != nil {
 			return err
 		}
 		defer sl.Close()
 		vals := make([][]byte, len(input.Values))
 		for i, v := range input.Values {
@@ -49,7 +48,6 @@ func StoresDeleteEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationCo
 		if err != nil {
 			return err
 		}
 		defer sl.Close()
 		if err := store.DeleteCols(c.Context(), sb, input.Keys); err != nil {
 			return err
@@ -71,7 +69,6 @@ func StoresGetEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConfi
 		if err != nil {
 			return err
 		}
 		defer sl.Close()
 		keys, vals, err := store.GetCols(c.Context(), sb, input.Keys)
 		if err != nil {
@@ -103,7 +100,6 @@ func StoresFindEndpoint(sl *model.ModelLoader, appConfig *config.ApplicationConf
 		if err != nil {
 			return err
 		}
 		defer sl.Close()
 		keys, vals, similarities, err := store.Find(c.Context(), sb, input.Key, input.Topk)
 		if err != nil {
--- a/core/http/endpoints/localai/video.go
+++ b/core/http/endpoints/localai/video.go
@@ -1,205 +0,0 @@
 package localai
 import (
 	"bufio"
 	"encoding/base64"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
 	"os"
 	"path/filepath"
 	"strings"
 	"time"
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/gofiber/fiber/v2"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )
 // downloadFile fetches url and stores the response body in a new temporary
 // file, returning the file's path. The caller owns the file and is
 // responsible for removing it.
 //
 // An error is returned when the request fails, when the server answers with
 // a non-2xx status, or when the body cannot be written completely. On error
 // no temporary file is left behind and the returned path is empty.
 func downloadFile(url string) (string, error) {
 	resp, err := http.Get(url)
 	if err != nil {
 		return "", err
 	}
 	defer resp.Body.Close()

 	// Without this check an HTML error page would be saved as if it were the
 	// requested content.
 	if resp.StatusCode < 200 || resp.StatusCode > 299 {
 		return "", fmt.Errorf("downloading %q: unexpected status %s", url, resp.Status)
 	}

 	out, err := os.CreateTemp("", "video")
 	if err != nil {
 		return "", err
 	}

 	if _, err := io.Copy(out, resp.Body); err != nil {
 		out.Close()
 		os.Remove(out.Name())
 		return "", err
 	}
 	// Close explicitly so that late write errors are not silently dropped.
 	if err := out.Close(); err != nil {
 		os.Remove(out.Name())
 		return "", err
 	}
 	return out.Name(), nil
 }
 //
 /*
 *
 	curl http://localhost:8080/v1/images/generations \
 	  -H "Content-Type: application/json" \
 	  -d '{
 	    "prompt": "A cute baby sea otter",
 	    "n": 1,
 	    "size": "512x512"
 	  }'
 *
 */
 // VideoEndpoint returns the handler that generates a video from a prompt and
 // an optional start image. The start image may be given as an http(s) URL or
 // as base64 data; it is materialised in a temporary file that is removed
 // when the request finishes. The result is returned either inline
 // (response_format "b64_json") or as a URL under /generated-videos/.
 // @Summary Creates a video given a prompt.
 // @Param request body schema.VideoRequest true "query params"
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /video [post]
 func VideoEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.VideoRequest)
 		if !ok || input.Model == "" {
 			log.Error().Msg("Video Endpoint - Invalid Input")
 			return fiber.ErrBadRequest
 		}
 		config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
 		if !ok || config == nil {
 			log.Error().Msg("Video Endpoint - Invalid Config")
 			return fiber.ErrBadRequest
 		}
 		src := ""
 		if input.StartImage != "" {
 			var fileData []byte
 			var err error
 			// check if input.StartImage is an URL, if so download it and save it
 			// to a temporary file
 			if strings.HasPrefix(input.StartImage, "http://") || strings.HasPrefix(input.StartImage, "https://") {
 				out, err := downloadFile(input.StartImage)
 				if err != nil {
 					return fmt.Errorf("failed downloading file:%w", err)
 				}
 				defer os.RemoveAll(out)
 				fileData, err = os.ReadFile(out)
 				if err != nil {
 					return fmt.Errorf("failed reading file:%w", err)
 				}
 			} else {
 				// base 64 decode the file and write it somewhere
 				// that we will cleanup
 				fileData, err = base64.StdEncoding.DecodeString(input.StartImage)
 				if err != nil {
 					return err
 				}
 			}
 			// Create a temporary file
 			outputFile, err := os.CreateTemp(appConfig.GeneratedContentDir, "b64")
 			if err != nil {
 				return err
 			}
 			// Register the cleanup right away so the file is also removed when
 			// writing it fails.
 			src = outputFile.Name()
 			defer os.RemoveAll(src)
 			// write the decoded image
 			writer := bufio.NewWriter(outputFile)
 			_, err = writer.Write(fileData)
 			if err == nil {
 				// The buffered tail must be flushed before closing, otherwise
 				// the file on disk is truncated.
 				err = writer.Flush()
 			}
 			if err != nil {
 				outputFile.Close()
 				return err
 			}
 			if err := outputFile.Close(); err != nil {
 				return err
 			}
 		}
 		log.Debug().Msgf("Parameter Config: %+v", config)
 		// Both the legacy "stablediffusion" name and an empty backend map to
 		// the GGML stable-diffusion backend.
 		switch config.Backend {
 		case "stablediffusion":
 			config.Backend = model.StableDiffusionGGMLBackend
 		case "":
 			config.Backend = model.StableDiffusionGGMLBackend
 		}
 		// Default to 512x512 when no size was requested.
 		width := input.Width
 		height := input.Height
 		if width == 0 {
 			width = 512
 		}
 		if height == 0 {
 			height = 512
 		}
 		b64JSON := input.ResponseFormat == "b64_json"
 		// Inline responses go through the system temp dir (empty tempDir);
 		// URL responses must live in the statically served videos directory.
 		tempDir := ""
 		if !b64JSON {
 			tempDir = filepath.Join(appConfig.GeneratedContentDir, "videos")
 		}
 		// Create a temporary file
 		outputFile, err := os.CreateTemp(tempDir, "b64")
 		if err != nil {
 			return err
 		}
 		outputFile.Close()
 		// TODO: use mime type to determine the extension
 		output := outputFile.Name() + ".mp4"
 		// Rename the temporary file
 		err = os.Rename(outputFile.Name(), output)
 		if err != nil {
 			return err
 		}
 		baseURL := c.BaseURL()
 		fn, err := backend.VideoGeneration(height, width, input.Prompt, src, input.EndImage, output, ml, *config, appConfig)
 		if err != nil {
 			return err
 		}
 		// Run the generation; the backend writes its result to output.
 		if err := fn(); err != nil {
 			return err
 		}
 		item := &schema.Item{}
 		if b64JSON {
 			defer os.RemoveAll(output)
 			data, err := os.ReadFile(output)
 			if err != nil {
 				return err
 			}
 			item.B64JSON = base64.StdEncoding.EncodeToString(data)
 		} else {
 			base := filepath.Base(output)
 			item.URL = baseURL + "/generated-videos/" + base
 		}
 		id := uuid.New().String()
 		created := int(time.Now().Unix())
 		resp := &schema.OpenAIResponse{
 			ID:      id,
 			Created: created,
 			Data:    []schema.Item{*item},
 		}
 		// Marshalling is only for the debug log; a failure here is harmless.
 		jsonResult, _ := json.Marshal(resp)
 		log.Debug().Msgf("Response: %s", jsonResult)
 		// Return the prediction in the response body
 		return c.JSON(resp)
 	}
 }
--- a/core/http/endpoints/openai/assistant_test.go
+++ b/core/http/endpoints/openai/assistant_test.go
@@ -40,7 +40,7 @@ func TestAssistantEndpoints(t *testing.T) {
 	cl := &config.BackendConfigLoader{}
 	//configsDir := "/tmp/localai/configs"
 	modelPath := "/tmp/localai/model"
-	var ml = model.NewModelLoader(modelPath, false)
+	var ml = model.NewModelLoader(modelPath)
 	appConfig := &config.ApplicationConfig{
 		ConfigsDir:    configsDir,
--- a/core/http/endpoints/openai/image.go
+++ b/core/http/endpoints/openai/image.go
@@ -108,7 +108,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
 			}
 			// Create a temporary file
-			outputFile, err := os.CreateTemp(appConfig.GeneratedContentDir, "b64")
+			outputFile, err := os.CreateTemp(appConfig.ImageDir, "b64")
 			if err != nil {
 				return err
 			}
@@ -184,7 +184,7 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
 				tempDir := ""
 				if !b64JSON {
-					tempDir = filepath.Join(appConfig.GeneratedContentDir, "images")
+					tempDir = appConfig.ImageDir
 				}
 				// Create a temporary file
 				outputFile, err := os.CreateTemp(tempDir, "b64")
@@ -192,7 +192,6 @@ func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appCon
 					return err
 				}
 				outputFile.Close()
 				output := outputFile.Name() + ".png"
 				// Rename the temporary file
 				err = os.Rename(outputFile.Name(), output)
--- a/core/http/explorer.go
+++ b/core/http/explorer.go
@@ -29,9 +29,9 @@ func Explorer(db *explorer.Database) *fiber.App {
 	httpFS := http.FS(embedDirStatic)
 	app.Use(favicon.New(favicon.Config{
-		URL:        "/favicon.svg",
+		URL:        "/favicon.ico",
 		FileSystem: httpFS,
-		File:       "static/favicon.svg",
+		File:       "static/favicon.ico",
 	}))
 	app.Use("/static", filesystem.New(filesystem.Config{
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -203,10 +203,18 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
 		config.Diffusers.ClipSkip = input.ClipSkip
 	}
 	if input.ModelBaseName != "" {
 		config.AutoGPTQ.ModelBaseName = input.ModelBaseName
 	}
 	if input.NegativePromptScale != 0 {
 		config.NegativePromptScale = input.NegativePromptScale
 	}
 	if input.UseFastTokenizer {
 		config.UseFastTokenizer = input.UseFastTokenizer
 	}
 	if input.NegativePrompt != "" {
 		config.NegativePrompt = input.NegativePrompt
 	}
--- a/Show More
+++ b/Show More
		`@@ -0,0 +1,2 @@`
							`--extra-index-url https://download.pytorch.org/whl/cu118`
							`torch==2.4.1+cu118`
		`@@ -0,0 +1,2 @@`
							`--extra-index-url https://download.pytorch.org/whl/rocm6.0`
							`torch==2.4.1+rocm6.0`