fix(go-grpc-server): always close resultChan

Because the channel was not closed, a client call to a server that does not implement PredictStream would hang indefinitely, as it would wait for resultChan to be consumed. Now, when the prediction stream returns, we close the channel and wait for the goroutine to finish. Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
feat(vllm): add support for image-to-text and video-to-text (#3729 )
2026-05-24 16:51:44 -04:00 · 2024-10-05 00:07:58 +02:00 · 2024-10-04 23:42:05 +02:00 · 2024-10-04 19:52:43 +02:00 · 2024-10-04 18:32:29 +02:00 · 2024-10-04 09:10:07 +02:00
286 changed files with 8147 additions and 3074 deletions
--- a/.devcontainer-scripts/postcreate.sh
+++ b/.devcontainer-scripts/postcreate.sh
@@ -0,0 +1,17 @@
 #!/bin/bash
 # Devcontainer post-create hook: makes sure /workspace holds a LocalAI checkout,
 # then hands off to an optional per-developer customization script.
 # Stop right away if the workspace is unavailable, so that git never clones or fetches in an unrelated directory
 cd /workspace || { echo "postcreate: cannot cd to /workspace" >&2; exit 1; }
 # Get the files into the volume without a bind mount
 if [ ! -d ".git" ]; then
    git clone https://github.com/mudler/LocalAI.git .
 else
    git fetch
 fi
 echo "Standard Post-Create script completed."
 # Optional hook: only runs when the developer has provided a customization script
 if [ -f "/devcontainer-customization/postcreate.sh" ]; then
    echo "Launching customization postcreate.sh"
    bash "/devcontainer-customization/postcreate.sh"
 fi
--- a/.devcontainer-scripts/poststart.sh
+++ b/.devcontainer-scripts/poststart.sh
@@ -0,0 +1,16 @@
 #!/bin/bash
 # Devcontainer post-start hook: refreshes the pre-built backend assets in the workspace,
 # regenerates the prepared sources, then hands off to an optional per-developer customization script.
 # Stop right away if the workspace is unavailable, so that nothing below runs in an unrelated directory
 cd /workspace || { echo "poststart: cannot cd to /workspace" >&2; exit 1; }
 # Grab the pre-stashed backend assets to avoid build issues.
 # The directory *contents* are copied so that running again on a workspace that already has
 # backend-assets refreshes it in place instead of nesting backend-assets/backend-assets.
 mkdir -p /workspace/backend-assets
 cp -r /build/backend-assets/. /workspace/backend-assets/
 # Ensures generated source files are present upon load
 make prepare
 echo "Standard Post-Start script completed."
 # Optional hook: only runs when the developer has provided a customization script
 if [ -f "/devcontainer-customization/poststart.sh" ]; then
    echo "Launching customization poststart.sh"
    bash "/devcontainer-customization/poststart.sh"
 fi
--- a/.devcontainer-scripts/utils.sh
+++ b/.devcontainer-scripts/utils.sh
@@ -0,0 +1,55 @@
 #!/bin/bash
 # This file contains some really simple functions that are useful when building up customization scripts.
 # Makes sure a global git identity is available.
 # The supplied name and email are only written when `git config --global user.name` is currently empty;
 # an existing identity is left untouched.
 #
 # Param 1: name
 # Param 2: email
 #
 config_user() {
    echo "Configuring git for $1 <$2>"
    local existing_name=$(git config --global user.name)
    # An identity is already configured - nothing to do
    if [ -n "${existing_name}" ]; then
        return 0
    fi
    echo "Setting up git user / remote"
    git config --global user.name "$1"
    git config --global user.email "$2"
 }
 # Checks if a git remote with the given name is configured - and adds it if not. Fetches either way.
 # The name is compared exactly against the list of remote names, so a name that merely appears
 # inside another remote's name or URL does not count as "already configured".
 #
 # Param 1: remote name
 # Param 2: remote url
 #
 # Returns the exit status of the final `git fetch`.
 #
 config_remote() {
    echo "Adding git remote and fetching $2 as $1"
    # `git remote` prints one remote name per line; -x/-F force a whole-line, literal match
    if ! git remote | grep -qxF -- "$1"; then
        git remote add "$1" "$2"
    fi
    git fetch "$1"
 }
 # Sets up the ~/.ssh directory (mode 0700) and copies special files into it.
 # Prints out lines of text along the way to make things pretty.
 # Files that already exist in ~/.ssh are left untouched; newly copied files get mode 600.
 #
 # Params: filenames relative to the customization directory (/devcontainer-customization)
 #         that should be copied to ~/.ssh - usually passed as an expanded bash array:
 #         setup_ssh "${sshfiles[@]}"
 #
 setup_ssh() {
    echo "starting ~/.ssh directory setup..."
    # Must be the same path that is chmod-ed and copied into below
    mkdir -p "${HOME}/.ssh"
    chmod 0700 "${HOME}/.ssh"
    echo "-----"
    local files=("$@")
    # Keep the loop variables out of the caller's scope
    local file cfile hfile
    for file in "${files[@]}" ; do
        cfile="/devcontainer-customization/${file}"
        hfile="${HOME}/.ssh/${file}"
        # Never overwrite a file the user already has
        if [ ! -f "${hfile}" ]; then
            echo "copying \"${file}\""
            cp "${cfile}" "${hfile}"
            chmod 600 "${hfile}"
        fi
    done
    echo "~/.ssh directory setup complete!"
 }
--- a/.devcontainer/customization/README.md
+++ b/.devcontainer/customization/README.md
@@ -0,0 +1,25 @@
 Place any additional resources your environment requires in this directory
 Script hooks are currently called for:
 `postcreate.sh` and `poststart.sh`
 If files with those names exist here, they will be called at the end of the normal script.
 This is a good place to set things like `git config --global user.name` - and to handle any other files that are mounted via this directory.
 To assist in doing so, `source /.devcontainer-scripts/utils.sh` will provide utility functions that may be useful - for example:
 ```
 #!/bin/bash
 source "/.devcontainer-scripts/utils.sh"
 sshfiles=("config" "key.pub")
 setup_ssh "${sshfiles[@]}"
 config_user "YOUR NAME" "YOUR EMAIL"
 config_remote "REMOTE NAME" "REMOTE URL"
 ```
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,24 @@
 {
    "$schema": "https://raw.githubusercontent.com/devcontainers/spec/main/schemas/devContainer.schema.json",
    "name": "LocalAI",
    "workspaceFolder": "/workspace",
    "dockerComposeFile": [ "./docker-compose-devcontainer.yml" ],
    "service": "api",
    "shutdownAction": "stopCompose",
    "customizations": {
        "vscode": {
            "extensions": [
                "golang.go",
                "ms-vscode.makefile-tools",
                "ms-azuretools.vscode-docker",
                "ms-python.python",
                "ms-python.debugpy",
                "wayou.vscode-todo-highlight",
                "waderyan.gitblame"
            ]
        }
    },
    "forwardPorts": [8080, 3000],
    "postCreateCommand": "bash /.devcontainer-scripts/postcreate.sh",
    "postStartCommand": "bash /.devcontainer-scripts/poststart.sh"
 }
--- a/.devcontainer/docker-compose-devcontainer.yml
+++ b/.devcontainer/docker-compose-devcontainer.yml
@@ -0,0 +1,48 @@
 services:
  api:
    build:
      context: ..
      dockerfile: Dockerfile
      target: devcontainer
      args:
      - FFMPEG=true
      - IMAGE_TYPE=extras
      - GO_TAGS=stablediffusion p2p tts
    env_file:
      - ../.env
    ports:
      - 8080:8080
    volumes:
      - localai_workspace:/workspace
      - ../models:/host-models
      - ./customization:/devcontainer-customization
    command: /bin/sh -c "while sleep 1000; do :; done"
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp:unconfined
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - 9090:9090
    restart: unless-stopped
    volumes:
      - ./prometheus:/etc/prometheus
      - prom_data:/prometheus
  grafana:
    image: grafana/grafana
    container_name: grafana
    ports:
      - 3000:3000
    restart: unless-stopped
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=grafana
    volumes:
      - ./grafana:/etc/grafana/provisioning/datasources
 volumes:
  prom_data:
  localai_workspace:
--- a/.devcontainer/grafana/datasource.yml
+++ b/.devcontainer/grafana/datasource.yml
@@ -0,0 +1,10 @@
 apiVersion: 1
 datasources:
 - name: Prometheus
  type: prometheus
  url: http://prometheus:9090 
  isDefault: true
  access: proxy
  editable: true
--- a/.devcontainer/prometheus/prometheus.yml
+++ b/.devcontainer/prometheus/prometheus.yml
@@ -0,0 +1,21 @@
 global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
 alerting:
  alertmanagers:
  - static_configs:
    - targets: []
    scheme: http
    timeout: 10s
    api_version: v1
 scrape_configs:
 - job_name: prometheus
  honor_timestamps: true
  scrape_interval: 15s
  scrape_timeout: 10s
  metrics_path: /metrics
  scheme: http
  static_configs:
  - targets:
    - localhost:9090
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,7 @@
 .idea
 .github
 .vscode
 .devcontainer
 models
 examples/chatbot-ui/models
 examples/rwkv/models
--- a/.env
+++ b/.env
@@ -79,6 +79,9 @@
 ### Enable to run parallel requests
 # LOCALAI_PARALLEL_REQUESTS=true
 # Enable to allow p2p mode
 # LOCALAI_P2P=true
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time
--- a/.github/bump_deps.sh
+++ b/.github/bump_deps.sh
@@ -18,5 +18,5 @@ if [ -z "$CURRENT_COMMIT" ]; then
    exit 0
 fi
-echo "Updated $VAR from $CURRENT_COMMIT to $LAST_COMMIT." > "$REPO_message.txt"
+echo "Changes: https://github.com/$REPO/compare/${CURRENT_COMMIT}..${LAST_COMMIT}" >> "${VAR}_message.txt"
-echo "https://github.com/$REPO/compare/$CURRENT_COMMIT..$LAST_COMMIT" >> "$REPO_message.txt"
+echo "${LAST_COMMIT}" >> "${VAR}_commit.txt"
--- a/.github/check_and_update.py
+++ b/.github/check_and_update.py
@@ -29,9 +29,14 @@ def calculate_sha256(file_path):
 def manual_safety_check_hf(repo_id):
    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
    scan = scanResponse.json()
-    if scan['hasUnsafeFile']:
+    # Check if 'hasUnsafeFile' exists in the response
-        return scan
+    if 'hasUnsafeFile' in scan:
-    return None
+        if scan['hasUnsafeFile']:
            return scan
        else:
            return None
    else:
        return None
 download_type, repo_id_or_url = parse_uri(uri)
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -67,10 +67,6 @@ updates:
    directory: "/backend/python/parler-tts"
    schedule:
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/petals"
    schedule:
      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/rerankers"
    schedule:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -45,18 +45,25 @@ jobs:
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
          {
            echo 'message<<EOF'
-            cat "${{ matrix.repository }}_message.txt"
+            cat "${{ matrix.variable }}_message.txt"
            echo EOF
          } >> "$GITHUB_OUTPUT"
          {
            echo 'commit<<EOF'
            cat "${{ matrix.variable }}_commit.txt"
            echo EOF
          } >> "$GITHUB_OUTPUT"
          rm -rfv ${{ matrix.variable }}_message.txt
          rm -rfv ${{ matrix.variable }}_commit.txt
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v6
+        uses: peter-evans/create-pull-request@v7
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Update ${{ matrix.repository }}'
-          title: 'chore: :arrow_up: Update ${{ matrix.repository }}'
+          title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
          branch: "update/${{ matrix.variable }}"
-          body:  ${{ steps.bump.outputs.message }}
+          body: ${{ steps.bump.outputs.message }}
          signoff: true
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -17,7 +17,7 @@ jobs:
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v6
+        uses: peter-evans/create-pull-request@v7
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -36,7 +36,7 @@ jobs:
          sudo chmod 777 /hf_cache
          bash .github/checksum_checker.sh gallery/index.yaml
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v6
+        uses: peter-evans/create-pull-request@v7
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -0,0 +1,64 @@
 name: Explorer deployment
 on:
  push:
    branches:
      - master
    tags:
      - 'v*'
 concurrency:
  group: ci-deploy-${{ github.head_ref || github.ref }}-${{ github.repository }}
 jobs:
  build-linux:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          make protogen-go
      - name: Build api
        run: |
          CGO_ENABLED=0 make build-api
      - name: rm
        uses: appleboy/ssh-action@v1.0.3
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
            key: ${{ secrets.EXPLORER_SSH_KEY }}
            port: ${{ secrets.EXPLORER_SSH_PORT }}
            script: |
                sudo rm -rf local-ai/ || true
      - name: copy file via ssh
        uses: appleboy/scp-action@v0.1.7
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
            key: ${{ secrets.EXPLORER_SSH_KEY }}
            port: ${{ secrets.EXPLORER_SSH_PORT }}
            source: "local-ai"
            overwrite: true
            rm: true
            target: ./local-ai
      - name: restarting
        uses: appleboy/ssh-action@v1.0.3
        with:
            host: ${{ secrets.EXPLORER_SSH_HOST }}
            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
            key: ${{ secrets.EXPLORER_SSH_KEY }}
            port: ${{ secrets.EXPLORER_SSH_PORT }}
            script: |
                sudo cp -rfv local-ai/local-ai /usr/bin/local-ai
                sudo systemctl restart local-ai
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -13,6 +13,78 @@ concurrency:
  cancel-in-progress: true
 jobs:
  hipblas-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
      ffmpeg: ${{ matrix.ffmpeg }}
      image-type: ${{ matrix.image-type }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
      grpc-base-image: ${{ matrix.grpc-base-image }}
      aio: ${{ matrix.aio }}
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
      max-parallel: 2
      matrix:
        include:
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-hipblas-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas'
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-core'
            ffmpeg: 'false'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -39,7 +111,7 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
      matrix:
        include:
          # Extra images
@@ -122,29 +194,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-hipblas-ffmpeg'
            ffmpeg: 'true'
            image-type: 'extras'
            aio: "-aio-gpu-hipblas"
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            latest-image: 'latest-gpu-hipblas'
            latest-image-aio: 'latest-aio-gpu-hipblas'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas'
            ffmpeg: 'false'
            image-type: 'extras'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
@@ -212,26 +261,6 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas-core'
            ffmpeg: 'false'
            image-type: 'core'
            base-image: "rocm/dev-ubuntu-22.04:6.1"
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
  core-image-build:
    uses: ./.github/workflows/image_build.yml
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -294,7 +294,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-
+          export SKIP_GRPC_BACKEND=backend-assets/grpc/whisper
          make dist
      - uses: actions/upload-artifact@v4
        with:
@@ -327,7 +327,7 @@ jobs:
          cache: false
      - name: Dependencies
        run: |
-          brew install protobuf grpc
+          brew install protobuf grpc libomp llvm
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build
@@ -336,7 +336,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-
+          export CC=/opt/homebrew/opt/llvm/bin/clang
          make dist
      - uses: actions/upload-artifact@v4
        with:
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@master
+        uses: securego/gosec@v2.21.4
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -168,32 +168,6 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
  # tests-petals:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
  #       uses: actions/checkout@v4
  #       with:
  #         submodules: true
  #     - name: Dependencies
  #       run: |
  #         sudo apt-get update
  #         sudo apt-get install build-essential ffmpeg
  #         # Install UV
  #         curl -LsSf https://astral.sh/uv/install.sh | sh
  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
  #         sudo apt-get install -y libopencv-dev
  #         pip install --user --no-cache-dir grpcio-tools==1.64.1
  #     - name: Test petals
  #       run: |
  #          make --jobs=5 --output-sync=target -C backend/python/petals
  #          make --jobs=5 --output-sync=target -C backend/python/petals test
  # tests-bark:
  #   runs-on: ubuntu-latest
  #   steps:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -178,13 +178,22 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
      - name: Dependencies
        run: |
          # Install protoc
          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
          rm protoc.zip
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Build images
        run: |
          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
      - name: Test
        run: |
-          LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
+            PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
@@ -214,12 +223,13 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export CC=/opt/homebrew/opt/llvm/bin/clang
          # Used to run the newer GNUMake version from brew that supports --output-sync
          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -25,7 +25,7 @@ jobs:
        run: |
          make protogen-go swagger
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v6
+        uses: peter-evans/create-pull-request@v7
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.gitignore
+++ b/.gitignore
@@ -54,3 +54,6 @@ docs/static/gallery.html
 # backend virtual environments
 **/venv
 # per-developer customization files for the development container
 .devcontainer/customization/*
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -3,12 +3,12 @@
    "configurations": [
        {
            "name": "Python: Current File",
-            "type": "python",
+            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false,
-            "cwd": "${workspaceFolder}/examples/langchain-chroma",
+            "cwd": "${fileDirname}",
            "env": {
                "OPENAI_API_BASE": "http://localhost:8080/v1",
                "OPENAI_API_KEY": "abc"
@@ -19,15 +19,16 @@
            "type": "go",
            "request": "launch",
            "mode": "debug",
-            "program": "${workspaceFolder}/main.go",
+            "program": "${workspaceRoot}",
-            "args": [
+            "args": [],
                "api"
            ],
            "env": {
-                "C_INCLUDE_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
+                "LOCALAI_LOG_LEVEL": "debug",
-                "LIBRARY_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
+                "LOCALAI_P2P": "true",
-                "DEBUG": "true"
+                "LOCALAI_FEDERATED": "true"
-            }
+            },
            "buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
            "envFile": "${workspaceFolder}/.env",
            "cwd": "${workspaceRoot}"
        }
    ]
 }
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,8 +15,6 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
 - [Documentation](#documentation)
 - [Community and Communication](#community-and-communication)
 ## Getting Started
 ### Prerequisites
@@ -54,7 +52,7 @@ If you find a bug, have a feature request, or encounter any issues, please check
 ## Coding Guidelines
- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.
 ## Testing
@@ -84,5 +82,3 @@ We are welcome the contribution of the documents, please open new PR or create a
 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
 ---
--- a/120
+++ b/120
@@ -8,12 +8,12 @@ FROM ${BASE_IMAGE} AS requirements-core
 USER root
-ARG GO_VERSION=1.22.5
+ARG GO_VERSION=1.22.6
 ARG TARGETARCH
 ARG TARGETVARIANT
 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
 RUN apt-get update && \
@@ -30,7 +30,7 @@ RUN apt-get update && \
 # Install Go
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
-ENV PATH $PATH:/root/go/bin:/usr/local/go/bin
+ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
 # Install grpc compilers
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
@@ -39,15 +39,18 @@ RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
 RUN test -n "$TARGETARCH" \
    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
 RUN echo "Target Variant: $TARGETVARIANT"
 # Cuda
-ENV PATH /usr/local/cuda/bin:${PATH}
+ENV PATH=/usr/local/cuda/bin:${PATH}
 # HipBLAS requirements
-ENV PATH /opt/rocm/bin:${PATH}
+ENV PATH=/opt/rocm/bin:${PATH}
 # OpenBLAS requirements and stable diffusion
 RUN apt-get update && \
@@ -62,9 +65,6 @@ RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
 WORKDIR /build
 RUN test -n "$TARGETARCH" \
    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
 ###################################
 ###################################
@@ -81,7 +81,7 @@ RUN apt-get update && \
        espeak \
        python3-pip \
        python-is-python3 \
-        python3-dev \
+        python3-dev llvm \
        python3-venv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
@@ -217,13 +217,14 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
 ###################################
 ###################################
-# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
+# The builder-base target has the arguments, variables, and copies shared between full builder images and the uncompiled devcontainer
-# Adjustments to the build process should likely be made here.
+
-FROM requirements-drivers AS builder
+FROM requirements-drivers AS builder-base
 ARG GO_TAGS="stablediffusion tts p2p"
 ARG GRPC_BACKENDS
 ARG MAKEFLAGS
 ARG LD_FLAGS="-s -w"
 ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
@@ -231,14 +232,12 @@ ENV MAKEFLAGS=${MAKEFLAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENV LD_FLAGS=${LD_FLAGS}
 RUN echo "GO_TAGS: $GO_TAGS" && echo "TARGETARCH: $TARGETARCH"
 WORKDIR /build
 COPY . .
 COPY .git .
 RUN echo "GO_TAGS: $GO_TAGS"
 RUN make prepare
 # We need protoc installed, and the version in 22.04 is too old.  We will create one as part of installing the GRPC build below
 # but that will also bring in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
@@ -256,8 +255,35 @@ RUN <<EOT bash
    fi
 EOT
-# stablediffusion does not tolerate a newer version of abseil, build it first
+
-RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+###################################
 ###################################
 # This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
 # In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
 FROM builder-base AS builder-sd
 # stablediffusion does not tolerate a newer version of abseil, so copy over only the files needed to build it
 # Build inputs: the Makefile and the Go module files
 COPY Makefile .
 COPY go.mod .
 COPY go.sum .
 # The backend protocol definition, the stablediffusion backend sources and the two Go packages copied with them
 COPY backend/backend.proto ./backend/backend.proto
 COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
 COPY pkg/grpc ./pkg/grpc
 COPY pkg/stablediffusion ./pkg/stablediffusion
 # NOTE(review): presumably the Makefile expects to run inside a git work tree, hence the empty repository - confirm
 RUN git init
 # Run the Makefile target that provides sources/go-stable-diffusion
 RUN make sources/go-stable-diffusion
 # NOTE(review): presumably this empty file makes the prepare-sources make target look up to date,
 # so the other sources are not fetched in this stage - confirm against the Makefile
 RUN touch prepare-sources
 # Actually build the backend
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
 ###################################
 ###################################
 # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
 # Adjustments to the build process should likely be made here.
 FROM builder-sd AS builder
 # Install the pre-built GRPC
 COPY --from=grpc /opt/grpc /usr/local
@@ -265,8 +291,20 @@ COPY --from=grpc /opt/grpc /usr/local
 # Rebuild with default backends
 WORKDIR /build
 COPY . .
 COPY .git .
 RUN make prepare
 ## Build the binary
-RUN make build
+## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
 ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
 ## (both will use CUDA or hipblas for the actual computation)
 RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
    else \
        make build; \
    fi
 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
        mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
@@ -276,6 +314,40 @@ RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
 ###################################
 ###################################
 # The devcontainer target is not used on CI. It is a target for developers to use locally -
 # rather than copying files it mounts them locally and leaves building to the developer
 FROM builder-base AS devcontainer
 # Build argument: FFmpeg is installed below only when this is set to "true"
 ARG FFMPEG
 # Reuse the gRPC installation from the grpc stage and the stablediffusion backend built in builder-sd
 COPY --from=grpc /opt/grpc /usr/local
 COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
 # Place the devcontainer helper scripts at the image root
 COPY .devcontainer-scripts /.devcontainer-scripts
 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            ffmpeg && \
        apt-get clean && \
        rm -rf /var/lib/apt/lists/* \
    ; fi
 # Extra command-line tools; unlike the FFmpeg step, the apt lists are not removed afterwards (see the note below)
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ssh less wget
 # For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
 # Go tooling installed at its latest version: dlv (from the go-delve/delve module) and yq
 RUN go install github.com/go-delve/delve/cmd/dlv@latest
 RUN go install github.com/mikefarah/yq/v4@latest
 ###################################
 ###################################
 # This is the final target. The result of this target will be the image uploaded to the registry.
 # If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
 FROM requirements-drivers
@@ -326,7 +398,7 @@ COPY --from=builder /build/local-ai ./
 COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
 # do not let stablediffusion rebuild (requires an older version of absl)
-COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
+COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
 # Change the shell to bash so we can use [[ tests below
 SHELL ["/bin/bash", "-c"]
@@ -345,9 +417,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/transformers-musicgen \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/exllama \
    ; fi
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
@@ -356,9 +425,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$I
    if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/openvoice \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "petals" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/petals \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/sentencetransformers \
    ; fi && \
--- a/79
+++ b/79
@@ -8,11 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=0d6fb52be0c1b7e77eb855f3adc4952771c8ce4c
+CPPLLAMA_VERSION?=d5ed2b929d85bbd7dbeecb690880f07d9d7a6077
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
 GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -20,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=fe36c909715e6751277ddb020e7892c7670b61d4
+WHISPER_CPP_VERSION?=ccc2547210e09e3a1785817383ab770389bb442b
 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -190,7 +186,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
 ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
 ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
@@ -253,18 +248,6 @@ sources/go-piper:
 sources/go-piper/libpiper_binding.a: sources/go-piper
 	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
 ## GPT4ALL
 sources/gpt4all:
 	mkdir -p sources/gpt4all
 	cd sources/gpt4all && \
 	git init && \
 	git remote add origin $(GPT4ALL_REPO) && \
 	git fetch origin && \
 	git checkout $(GPT4ALL_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
 ## RWKV
 sources/go-rwkv.cpp:
@@ -318,7 +301,7 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
-get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
+get-sources: sources/go-llama.cpp sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
 replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
@@ -328,7 +311,6 @@ replace:
 	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
 	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
@@ -339,7 +321,6 @@ dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
 	$(GOCMD) mod edit -dropreplace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 prepare-sources: get-sources replace
@@ -349,7 +330,6 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
 	$(MAKE) -C sources/go-rwkv.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
@@ -379,6 +359,9 @@ clean-tests:
 	rm -rf test-dir
 	rm -rf core/http/backend-assets
 clean-dc: clean
 	cp -r /build/backend-assets /workspace/backend-assets
 ## Build:
 build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
@@ -396,7 +379,7 @@ build-minimal:
 	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
 build-api:
-	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
+	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=p2p $(MAKE) build
 backend-assets/lib:
 	mkdir -p backend-assets/lib
@@ -407,7 +390,7 @@ ifeq ($(DETECT_LIBS),true)
 	scripts/prepare-libs.sh backend-assets/grpc/llama-cpp-avx2
 endif
 ifeq ($(OS),Darwin)
-	$(info ${GREEN}I Skip CUDA/hipblas build on MacOS${RESET})
+	BUILD_TYPE=none $(MAKE) backend-assets/grpc/llama-cpp-fallback
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
@@ -469,8 +452,7 @@ test: prepare test-models/testmodel.ggml grpcs
 	export GO_TAGS="tts stablediffusion debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
@@ -486,7 +468,7 @@ run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
-run-e2e-aio:
+run-e2e-aio: protogen-go
 	@echo 'Running e2e AIO tests'
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
@@ -500,10 +482,6 @@ teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 test-gpt4all: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)
 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
@@ -559,10 +537,10 @@ protogen-go-clean:
 	$(RM) bin/*
 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -596,14 +574,6 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 .PHONY: exllama-protogen
 exllama-protogen:
 	$(MAKE) -C backend/python/exllama protogen
 .PHONY: exllama-protogen-clean
 exllama-protogen-clean:
 	$(MAKE) -C backend/python/exllama protogen-clean
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
@@ -620,14 +590,6 @@ mamba-protogen:
 mamba-protogen-clean:
 	$(MAKE) -C backend/python/mamba protogen-clean
 .PHONY: petals-protogen
 petals-protogen:
 	$(MAKE) -C backend/python/petals protogen
 .PHONY: petals-protogen-clean
 petals-protogen-clean:
 	$(MAKE) -C backend/python/petals protogen-clean
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
@@ -708,8 +670,6 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/openvoice
 	$(MAKE) -C backend/python/exllama
 	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2
 prepare-test-extra: protogen-python
@@ -730,12 +690,6 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
 	mkdir -p backend-assets/espeak-ng-data
 	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
 backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
 	mkdir -p backend-assets/gpt4all
 	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
 	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
 	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
 backend-assets/grpc: protogen-go replace
 	mkdir -p backend-assets/grpc
@@ -746,13 +700,6 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/bert-embeddings
 endif
 backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/gpt4all
 endif
 backend-assets/grpc/huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
 ifneq ($(UPX),)
@@ -893,7 +840,7 @@ endif
 backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/whisper
 endif
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) 
 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
@@ -68,10 +68,9 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 [💻 Getting started](https://localai.io/basics/getting_started/index.html)
-## 🔥🔥 Hot topics / Roadmap
+## 📰 Latest project news
 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 - Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
 - June 2024: 🆕 You can now browse the model gallery without LocalAI! Check out https://models.localai.io
 - June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
@@ -82,8 +81,12 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 - May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
 - April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
-Hot topics (looking for contributors):
+Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
 ## 🔥🔥 Hot topics (looking for help):
 - Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
 - Realtime API: https://github.com/mudler/LocalAI/issues/3714
 - 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o
 roles:
  user: "USER:"
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4-vision-preview
+name: gpt-4o
 roles:
  user: "USER:"
--- a/aio/intel/image-gen.yaml
+++ b/aio/intel/image-gen.yaml
@@ -1,6 +1,6 @@
 name: stablediffusion
 parameters:
-  model: runwayml/stable-diffusion-v1-5
+  model: Lykon/dreamshaper-8
 backend: diffusers
 step: 25
 f16: true
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 mmap: false
 f16: false
-name: gpt-4-vision-preview
+name: gpt-4o
 roles:
  user: "USER:"
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -16,6 +16,7 @@ service Backend {
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
@@ -25,6 +26,19 @@ service Backend {
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
  rpc Rerank(RerankRequest) returns (RerankResult) {}
  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
 }
 // Request message of the GetMetrics RPC; it carries no fields.
 message MetricsRequest {}
 // Response message of the GetMetrics RPC.
 // NOTE(review): the per-field notes below are read off the field names only - confirm against the backend that fills them in.
 message MetricsResponse {
  // presumably the id of the slot these metrics belong to
  int32 slot_id = 1;
  string prompt_json_for_slot = 2;  // Stores the prompt as a JSON string.
  // presumably the generation speed, in tokens per second
  float tokens_per_second = 3;
  // presumably the number of tokens generated
  int32 tokens_generated = 4;
  // presumably the number of prompt tokens processed
  int32 prompt_tokens_processed = 5;
 }
 message RerankRequest {
@@ -133,6 +147,9 @@ message PredictOptions {
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
  repeated string Videos = 45;
  repeated string Audios = 46;
  string CorrelationId = 47;
 }
 // The response message containing the result
@@ -270,6 +287,17 @@ message TTSRequest {
  optional string language = 5;
 }
 // Request message of the SoundGeneration RPC, which answers with a Result.
 // Fields 4-8 are declared `optional`, so a receiver can tell an unset field from a default value.
 // NOTE(review): the per-field notes below are read off the field names only - confirm against a backend implementation.
 message SoundGenerationRequest {
  // presumably the text prompt to generate sound from
  string text = 1;
  // presumably the model to use
  string model = 2;
  // presumably the destination of the generated audio
  string dst = 3;
  optional float duration = 4;
  optional float temperature = 5;
  optional bool sample = 6;
  // presumably an input audio source - TODO confirm
  optional string src = 7;
  optional int32 src_divisor = 8;
 }
 message TokenizationResponse {
  int32 length = 1;
  repeated int32 tokens = 2;
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -13,15 +13,15 @@
 #include <getopt.h>
 #include "clip.h"
 #include "llava.h"
 #include "log.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
 #include "llama.h"
 #include "grammar-parser.h"
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "utils.hpp"
-
+#include "sampling.h"
 // include std::regex
 #include <cstddef>
 #include <thread>
@@ -203,8 +203,8 @@ struct llama_client_slot
    std::string stopping_word;
    // sampling
-    struct llama_sampling_params sparams;
+    struct gpt_sampler_params sparams;
-    llama_sampling_context *ctx_sampling = nullptr;
+    gpt_sampler *ctx_sampling = nullptr;
    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@@ -449,7 +449,7 @@ struct llama_server_context
            LOG_INFO("Multi Modal Mode Enabled", {});
            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
-                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
+                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
                return false;
            }
@@ -458,10 +458,12 @@ struct llama_server_context
            }
        }
-        std::tie(model, ctx) = llama_init_from_gpt_params(params);
+        llama_init_result llama_init = llama_init_from_gpt_params(params);
        model = llama_init.model;
        ctx = llama_init.context;
        if (model == nullptr)
        {
-            LOG_ERROR("unable to load model", {{"model", params.model}});
+            LOG_ERR("unable to load model: %s", params.model.c_str());
            return false;
        }
@@ -469,7 +471,7 @@ struct llama_server_context
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
            const int n_embd_llm  = llama_n_embd(model);
            if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
                llama_free_model(model);
                return false;
@@ -478,7 +480,7 @@ struct llama_server_context
        n_ctx = llama_n_ctx(ctx);
-        add_bos_token = llama_should_add_bos_token(model);
+        add_bos_token = llama_add_bos_token(model);
        return true;
    }
@@ -488,11 +490,21 @@ struct llama_server_context
        std::vector<char> buf(1);
        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
        if (res < 0) {
-            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
        }
    }
    // Scans `slots` in order and yields a pointer to the first one whose
    // is_processing() is true; yields nullptr when no slot is processing.
    llama_client_slot* get_active_slot() {
        llama_client_slot* active = nullptr;
        for (llama_client_slot& candidate : slots) {
            if (!candidate.is_processing()) {
                continue;  // not processing, keep looking
            }
            active = &candidate;  // first processing slot wins
            break;
        }
        return active;
    }
    void initialize() {
        // create slots
        all_slots_are_idle = true;
@@ -617,7 +629,7 @@ struct llama_server_context
    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
        slot_params default_params;
-        llama_sampling_params default_sparams;
+        gpt_sampler_params default_sparams;
        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
@@ -626,7 +638,7 @@ struct llama_server_context
        slot->sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
        slot->sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
        slot->sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot->sparams.typical_p         = json_value(data, "typical_p",         default_sparams.typical_p);
+        slot->sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
        slot->sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
@@ -639,7 +651,7 @@ struct llama_server_context
        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
        slot->sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot->params.n_keep             = json_value(data, "n_keep",            slot->params.n_keep);
-        slot->params.seed               = json_value(data, "seed",              default_params.seed);
+        slot->sparams.seed               = json_value(data, "seed",              default_sparams.seed);
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
@@ -663,6 +675,7 @@ struct llama_server_context
            slot->params.input_prefix = "";
        }
        if (data.count("input_suffix") != 0)
        {
            slot->params.input_suffix = data["input_suffix"];
@@ -681,6 +694,10 @@ struct llama_server_context
            slot->prompt = "";
        }
        if (json_value(data, "ignore_eos", false)) {
                slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
        }
        /*
        slot->sparams.penalty_prompt_tokens.clear();
        slot->sparams.use_penalty_prompt_tokens = false;
        const auto &penalty_prompt = data.find("penalty_prompt");
@@ -716,14 +733,10 @@ struct llama_server_context
                slot->sparams.use_penalty_prompt_tokens = true;
            }
        }
      */
        slot->sparams.logit_bias.clear();
        if (json_value(data, "ignore_eos", false))
        {
            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
        }
        const auto &logit_bias = data.find("logit_bias");
        if (logit_bias != data.end() && logit_bias->is_array())
        {
@@ -751,7 +764,7 @@ struct llama_server_context
                        llama_token tok = el[0].get<llama_token>();
                        if (tok >= 0 && tok < n_vocab)
                        {
-                            slot->sparams.logit_bias[tok] = bias;
+                            slot->sparams.logit_bias.push_back({tok, bias});
                        }
                    }
                    else if (el[0].is_string())
@@ -759,13 +772,13 @@ struct llama_server_context
                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
-                            slot->sparams.logit_bias[tok] = bias;
+                            slot->sparams.logit_bias.push_back({tok, bias});
                        }
                    }
                }
            }
        }
-
+        
        slot->params.antiprompt.clear();
        const auto &stop = data.find("stop");
@@ -779,24 +792,22 @@ struct llama_server_context
                }
            }
        }
-
+        
-        const auto &samplers_sequence = data.find("samplers");
+        const auto & samplers = data.find("samplers");
-        if (samplers_sequence != data.end() && samplers_sequence->is_array())
+        if (samplers != data.end() && samplers->is_array()) {
        {
            std::vector<std::string> sampler_names;
-            for (const auto &sampler_name : *samplers_sequence)
+                for (const auto & name : *samplers) {
-            {
+                    if (name.is_string()) {
-                if (sampler_name.is_string())
+                        sampler_names.emplace_back(name);
-                {
+                    }
                    sampler_names.emplace_back(sampler_name);
                }
-            }
+                slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
            slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
        }
        else
        {
-            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
+                slot->sparams.samplers = default_sparams.samplers;
        }
        if (multimodal)
        {
@@ -812,10 +823,11 @@ struct llama_server_context
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
-                        LOG_ERROR("failed to load image", {
+                        LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", 
-                            {"slot_id",   slot->id},
+                             __func__,
-                            {"img_sl_id", img_sl.id}
+                             slot->id,
-                        });
+                             img_sl.id
                        );
                        return false;
                    }
                    LOG_VERBOSE("image loaded", {
@@ -853,12 +865,12 @@ struct llama_server_context
                                    }
                                }
                                if (!found) {
-                                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
+                                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
                                    slot->images.clear();
                                    return false;
                                }
                            } catch (const std::invalid_argument& e) {
-                                LOG_TEE("Invalid image number id in prompt\n");
+                                LOG("Invalid image number id in prompt\n");
                                slot->images.clear();
                                return false;
                            }
@@ -873,10 +885,10 @@ struct llama_server_context
        if (slot->ctx_sampling != nullptr)
        {
-            llama_sampling_free(slot->ctx_sampling);
+            gpt_sampler_free(slot->ctx_sampling);
        }
-        slot->ctx_sampling = llama_sampling_init(slot->sparams);
+        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
-        llama_set_rng_seed(ctx, slot->params.seed);
+        //llama_set_rng_seed(ctx, slot->params.seed);
        slot->command = LOAD_PROMPT;
        all_slots_are_idle = false;
@@ -886,7 +898,7 @@ struct llama_server_context
            {"task_id", slot->task_id},
        });
-        LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+      //  LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
        return true;
    }
@@ -926,7 +938,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view) != 0)
                {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG("%s: llama_decode() failed\n", __func__);
                    return;
                }
            }
@@ -938,7 +950,7 @@ struct llama_server_context
            }
        }
-        LOG_TEE("system prompt updated\n");
+        LOG("system prompt updated\n");
        system_need_update = false;
    }
@@ -1004,11 +1016,13 @@ struct llama_server_context
        slot.generated_text += token_str;
        slot.has_next_token = true;
 /*
        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
        {
            // we can change penalty_prompt_tokens because it is always created from scratch each request
            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
        }
        */
        // check if there is incomplete UTF-8 character at the end
        bool incomplete = false;
@@ -1117,8 +1131,8 @@ struct llama_server_context
                continue;
            }
-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG_TEE("Error processing the given image");
+                LOG("Error processing the given image");
                return false;
            }
@@ -1130,7 +1144,7 @@ struct llama_server_context
    void send_error(task_server& task, const std::string &error)
    {
-        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
+        LOG("task %i - error: %s\n", task.id, error.c_str());
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;
@@ -1142,13 +1156,11 @@ struct llama_server_context
    json get_formated_generation(llama_client_slot &slot)
    {
-        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
+        std::vector<std::string> samplers;
-        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
+        samplers.reserve(slot.sparams.samplers.size());
-                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+        for (const auto & sampler : slot.sparams.samplers)
        std::vector<std::string> samplers_sequence;
        for (const auto &sampler_type : slot.sparams.samplers_sequence)
        {
-            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
+            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
        }
        return json {
@@ -1163,13 +1175,11 @@ struct llama_server_context
            {"top_p",             slot.sparams.top_p},
            {"min_p",             slot.sparams.min_p},
            {"tfs_z",             slot.sparams.tfs_z},
-            {"typical_p",         slot.sparams.typical_p},
+            {"typical_p",         slot.sparams.typ_p},
            {"repeat_last_n",     slot.sparams.penalty_last_n},
            {"repeat_penalty",    slot.sparams.penalty_repeat},
            {"presence_penalty",  slot.sparams.penalty_present},
            {"frequency_penalty", slot.sparams.penalty_freq},
            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
            {"mirostat",          slot.sparams.mirostat},
            {"mirostat_tau",      slot.sparams.mirostat_tau},
            {"mirostat_eta",      slot.sparams.mirostat_eta},
@@ -1177,13 +1187,13 @@ struct llama_server_context
            {"stop",              slot.params.antiprompt},
            {"n_predict",         slot.params.n_predict},
            {"n_keep",            params.n_keep},
-            {"ignore_eos",        ignore_eos},
+            {"ignore_eos",        slot.sparams.ignore_eos},
            {"stream",            slot.params.stream},
-            {"logit_bias",        slot.sparams.logit_bias},
+      //      {"logit_bias",        slot.sparams.logit_bias},
            {"n_probs",           slot.sparams.n_probs},
            {"min_keep",          slot.sparams.min_keep},
            {"grammar",           slot.sparams.grammar},
-            {"samplers",          samplers_sequence}
+            {"samplers",          samplers}
        };
    }
@@ -1373,7 +1383,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view))
                {
-                    LOG_TEE("%s : failed to eval\n", __func__);
+                    LOG("%s : failed to eval\n", __func__);
                    return false;
                }
            }
@@ -1391,7 +1401,7 @@ struct llama_server_context
                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
                if (llama_decode(ctx, batch_img))
                {
-                    LOG_TEE("%s : failed to eval image\n", __func__);
+                    LOG("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
@@ -1574,7 +1584,7 @@ struct llama_server_context
                    slot.n_past = 0;
                    slot.truncated = false;
                    slot.has_next_token = true;
-                    LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
                    continue;
                    // END LOCALAI changes
@@ -1712,7 +1722,7 @@ struct llama_server_context
                    if (!slot.params.cache_prompt)
                    {
-                        llama_sampling_reset(slot.ctx_sampling);
+                        gpt_sampler_reset(slot.ctx_sampling);
                        slot.n_past = 0;
                        slot.n_past_se = 0;
@@ -1724,7 +1734,7 @@ struct llama_server_context
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {
-                            llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
+                            gpt_sampler_accept(slot.ctx_sampling, token, false);
                        }
                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1822,10 +1832,11 @@ struct llama_server_context
                    if (has_images && !ingest_images(slot, n_batch))
                    {
-                        LOG_ERROR("failed processing images", {
+                        LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d", 
-                            "slot_id", slot.id,
+                            __func__,
-                            "task_id", slot.task_id,
+                            slot.id,
-                        });
+                            slot.task_id
                        );
                        // FIXME @phymbert: to be properly tested
                        //  early returning without changing the slot state will block the slot for ever
                        // no one at the moment is checking the return value
@@ -1865,10 +1876,10 @@ struct llama_server_context
                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
-                        LOG_TEE("\n");
+                        LOG("\n");
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                        LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1878,7 +1889,7 @@ struct llama_server_context
                        slot.ga_i += slot.ga_w / slot.ga_n;
-                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                        LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                    }
                    slot.n_past_se += n_tokens;
                }
@@ -1903,11 +1914,11 @@ struct llama_server_context
                if (n_batch == 1 || ret < 0)
                {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return false;
                }
-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+                LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
@@ -1932,9 +1943,9 @@ struct llama_server_context
                }
                completion_token_output result;
-                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
+                const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
-                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
+                gpt_sampler_accept(slot.ctx_sampling, id, true);
                slot.n_decoded += 1;
                if (slot.n_decoded == 1)
@@ -1944,19 +1955,14 @@ struct llama_server_context
                    metrics.on_prompt_eval(slot);
                }
                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
                result.tok = id;
                const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
-                const int32_t n_probs = slot.sparams.n_probs;
+                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
-                if (slot.sparams.temp <= 0 && n_probs > 0)
+                    result.probs.push_back({
-                {
+                        cur_p->data[i].id,
-                    // for llama_sample_token_greedy we need to sort candidates
+                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                    llama_sample_softmax(ctx, &cur_p);
+                    });
                }
                for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
                {
                    result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
                }
                if (!process_token(result, slot))
@@ -2110,6 +2116,9 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
    data["ignore_eos"] = predict->ignoreeos();
    data["embeddings"] = predict->embeddings();
    // Add the correlationid to json data
    data["correlation_id"] = predict->correlationid();
    // for each image in the request, add the image data
    //
    for (int i = 0; i < predict->images_size(); i++) {
@@ -2208,7 +2217,7 @@ static void params_parse(const backend::ModelOptions* request,
    params.model_alias =  request->modelfile();
    params.n_ctx = request->contextsize();
    //params.memory_f16 = request->f16memory();
-    params.n_threads = request->threads();
+    params.cpuparams.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();
    // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
@@ -2258,7 +2267,7 @@ static void params_parse(const backend::ModelOptions* request,
     }
     // get the directory of modelfile
     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
-     params.lora_adapter.push_back(std::make_tuple(model_dir + "/"+request->loraadapter(), scale_factor));
+     params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
    }
    params.use_mlock = request->mlock();
    params.use_mmap = request->mmap();
@@ -2348,6 +2357,11 @@ public:
                int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
                reply.set_prompt_tokens(tokens_evaluated);
                // Log Request Correlation Id
                LOG_VERBOSE("correlation:", {
                    { "id", data["correlation_id"] }
                });
                // Send the reply
                writer->Write(reply);
@@ -2371,6 +2385,12 @@ public:
        std::string completion_text;
        task_result result = llama.queue_results.recv(task_id);
        if (!result.error && result.stop) {
            // Log Request Correlation Id
            LOG_VERBOSE("correlation:", {
                { "id", data["correlation_id"] }
            });
            completion_text = result.result_json.value("content", "");
            int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
            int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
@@ -2410,6 +2430,31 @@ public:
        return grpc::Status::OK;
    }
    // Fills `response` with metrics for the slot returned by llama.get_active_slot().
    // `context` and `request` are not read by this handler.
    // When there is no active slot every field is written as zero / empty string.
    // Always returns grpc::Status::OK.
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();
        if (active_slot != nullptr) {
            // Tokens per second = 1e3 / t_token_generation * n_decoded.
            // Guard the division: a slot that has not accumulated any generation time yet
            // would otherwise produce inf (or NaN when n_decoded is also 0) in the response.
            double tokens_per_second = 0.0;
            if (active_slot->t_token_generation > 0) {
                tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;
            }
            // Populate the response with metrics
            response->set_slot_id(active_slot->id);
            response->set_prompt_json_for_slot(active_slot->prompt.dump());
            response->set_tokens_per_second(tokens_per_second);
            response->set_tokens_generated(active_slot->n_decoded);
            response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);
        } else {
            // Handle case when no active slot exists
            response->set_slot_id(0);
            response->set_prompt_json_for_slot("");
            response->set_tokens_per_second(0);
            response->set_tokens_generated(0);
            response->set_prompt_tokens_processed(0);
        }
        return grpc::Status::OK;
    }
 };
 void RunServer(const std::string& server_address) {
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -0,0 +1,13 @@
 diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
 index 342042ff..224db9b5 100644
 --- a/examples/llava/clip.cpp
 +++ b/examples/llava/clip.cpp
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
             int* patches_data = (int*)malloc(ggml_nbytes(patches));
             for (int i = 0; i < num_patches; i++) {
 -                patches_data[i] = i + 1;
 +                patches_data[i] = i;
             }
             ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
             free(patches_data);
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -1,5 +1,12 @@
 #!/bin/bash
 ## Patches
 ## Apply every patch file found in the `patches` directory to the llama.cpp checkout.
 ## A glob is used instead of parsing `ls` output so that file names containing
 ## whitespace or glob characters are passed to `patch` intact.
 for patch_file in patches/*; do
    # An unmatched glob stays literal (empty or missing directory); skip anything that is not a regular file.
    [ -f "$patch_file" ] || continue
    echo "Applying patch ${patch_file##*/}"
    patch -d llama.cpp/ -p1 < "$patch_file"
 done
 # Copy the grpc-server sources into the llama.cpp examples tree.
 cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
 cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
 cp -rfv json.hpp llama.cpp/examples/grpc-server/
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -480,31 +480,4 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
    }
    return ret;
 }
 //
 // random string / id
 //
 // Returns a 32-character string whose characters are drawn from the digits,
 // upper-case and lower-case ASCII letters, using a std::mt19937 engine seeded
 // from std::random_device on every call.
 static std::string random_string()
 {
    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::string out;
    out.reserve(32);
    // One engine draw per output character, reduced modulo the alphabet size.
    while (out.size() < 32) {
        out.push_back(alphabet[engine() % alphabet.size()]);
    }
    return out;
 }
 // Returns an identifier made of the literal prefix "chatcmpl-" followed by
 // the result of random_string().
 static std::string gen_chatcmplid()
 {
    return "chatcmpl-" + random_string();
 }
--- a/backend/go/llm/gpt4all/gpt4all.go
+++ b/backend/go/llm/gpt4all/gpt4all.go
@@ -1,62 +0,0 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
 )
 // LLM is the gpt4all backend type handed to the gRPC server.
 // It embeds base.SingleThread and keeps the model handle set by Load.
 type LLM struct {
 	base.SingleThread
 	// gpt4all is the model handle assigned in Load and used by Predict / PredictStream.
 	gpt4all *gpt4all.Model
 }
 // Load builds a gpt4all model from opts.ModelFile, configured with
 // opts.Threads and opts.LibrarySearchPath. The returned handle is stored on
 // the receiver regardless of the outcome, and the constructor error is returned.
 func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	threads := gpt4all.SetThreads(int(opts.Threads))
 	searchPath := gpt4all.SetLibrarySearchPath(opts.LibrarySearchPath)
 	loaded, loadErr := gpt4all.New(opts.ModelFile, threads, searchPath)
 	llm.gpt4all = loaded
 	return loadErr
 }
 // buildPredictOptions translates the request into gpt4all predict options:
 // temperature, top-p, top-k and token count are always set; the batch size
 // is appended only when opts.Batch is non-zero.
 func buildPredictOptions(opts *pb.PredictOptions) []gpt4all.PredictOption {
 	options := make([]gpt4all.PredictOption, 0, 5)
 	options = append(options,
 		gpt4all.SetTemperature(float64(opts.Temperature)),
 		gpt4all.SetTopP(float64(opts.TopP)),
 		gpt4all.SetTopK(int(opts.TopK)),
 		gpt4all.SetTokens(int(opts.Tokens)),
 	)
 	if opts.Batch == 0 {
 		return options
 	}
 	return append(options, gpt4all.SetBatch(int(opts.Batch)))
 }
 // Predict runs a prediction for opts.Prompt on the loaded model, with the
 // options derived by buildPredictOptions, and returns the model's result and error.
 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	predictOptions := buildPredictOptions(opts)
 	return llm.gpt4all.Predict(opts.Prompt, predictOptions...)
 }
 // PredictStream starts a prediction in a goroutine and returns nil immediately.
 // Every token reported by the model is sent on results; once the prediction
 // returns, the token callback is cleared and results is closed.
 // A prediction error is only printed, it is not reported to the caller.
 func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	predictOptions := buildPredictOptions(opts)
 	// forward pushes each token to the results channel and asks the model to continue.
 	forward := func(token string) bool {
 		results <- token
 		return true
 	}
 	go func() {
 		llm.gpt4all.SetTokenCallback(forward)
 		if _, err := llm.gpt4all.Predict(opts.Prompt, predictOptions...); err != nil {
 			fmt.Println("err: ", err)
 		}
 		llm.gpt4all.SetTokenCallback(nil)
 		close(results)
 	}()
 	return nil
 }
--- a/backend/go/llm/gpt4all/main.go
+++ b/backend/go/llm/gpt4all/main.go
@@ -1,21 +0,0 @@
 package main
 // Note: this is started internally by LocalAI and a server is allocated for each model
 import (
 	"flag"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 // addr holds the value of the -addr flag (default "localhost:50051"); it is passed to grpc.StartServer.
 var addr = flag.String("addr", "localhost:50051", "the address to connect to")
 // main parses the command-line flags and starts the gRPC server with an LLM
 // backend; it panics when the server returns an error.
 func main() {
 	flag.Parse()
 	err := grpc.StartServer(*addr, &LLM{})
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/transcribe/transcript.go
+++ b/backend/go/transcribe/transcript.go
@@ -1,104 +0,0 @@
 package main
 import (
 	"fmt"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	"github.com/go-audio/wav"
 	"github.com/mudler/LocalAI/core/schema"
 )
 // ffmpegCommand runs the ffmpeg binary with args under the current process
 // environment and returns its combined stdout/stderr together with the run error.
 func ffmpegCommand(args []string) (string, error) {
 	ffmpeg := exec.Command("ffmpeg", args...) // Constrain this to ffmpeg to permit security scanner to see that the command is safe.
 	ffmpeg.Env = os.Environ()
 	combined, runErr := ffmpeg.CombinedOutput()
 	return string(combined), runErr
 }
 // audioToWav converts the audio file at src into a wav file at dst for
 // transcription by invoking ffmpeg (16000 Hz, one channel, pcm_s16le).
 // On failure the ffmpeg output is included in the returned error.
 // TODO: use https://github.com/mccoyst/ogg?
 func audioToWav(src, dst string) error {
 	out, err := ffmpegCommand([]string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst})
 	if err == nil {
 		return nil
 	}
 	return fmt.Errorf("error: %w out: %s", err, out)
 }
 // Transcript converts audiopath to wav inside a temporary directory, decodes
 // it to float32 samples and runs them through a new context of model.
 // language selects the context language ("auto" when empty), translate
 // enables translation and threads sets the context thread count.
 // The temporary directory is removed on return. Segment iteration stops at
 // the first error from NextSegment, which is not reported to the caller.
 func Transcript(model whisper.Model, audiopath, language string, translate bool, threads uint) (schema.TranscriptionResult, error) {
 	result := schema.TranscriptionResult{}
 	workDir, err := os.MkdirTemp("", "whisper")
 	if err != nil {
 		return result, err
 	}
 	defer os.RemoveAll(workDir)
 	wavPath := filepath.Join(workDir, "converted.wav")
 	if err = audioToWav(audiopath, wavPath); err != nil {
 		return result, err
 	}
 	// Open the converted samples
 	wavFile, err := os.Open(wavPath)
 	if err != nil {
 		return result, err
 	}
 	defer wavFile.Close()
 	// Decode the whole file into a PCM buffer
 	pcm, err := wav.NewDecoder(wavFile).FullPCMBuffer()
 	if err != nil {
 		return result, err
 	}
 	samples := pcm.AsFloat32Buffer().Data
 	// Configure the whisper context
 	ctx, err := model.NewContext()
 	if err != nil {
 		return result, err
 	}
 	ctx.SetThreads(threads)
 	lang := language
 	if lang == "" {
 		lang = "auto"
 	}
 	ctx.SetLanguage(lang)
 	if translate {
 		ctx.SetTranslate(true)
 	}
 	if err = ctx.Process(samples, nil, nil); err != nil {
 		return result, err
 	}
 	// Collect segments until NextSegment reports an error
 	for {
 		seg, segErr := ctx.NextSegment()
 		if segErr != nil {
 			break
 		}
 		var tokenIDs []int
 		for _, tok := range seg.Tokens {
 			tokenIDs = append(tokenIDs, tok.Id)
 		}
 		result.Segments = append(result.Segments, schema.Segment{
 			Id:     seg.Num,
 			Text:   seg.Text,
 			Start:  seg.Start,
 			End:    seg.End,
 			Tokens: tokenIDs,
 		})
 		result.Text += seg.Text
 	}
 	return result, nil
 }
--- a/backend/go/transcribe/whisper.go
+++ b/backend/go/transcribe/whisper.go
@@ -1,26 +0,0 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // Whisper is the whisper transcription backend type.
 // It embeds base.SingleThread and keeps the model set by Load.
 type Whisper struct {
 	base.SingleThread
 	// whisper is the model assigned in Load and passed to Transcript.
 	whisper whisper.Model
 }
 // Load creates the whisper model from opts.ModelFile, stores it on the
 // receiver (even when creation failed) and returns the creation error.
 func (sd *Whisper) Load(opts *pb.ModelOptions) error {
 	var loadErr error
 	// Note: the Model here is a path to a directory containing the model files
 	sd.whisper, loadErr = whisper.New(opts.ModelFile)
 	return loadErr
 }
 // AudioTranscription delegates to Transcript with the loaded model and the
 // request's Dst, Language, Translate and Threads fields.
 func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
 	threads := uint(opts.Threads)
 	return Transcript(sd.whisper, opts.Dst, opts.Language, opts.Translate, threads)
 }
--- a/backend/go/transcribe/whisper/main.go
+++ b/backend/go/transcribe/whisper/main.go
--- a/backend/go/transcribe/whisper/whisper.go
+++ b/backend/go/transcribe/whisper/whisper.go
@@ -0,0 +1,105 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"os"
 	"path/filepath"
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	"github.com/go-audio/wav"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/utils"
 )
 // Whisper is the whisper transcription backend type.
 // It embeds base.SingleThread and keeps the model set by Load.
 type Whisper struct {
 	base.SingleThread
 	// whisper is the model assigned in Load and used by AudioTranscription.
 	whisper whisper.Model
 }
 // Load creates the whisper model from opts.ModelFile, stores it on the
 // receiver (even when creation failed) and returns the creation error.
 func (sd *Whisper) Load(opts *pb.ModelOptions) error {
 	var loadErr error
 	// Note: the Model here is a path to a directory containing the model files
 	sd.whisper, loadErr = whisper.New(opts.ModelFile)
 	return loadErr
 }
 // AudioTranscription converts opts.Dst to wav inside a temporary directory,
 // decodes it to float32 samples and runs them through a new context of the
 // loaded model. opts.Language selects the context language ("auto" when
 // empty), opts.Translate enables translation and opts.Threads sets the
 // context thread count. The temporary directory is removed on return.
 // Segment iteration stops at the first error from NextSegment, which is not
 // reported to the caller. Every failure path returns an empty result.
 func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
 	workDir, err := os.MkdirTemp("", "whisper")
 	if err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	defer os.RemoveAll(workDir)
 	wavPath := filepath.Join(workDir, "converted.wav")
 	if err = utils.AudioToWav(opts.Dst, wavPath); err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	// Open the converted samples
 	wavFile, err := os.Open(wavPath)
 	if err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	defer wavFile.Close()
 	// Decode the whole file into a PCM buffer
 	pcm, err := wav.NewDecoder(wavFile).FullPCMBuffer()
 	if err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	samples := pcm.AsFloat32Buffer().Data
 	// Configure the whisper context
 	ctx, err := sd.whisper.NewContext()
 	if err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	ctx.SetThreads(uint(opts.Threads))
 	lang := opts.Language
 	if lang == "" {
 		lang = "auto"
 	}
 	ctx.SetLanguage(lang)
 	if opts.Translate {
 		ctx.SetTranslate(true)
 	}
 	if err = ctx.Process(samples, nil, nil); err != nil {
 		return pb.TranscriptResult{}, err
 	}
 	// Collect segments until NextSegment reports an error
 	segments := []*pb.TranscriptSegment{}
 	text := ""
 	for {
 		seg, segErr := ctx.NextSegment()
 		if segErr != nil {
 			break
 		}
 		var tokenIDs []int32
 		for _, tok := range seg.Tokens {
 			tokenIDs = append(tokenIDs, int32(tok.Id))
 		}
 		segments = append(segments, &pb.TranscriptSegment{
 			Id:     int32(seg.Num),
 			Text:   seg.Text,
 			Start:  int64(seg.Start),
 			End:    int64(seg.End),
 			Tokens: tokenIDs,
 		})
 		text += seg.Text
 	}
 	return pb.TranscriptResult{Segments: segments, Text: text}, nil
 }
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.65.4
+grpcio==1.66.2
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements-cpu.txt
+++ b/backend/python/bark/requirements-cpu.txt
@@ -0,0 +1,4 @@
 transformers
 accelerate
 torch
 torchaudio
--- a/backend/python/bark/requirements-cublas11.txt
+++ b/backend/python/bark/requirements-cublas11.txt
@@ -1,3 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch
-torchaudio
+torchaudio
 transformers
 accelerate
--- a/backend/python/bark/requirements-cublas12.txt
+++ b/backend/python/bark/requirements-cublas12.txt
@@ -1,2 +1,4 @@
 torch
-torchaudio
+torchaudio
 transformers
 accelerate
--- a/backend/python/bark/requirements-hipblas.txt
+++ b/backend/python/bark/requirements-hipblas.txt
@@ -1,3 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch
-torchaudio
+torchaudio
 transformers
 accelerate
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -3,4 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,6 +1,4 @@
 accelerate
 bark==0.1.5
-grpcio==1.65.4
+grpcio==1.66.2
 protobuf
-certifi
+certifi
 transformers
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -18,10 +18,23 @@
 # source $(dirname $0)/../common/libbackend.sh
 #
 function init() {
    # Name of the backend (directory name)
    BACKEND_NAME=${PWD##*/}
    # Path where all backends files are
    MY_DIR=$(realpath `dirname $0`)
    # Build type
    BUILD_PROFILE=$(getBuildProfile)
    # Environment directory
    EDIR=${MY_DIR}
    # Allow to specify a custom env dir for shared environments
    if [ "x${ENV_DIR}" != "x" ]; then
        EDIR=${ENV_DIR}
    fi
    # If a backend has defined a list of valid build profiles...
    if [ ! -z "${LIMIT_TARGETS}" ]; then
        isValidTarget=$(checkTargets ${LIMIT_TARGETS})
@@ -74,13 +87,14 @@ function getBuildProfile() {
 # This function is idempotent, so you can call it as many times as you want and it will
 # always result in an activated virtual environment
 function ensureVenv() {
-    if [ ! -d "${MY_DIR}/venv" ]; then
+    if [ ! -d "${EDIR}/venv" ]; then
-        uv venv ${MY_DIR}/venv
+        uv venv ${EDIR}/venv
        echo "virtualenv created"
    fi
-    
+
-    if [ "x${VIRTUAL_ENV}" != "x${MY_DIR}/venv" ]; then
+    # Source if we are not already in a Virtual env
-        source ${MY_DIR}/venv/bin/activate
+    if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
        source ${EDIR}/venv/bin/activate
        echo "virtualenv activated"
    fi
@@ -113,21 +127,25 @@ function installRequirements() {
    # These are the requirements files we will attempt to install, in order
    declare -a requirementFiles=(
-        "${MY_DIR}/requirements-install.txt"
+        "${EDIR}/requirements-install.txt"
-        "${MY_DIR}/requirements.txt"
+        "${EDIR}/requirements.txt"
-        "${MY_DIR}/requirements-${BUILD_TYPE}.txt"
+        "${EDIR}/requirements-${BUILD_TYPE}.txt"
    )
    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
-        requirementFiles+=("${MY_DIR}/requirements-${BUILD_PROFILE}.txt")
+        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
    fi
    # if BUILD_TYPE is empty, we are a CPU build, so we should try to install the CPU requirements
    if [ "x${BUILD_TYPE}" == "x" ]; then
-        requirementFiles+=("${MY_DIR}/requirements-cpu.txt")
+        requirementFiles+=("${EDIR}/requirements-cpu.txt")
    fi
-    requirementFiles+=("${MY_DIR}/requirements-after.txt")
+    requirementFiles+=("${EDIR}/requirements-after.txt")
    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
    fi
    for reqFile in ${requirementFiles[@]}; do
        if [ -f ${reqFile} ]; then
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.65.4
+grpcio==1.66.2
 protobuf
--- a/backend/python/coqui/requirements-cpu.txt
+++ b/backend/python/coqui/requirements-cpu.txt
@@ -0,0 +1,3 @@
 transformers
 accelerate
 torch
--- a/backend/python/coqui/requirements-cublas11.txt
+++ b/backend/python/coqui/requirements-cublas11.txt
@@ -1,3 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch
-torchaudio
+torchaudio
 transformers
 accelerate
--- a/backend/python/coqui/requirements-cublas12.txt
+++ b/backend/python/coqui/requirements-cublas12.txt
@@ -1,2 +1,4 @@
 torch
-torchaudio
+torchaudio
 transformers
 accelerate
--- a/backend/python/coqui/requirements-hipblas.txt
+++ b/backend/python/coqui/requirements-hipblas.txt
@@ -1,3 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch
-torchaudio
+torchaudio
 transformers
 accelerate
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -3,4 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,6 +1,4 @@
-accelerate
+coqui-tts
-TTS==0.22.0
+grpcio==1.66.2
 grpcio==1.65.4
 protobuf
-certifi
+certifi
 transformers
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -18,13 +18,13 @@ import backend_pb2_grpc
 import grpc
 from diffusers import StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
-    EulerAncestralDiscreteScheduler
+    EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
 from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image, export_to_video
 from compel import Compel, ReturnedEmbeddingsType
-
+from optimum.quanto import freeze, qfloat8, quantize
-from transformers import CLIPTextModel
+from transformers import CLIPTextModel, T5EncoderModel
 from safetensors.torch import load_file
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -163,10 +163,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            modelFile = request.Model
            self.cfg_scale = 7
            self.PipelineType = request.PipelineType
            if request.CFGScale != 0:
                self.cfg_scale = request.CFGScale
-            clipmodel = "runwayml/stable-diffusion-v1-5"
+            clipmodel = "Lykon/dreamshaper-8"
            if request.CLIPModel != "":
                clipmodel = request.CLIPModel
            clipsubfolder = "text_encoder"
@@ -244,6 +246,30 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        torch_dtype=torchType,
                        use_safetensors=True,
                        variant=variant)
            elif request.PipelineType == "FluxPipeline":
                    self.pipe = FluxPipeline.from_pretrained(
                        request.Model,
                        torch_dtype=torch.bfloat16)
                    if request.LowVRAM:
                        self.pipe.enable_model_cpu_offload()
            elif request.PipelineType == "FluxTransformer2DModel":
                    dtype = torch.bfloat16
                    # specify from environment or default to "ChuckMcSneed/FLUX.1-dev"
                    bfl_repo = os.environ.get("BFL_REPO", "ChuckMcSneed/FLUX.1-dev")
                    transformer = FluxTransformer2DModel.from_single_file(modelFile, torch_dtype=dtype)
                    quantize(transformer, weights=qfloat8)
                    freeze(transformer)
                    text_encoder_2 = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype)
                    quantize(text_encoder_2, weights=qfloat8)
                    freeze(text_encoder_2)
                    self.pipe = FluxPipeline.from_pretrained(bfl_repo, transformer=None, text_encoder_2=None, torch_dtype=dtype)
                    self.pipe.transformer = transformer
                    self.pipe.text_encoder_2 = text_encoder_2
                    if request.LowVRAM:
                        self.pipe.enable_model_cpu_offload()
            if CLIPSKIP and request.CLIPSkip != 0:
                self.clip_skip = request.CLIPSkip
@@ -399,6 +425,13 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                request.seed
            )
        if self.PipelineType == "FluxPipeline":
            kwargs["max_sequence_length"] = 256
        if self.PipelineType == "FluxTransformer2DModel":
            kwargs["output_type"] = "pil"
            kwargs["generator"] = torch.Generator("cpu").manual_seed(0)
        if self.img2vid:
            # Load the conditioning image
            image = load_image(request.src)
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -0,0 +1,9 @@
 diffusers
 opencv-python
 transformers
 accelerate
 compel
 peft
 sentencepiece
 torch
 optimum-quanto
--- a/backend/python/diffusers/requirements-cublas11.txt
+++ b/backend/python/diffusers/requirements-cublas11.txt
@@ -1,2 +1,10 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch
 diffusers
 opencv-python
 transformers
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/diffusers/requirements-cublas12.txt
+++ b/backend/python/diffusers/requirements-cublas12.txt
@@ -1 +1,9 @@
 torch
 diffusers
 opencv-python
 transformers
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/diffusers/requirements-hipblas.txt
+++ b/backend/python/diffusers/requirements-hipblas.txt
@@ -1,3 +1,11 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
+torch==2.3.1+rocm6.0
-torchvision
+torchvision==0.18.1+rocm6.0
 diffusers
 opencv-python
 transformers
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,4 +3,12 @@ intel-extension-for-pytorch
 torch
 torchvision
 optimum[openvino]
-setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 diffusers
 opencv-python
 transformers
 accelerate
 compel
 peft
 sentencepiece
 optimum-quanto
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,12 +1,5 @@
 setuptools
-accelerate
+grpcio==1.66.2
 compel
 peft
 diffusers
 grpcio==1.65.4
 opencv-python
 pillow
 protobuf
 sentencepiece
 transformers
 certifi
--- a/backend/python/diffusers/test.py
+++ b/backend/python/diffusers/test.py
@@ -53,7 +53,7 @@ class TestBackendServicer(unittest.TestCase):
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
@@ -71,7 +71,7 @@ class TestBackendServicer(unittest.TestCase):
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
                print(response.message)
                self.assertTrue(response.success)
                image_req = backend_pb2.GenerateImageRequest(positive_prompt="cat", width=16,height=16, dst="test.jpg")
@@ -81,4 +81,4 @@ class TestBackendServicer(unittest.TestCase):
            print(err)
            self.fail("Image gen service failed")
        finally:
-            self.tearDown()
+            self.tearDown()
--- a/backend/python/exllama/.gitignore
+++ b/backend/python/exllama/.gitignore
@@ -1 +0,0 @@
 source
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -1,25 +0,0 @@
 # Build, run and clean targets for the exllama Python backend.
 # Exported so recipes see it in their environment; it is also passed as the
 # first argument to install.sh by the `exllama` target below.
 export CONDA_ENV_PATH = "exllama.yml"
 # Default target: generate the gRPC stubs, then run the installer.
 .PHONY: exllama
 exllama: protogen
 	bash install.sh ${CONDA_ENV_PATH}
 # Start the backend through run.sh (stubs are generated first).
 .PHONY: run
 run: protogen
 	@echo "Running exllama..."
 	bash run.sh
 	@echo "exllama run."
 # Alias that depends on both generated stub files.
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 # Remove the generated stub files only.
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 # Generate the Python protobuf/gRPC modules from backend.proto, using ../..
 # as the proto include path. Both files come from one protoc invocation.
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 # Remove generated stubs plus the venv, the `source` directory and bytecode caches.
 .PHONY: clean
 clean: protogen-clean
 	$(RM) -r venv source __pycache__
--- a/backend/python/exllama/README.md
+++ b/backend/python/exllama/README.md
@@ -1,5 +0,0 @@
 # Creating a separate environment for the exllama project
 ```
 make exllama
 ```
--- a/backend/python/exllama/backend.py
+++ b/backend/python/exllama/backend.py
@@ -1,159 +0,0 @@
 #!/usr/bin/env python3
 import grpc
 from concurrent import futures
 import time
 import backend_pb2
 import backend_pb2_grpc
 import argparse
 import signal
 import sys
 import os, glob
 from pathlib import Path
 import torch
 import torch.nn.functional as F
 from torch import version as torch_version
 from source.tokenizer import ExLlamaTokenizer
 from source.generator import ExLlamaGenerator
 from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
 # Length of one sleep interval (24 hours, in seconds)
 _ONE_DAY_IN_SECONDS = 24 * 60 * 60
 # Worker count comes from PYTHON_GRPC_MAX_WORKERS when set, otherwise 1
 MAX_WORKERS = int(os.getenv('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    gRPC servicer that serves text generation from an ExLlama model.
    LoadModel has to succeed before Predict or PredictStream are used: it is the
    only place where self.model, self.tokenizer, self.cache and self.generator
    are created.
    """
    def generate(self, prompt, max_new_tokens):
        """
        Samples up to max_new_tokens tokens that continue the prompt.
        Args:
            prompt: The text to continue.
            max_new_tokens: Upper bound on the number of tokens to sample.
        Returns:
            str: The decoded newly generated text (empty when max_new_tokens is 0).
        """
        self.generator.end_beam_search()
        # Tokenizing the input
        ids = self.generator.tokenizer.encode(prompt)
        self.generator.gen_begin_reuse(ids)
        # Sequence length before any new token is sampled; used below to decode
        # only the newly generated part.
        initial_len = self.generator.sequence[0].shape[0]
        has_leading_space = False
        decoded_text = ''
        for i in range(max_new_tokens):
            token = self.generator.gen_single_token()
            # A first piece starting with the '▁' marker is treated as a leading
            # space, which is re-added to the decoded text below.
            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
                has_leading_space = True
            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
            if has_leading_space:
                decoded_text = ' ' + decoded_text
            # Stop as soon as the end-of-sequence token has been sampled.
            if token.item() == self.generator.tokenizer.eos_token_id:
                break
        return decoded_text
    def Health(self, request, context):
        """
        Returns a health check message.
        Args:
            request: The health check request (unused).
            context: The gRPC context (unused).
        Returns:
            backend_pb2.Reply: A reply whose message is b"OK".
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        Loads an ExLlama model from the directory given in request.ModelFile.
        The directory is expected to contain tokenizer.model, config.json and at
        least one *.safetensors file (the first match is used).
        Args:
            request: The load model request (uses ModelFile, ContextSize, RopeFreqScale).
            context: The gRPC context (unused).
        Returns:
            backend_pb2.Result: success=True once loaded; success=False with the
            error description when any step raises.
        """
        try:
            # https://github.com/turboderp/exllama/blob/master/example_cfg.py
            model_directory = request.ModelFile
            # Locate files we need within that directory
            tokenizer_path = os.path.join(model_directory, "tokenizer.model")
            model_config_path = os.path.join(model_directory, "config.json")
            st_pattern = os.path.join(model_directory, "*.safetensors")
            # An empty match list raises IndexError, reported as a failure below.
            model_path = glob.glob(st_pattern)[0]
            # Create config, model, tokenizer and generator
            config = ExLlamaConfig(model_config_path)               # create config from config.json
            config.model_path = model_path                          # supply path to model weights file
            if (request.ContextSize):
                config.max_seq_len = request.ContextSize            # override max sequence length
                config.max_attention_size = request.ContextSize**2  # Should be set to context_size^2.
                # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
            # Set Rope scaling.
            if (request.RopeFreqScale):
                # Alpha value for Rope scaling.
                # Higher value increases context but adds perplexity.
                # alpha_value and compress_pos_emb are mutually exclusive.
                # https://github.com/turboderp/exllama/issues/115
                config.alpha_value = request.RopeFreqScale
                config.calculate_rotary_embedding_base()
            model = ExLlama(config)                                 # create ExLlama instance and load the weights
            tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
            cache = ExLlamaCache(model, batch_size = 2)             # create cache for inference
            generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator
            self.generator = generator
            self.model = model
            self.tokenizer = tokenizer
            self.cache = cache
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def Predict(self, request, context):
        """
        Generates text for request.Prompt with the loaded model.
        Args:
            request: The predict request (uses Prompt, Penalty, Temperature, TopK, TopP, Tokens).
            context: The gRPC context (unused).
        Returns:
            backend_pb2.Result: The generated text, UTF-8 encoded, with the prompt removed.
        """
        # A penalty of 0.0 is taken as "not specified" and replaced by 1.15.
        penalty = 1.15
        if request.Penalty != 0.0:
            penalty = request.Penalty
        self.generator.settings.token_repetition_penalty_max = penalty
        self.generator.settings.temperature = request.Temperature
        self.generator.settings.top_k = request.TopK
        self.generator.settings.top_p = request.TopP
        # Likewise, 0 tokens is taken as "not specified" and replaced by 512.
        tokens = 512
        if request.Tokens != 0:
            tokens = request.Tokens
        # NOTE(review): the cache and generator are rebuilt with batch_size=2
        # whenever the cache is found at batch_size 1; nothing visible in this
        # class sets it to 1, so confirm what this guards against.
        if self.cache.batch_size == 1:
            del self.cache
            self.cache = ExLlamaCache(self.model, batch_size=2)
            self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
        t = self.generate(request.Prompt, tokens)
        # Remove prompt from response if present
        if request.Prompt in t:
            t = t.replace(request.Prompt, "")
        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
    def PredictStream(self, request, context):
        """
        Streams the prediction for the request.
        Incremental streaming is not implemented yet, so the stream carries exactly
        one message: the complete result of Predict.
        Args:
            request: The predict stream request.
            context: The gRPC context.
        Yields:
            backend_pb2.Result: The full prediction, as a single streamed message.
        """
        # A streaming handler has to produce an iterator of replies; yielding
        # (instead of returning the message itself) makes this a generator.
        yield self.Predict(request, context)
 def serve(address):
    """Start the Backend gRPC server on `address` and keep the process alive until it is terminated."""
    worker_pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    grpc_server = grpc.server(worker_pool)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), grpc_server)
    grpc_server.add_insecure_port(address)
    grpc_server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    def _shutdown(signum, stack_frame):
        # Stop the server with no grace period, then exit the process.
        print("Received termination signal. Shutting down...")
        grpc_server.stop(0)
        sys.exit(0)
    # The same handler serves both interrupt and terminate requests.
    for termination_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(termination_signal, _shutdown)
    # Park the calling thread in long sleeps so the process stays up.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        grpc_server.stop(0)
 if __name__ == "__main__":
    # Script entry point: read the bind address from the command line, then serve on it.
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument("--addr",
                        default="localhost:50051",
                        help="The address to bind the server to.")
    args = parser.parse_args()
    serve(args.addr)
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@@ -1,13 +0,0 @@
 #!/bin/bash
 # Installs the exllama backend: the backend's own Python requirements first,
 # then the upstream exllama sources together with their requirements.
 set -e
 # Consumed by libbackend.sh, which validates the build profile against it.
 LIMIT_TARGETS="cublas"
 source "$(dirname "$0")/../common/libbackend.sh"
 installRequirements
 git clone https://github.com/turboderp/exllama "${MY_DIR}/source"
 # BUILD_ISOLATION_FLAG is deliberately left unquoted: when it is empty it has
 # to expand to nothing rather than to an empty argument.
 uv pip install ${BUILD_ISOLATION_FLAG} --requirement "${MY_DIR}/source/requirements.txt"
 # Copy this directory's *py files into the cloned source tree.
 cp -v ./*py "${MY_DIR}/source/"
--- a/backend/python/exllama/requirements-cublas11.txt
+++ b/backend/python/exllama/requirements-cublas11.txt
@@ -1,2 +0,0 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch
--- a/backend/python/exllama/requirements-cublas12.txt
+++ b/backend/python/exllama/requirements-cublas12.txt
@@ -1 +0,0 @@
 torch
--- a/backend/python/exllama/requirements.txt
+++ b/backend/python/exllama/requirements.txt
@@ -1,5 +0,0 @@
 grpcio==1.65.0
 protobuf
 transformers
 certifi
 setuptools
--- a/backend/python/exllama/run.sh
+++ b/backend/python/exllama/run.sh
@@ -1,7 +0,0 @@
 #!/bin/bash
 # Starts the exllama backend from the cloned source tree.
 # Consumed by libbackend.sh, which validates the build profile against it.
 LIMIT_TARGETS="cublas"
 # MY_DIR is assigned by libbackend.sh, which is only sourced further down, so
 # it may still be unset here. Fall back to this script's own directory so that
 # BACKEND_FILE does not collapse to "/source/backend.py".
 BACKEND_FILE="${MY_DIR:-$(realpath "$(dirname "$0")")}/source/backend.py"
 source "$(dirname "$0")/../common/libbackend.sh"
 # Quoted so that arguments containing whitespace are forwarded intact.
 startBackend "$@"
--- a/backend/python/exllama/test.sh
+++ b/backend/python/exllama/test.sh
@@ -1,6 +0,0 @@
 #!/bin/bash
 # Runs the backend's unit tests through the shared libbackend.sh helper.
 set -e
 # Quoted so the script still works when its path contains whitespace.
 source "$(dirname "$0")/../common/libbackend.sh"
 runUnittests
--- a/backend/python/exllama2/requirements-cpu.txt
+++ b/backend/python/exllama2/requirements-cpu.txt
@@ -0,0 +1,3 @@
 transformers
 accelerate
 torch
--- a/backend/python/exllama2/requirements-cublas11.txt
+++ b/backend/python/exllama2/requirements-cublas11.txt
@@ -1,2 +1,4 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch
 transformers
 accelerate
--- a/backend/python/exllama2/requirements-cublas12.txt
+++ b/backend/python/exllama2/requirements-cublas12.txt
@@ -1 +1,3 @@
 torch
 transformers
 accelerate
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,5 +1,4 @@
-accelerate
+grpcio==1.66.2
 grpcio==1.65.4
 protobuf
 certifi
 wheel
--- a/backend/python/mamba/requirements-cpu.txt
+++ b/backend/python/mamba/requirements-cpu.txt
@@ -1 +1,2 @@
-torch
+torch
 transformers
--- a/backend/python/mamba/requirements-cublas11.txt
+++ b/backend/python/mamba/requirements-cublas11.txt
@@ -1,2 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
-torch
+torch
 transformers
--- a/backend/python/mamba/requirements-cublas12.txt
+++ b/backend/python/mamba/requirements-cublas12.txt
@@ -1 +1,2 @@
-torch
+torch
 transformers
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,4 +1,3 @@
-grpcio==1.65.1
+grpcio==1.66.2
 protobuf
-certifi
+certifi
 transformers
--- a/backend/python/openvoice/requirements-cpu.txt
+++ b/backend/python/openvoice/requirements-cpu.txt
@@ -0,0 +1 @@
 torch
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.65.4
+grpcio==1.66.2
 protobuf
 librosa==0.9.1
 faster-whisper==1.0.3
@@ -18,6 +18,6 @@ python-dotenv
 pypinyin==0.50.0
 cn2an==0.5.22
 jieba==0.42.1
-gradio==4.38.1
+gradio==4.44.1
 langid==1.1.6
 git+https://github.com/myshell-ai/MeloTTS.git
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.65.4
+grpcio==1.66.2
 protobuf
 librosa
 faster-whisper
--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -15,5 +15,12 @@ installRequirements
 # https://github.com/descriptinc/audiotools/issues/101
 # incompatible protobuf versions.
-PYDIR=$(ls ${MY_DIR}/venv/lib)
+PYDIR=python3.10
-curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/builder.py
+pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
 if [ ! -d ${pyenv} ]; then
    echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
    exit 1
 fi
 curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py
--- a/backend/python/parler-tts/requirements-after.txt
+++ b/backend/python/parler-tts/requirements-after.txt
@@ -0,0 +1,3 @@
 git+https://github.com/huggingface/parler-tts.git@8e465f1b5fcd223478e07175cb40494d19ffbe17
 llvmlite==0.43.0
 numba==0.60.0
--- a/backend/python/parler-tts/requirements-cpu.txt
+++ b/backend/python/parler-tts/requirements-cpu.txt
@@ -0,0 +1,3 @@
 transformers
 accelerate
 torch
--- a/backend/python/parler-tts/requirements-cublas11.txt
+++ b/backend/python/parler-tts/requirements-cublas11.txt
@@ -1,3 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu118
 torch
-torchaudio
+torchaudio
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements-cublas12.txt
+++ b/backend/python/parler-tts/requirements-cublas12.txt
@@ -1,2 +1,4 @@
 torch
-torchaudio
+torchaudio
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements-hipblas.txt
+++ b/backend/python/parler-tts/requirements-hipblas.txt
@@ -1,3 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch
+torch==2.3.0+rocm6.0
-torchaudio
+torchaudio==2.3.0+rocm6.0
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements-intel.txt
+++ b/backend/python/parler-tts/requirements-intel.txt
@@ -3,4 +3,6 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==72.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
 transformers
 accelerate
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,6 +1,4 @@
-accelerate
+grpcio==1.66.2
 grpcio==1.65.1
 protobuf
 git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
 certifi
-transformers
+llvmlite==0.43.0
--- a/backend/python/petals/Makefile
+++ b/backend/python/petals/Makefile
@@ -1,31 +0,0 @@
 # Build, run, test and clean targets for the petals Python backend.
 # Default target: generate the gRPC stubs, then run the installer.
 .PHONY: petals
 petals: protogen
 	@echo "Creating virtual environment..."
 	bash install.sh "petals.yml"
 	@echo "Virtual environment created."
 # Start the backend through run.sh (stubs are generated first).
 .PHONY: run
 run: protogen
 	@echo "Running petals..."
 	bash run.sh
 	@echo "petals run."
 # Run the backend tests through test.sh (stubs are generated first).
 .PHONY: test
 test: protogen
 	@echo "Testing petals..."
 	bash test.sh
 	@echo "petals tested."
 # Alias that depends on both generated stub files.
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
 # Remove the generated stub files only.
 .PHONY: protogen-clean
 protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 # Generate the Python protobuf/gRPC modules from backend.proto, using ../..
 # as the proto include path. Both files come from one protoc invocation.
 backend_pb2_grpc.py backend_pb2.py:
 	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 # Remove generated stubs plus the venv and bytecode caches.
 .PHONY: clean
 clean: protogen-clean
 	rm -rf venv __pycache__
--- a/backend/python/petals/backend.py
+++ b/backend/python/petals/backend.py
@@ -1,140 +0,0 @@
 #!/usr/bin/env python3
 from concurrent import futures
 import time
 import argparse
 import signal
 import sys
 import os
 import backend_pb2
 import backend_pb2_grpc
 import grpc
 import torch
 from transformers import AutoTokenizer
 from petals import AutoDistributedModelForCausalLM
 # Length of one sleep interval (24 hours, in seconds)
 _ONE_DAY_IN_SECONDS = 24 * 60 * 60
 # Worker count comes from PYTHON_GRPC_MAX_WORKERS when set, otherwise 1
 MAX_WORKERS = int(os.getenv('PYTHON_GRPC_MAX_WORKERS', '1'))
 # Implement the BackendServicer class with the service methods
 class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer that implements the Backend service defined in backend.proto.
    LoadModel has to succeed before Predict or PredictStream are used: it is the
    only place where self.tokenizer, self.model and self.cuda are set.
    """
    def Health(self, request, context):
        """
        Returns a health check message.
        Args:
            request: The health check request.
            context: The gRPC context.
        Returns:
            backend_pb2.Reply: The health check reply.
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
    def LoadModel(self, request, context):
        """
        Loads a language model.
        The tokenizer and the distributed model are both loaded from request.Model;
        the model is moved to CUDA when request.CUDA is set.
        Args:
            request: The load model request.
            context: The gRPC context.
        Returns:
            backend_pb2.Result: The load model result; success=False with the
            error description when loading raises.
        """
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(request.Model, use_fast=False, add_bos_token=False)
            self.model = AutoDistributedModelForCausalLM.from_pretrained(request.Model)
            self.cuda = False
            if request.CUDA:
                self.model = self.model.cuda()
                self.cuda = True
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(message="Model loaded successfully", success=True)
    def Predict(self, request, context):
        """
        Generates text based on the given prompt.
        Only the prompt and the token budget are read from the request; the other
        sampling parameters are not mapped yet (see the TODO below).
        Args:
            request: The predict request.
            context: The gRPC context.
        Returns:
            backend_pb2.Result: The generated text, UTF-8 encoded, with the prompt removed.
        """
        inputs = self.tokenizer(request.Prompt, return_tensors="pt")["input_ids"]
        if self.cuda:
            inputs = inputs.cuda()
        # Max to max value if tokens are not specified. The budget is kept in a
        # local so that the incoming request message is left untouched.
        max_new_tokens = request.Tokens if request.Tokens != 0 else 8192
        # TODO: kwargs and map all parameters
        outputs = self.model.generate(inputs, max_new_tokens=max_new_tokens)
        generated_text = self.tokenizer.decode(outputs[0])
        # Remove prompt from response if present
        if request.Prompt in generated_text:
            generated_text = generated_text.replace(request.Prompt, "")
        return backend_pb2.Result(message=bytes(generated_text, encoding='utf-8'))
    def PredictStream(self, request, context):
        """
        Streams the prediction for the given prompt.
        Incremental streaming is not implemented yet, so the stream carries exactly
        one message: the complete result of Predict.
        Args:
            request: The predict stream request.
            context: The gRPC context.
        Yields:
            backend_pb2.Result: The full prediction, as a single streamed message.
        """
        # A streaming handler has to produce an iterator of replies; yielding
        # (instead of returning the message itself) makes this a generator.
        yield self.Predict(request, context)
 def serve(address):
    """Start the Backend gRPC server on `address` and keep the process alive until it is terminated."""
    worker_pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    grpc_server = grpc.server(worker_pool)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), grpc_server)
    grpc_server.add_insecure_port(address)
    grpc_server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)
    def _shutdown(signum, stack_frame):
        # Stop the server with no grace period, then exit the process.
        print("Received termination signal. Shutting down...")
        grpc_server.stop(0)
        sys.exit(0)
    # The same handler serves both interrupt and terminate requests.
    for termination_signal in (signal.SIGINT, signal.SIGTERM):
        signal.signal(termination_signal, _shutdown)
    # Park the calling thread in long sleeps so the process stays up.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        grpc_server.stop(0)
 if __name__ == "__main__":
    # Script entry point: read the bind address from the command line, then serve on it.
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument("--addr",
                        default="localhost:50051",
                        help="The address to bind the server to.")
    args = parser.parse_args()
    serve(args.addr)
--- a/backend/python/petals/install.sh
+++ b/backend/python/petals/install.sh
@@ -1,14 +0,0 @@
 #!/bin/bash
 # Installs the Python requirements of the petals backend.
 set -e
 # Quoted so the script still works when its path contains whitespace.
 source "$(dirname "$0")/../common/libbackend.sh"
 # This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
 # This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
 # We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
 # the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
 if [[ "${BUILD_PROFILE}" == "intel" ]]; then
    EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
 fi
 installRequirements
--- a/Show More
+++ b/Show More
`@@ -1,2 +1,2 @@`
	`grpcio==1.65.4`	`grpcio==1.66.2`
	`protobuf`	`protobuf`
		`@@ -1,2 +0,0 @@`
			`--extra-index-url https://download.pytorch.org/whl/cu118`
			`torch`