refactor: break down json grammar parser into separate files

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-24 00:26:34 -04:00 · 2024-07-24 23:21:25 +02:00
293 changed files with 3221 additions and 8788 deletions
--- a/.devcontainer-scripts/postcreate.sh
+++ b/.devcontainer-scripts/postcreate.sh
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-cd /workspace
-
-# Get the files into the volume without a bind mount
-if [ ! -d ".git" ]; then
-    git clone https://github.com/mudler/LocalAI.git .
-else
-    git fetch
-fi
-
-echo "Standard Post-Create script completed."
-
-if [ -f "/devcontainer-customization/postcreate.sh" ]; then
-    echo "Launching customization postcreate.sh"
-    bash "/devcontainer-customization/postcreate.sh"
-fi
--- a/.devcontainer-scripts/poststart.sh
+++ b/.devcontainer-scripts/poststart.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-cd /workspace
-
-# Grab the pre-stashed backend assets to avoid build issues
-cp -r /build/backend-assets /workspace/backend-assets
-
-# Ensures generated source files are present upon load
-make prepare
-
-echo "Standard Post-Start script completed."
-
-if [ -f "/devcontainer-customization/poststart.sh" ]; then
-    echo "Launching customization poststart.sh"
-    bash "/devcontainer-customization/poststart.sh"
-fi
--- a/.devcontainer-scripts/utils.sh
+++ b/.devcontainer-scripts/utils.sh
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-# This file contains some really simple functions that are useful when building up customization scripts.
-
-
-# Checks if the git config has a user registered - and sets it up if not.
-#
-# Param 1: name
-# Param 2: email
-#
-config_user() {
-    echo "Configuring git for $1 <$2>"
-    local gcn=$(git config --global user.name)
-    if [ -z "${gcn}" ]; then
-        echo "Setting up git user / remote"
-        git config --global user.name "$1"
-        git config --global user.email "$2"
-        
-    fi
-}
-
-# Checks if the git remote is configured - and sets it up if not. Fetches either way.
-#
-# Param 1: remote name
-# Param 2: remote url
-#
-config_remote() {
-    echo "Adding git remote and fetching $2 as $1"
-    local gr=$(git remote -v | grep $1)
-    if [ -z "${gr}" ]; then
-        git remote add $1 $2
-    fi
-    git fetch $1
-}
-
-# Setup special .ssh files
-# Prints out lines of text to make things pretty
-# Param 1: bash array, filenames relative to the customization directory that should be copied to ~/.ssh
-setup_ssh() {
-    echo "starting ~/.ssh directory setup..."
-    mkdir -p "${HOME}.ssh"
-    chmod 0700 "${HOME}/.ssh"
-    echo "-----"
-    local files=("$@")
-    for file in "${files[@]}" ; do
-        local cfile="/devcontainer-customization/${file}"
-        local hfile="${HOME}/.ssh/${file}"
-        if [ ! -f "${hfile}" ]; then
-            echo "copying \"${file}\""
-            cp "${cfile}" "${hfile}"
-            chmod 600 "${hfile}"
-        fi
-    done
-    echo "~/.ssh directory setup complete!"
-}
--- a/.devcontainer/customization/README.md
+++ b/.devcontainer/customization/README.md
@@ -1,25 +0,0 @@
-Place any additional resources your environment requires in this directory
-
-Script hooks are currently called for:
-`postcreate.sh` and `poststart.sh`
-
-If files with those names exist here, they will be called at the end of the normal script.
-
-This is a good place to set things like `git config --global user.name` are set - and to handle any other files that are mounted via this directory.
-
-To assist in doing so, `source /.devcontainer-scripts/utils.sh` will provide utility functions that may be useful - for example:
-
-```
-#!/bin/bash
-
-source "/.devcontainer-scripts/utils.sh"
-
-sshfiles=("config", "key.pub")
-
-setup_ssh "${sshfiles[@]}"
-
-config_user "YOUR NAME" "YOUR EMAIL"
-
-config_remote "REMOTE NAME" "REMOTE URL"
-
-```
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,24 +0,0 @@
-{
-    "$schema": "https://raw.githubusercontent.com/devcontainers/spec/main/schemas/devContainer.schema.json",
-    "name": "LocalAI",
-    "workspaceFolder": "/workspace",
-    "dockerComposeFile": [ "./docker-compose-devcontainer.yml" ],
-    "service": "api",
-    "shutdownAction": "stopCompose",
-    "customizations": {
-        "vscode": {
-            "extensions": [
-                "golang.go",
-                "ms-vscode.makefile-tools",
-                "ms-azuretools.vscode-docker",
-                "ms-python.python",
-                "ms-python.debugpy",
-                "wayou.vscode-todo-highlight",
-                "waderyan.gitblame"
-            ]
-        }
-    },
-    "forwardPorts": [8080, 3000],
-    "postCreateCommand": "bash /.devcontainer-scripts/postcreate.sh",
-    "postStartCommand": "bash /.devcontainer-scripts/poststart.sh"
-}
--- a/.devcontainer/docker-compose-devcontainer.yml
+++ b/.devcontainer/docker-compose-devcontainer.yml
@@ -1,48 +0,0 @@
-services:
-  api:
-    build:
-      context: ..
-      dockerfile: Dockerfile
-      target: devcontainer
-      args:
-      - FFMPEG=true
-      - IMAGE_TYPE=extras
-      - GO_TAGS=stablediffusion p2p tts
-    env_file:
-      - ../.env
-    ports:
-      - 8080:8080
-    volumes:
-      - localai_workspace:/workspace
-      - ../models:/host-models
-      - ./customization:/devcontainer-customization
-    command: /bin/sh -c "while sleep 1000; do :; done"
-    cap_add:
-      - SYS_PTRACE
-    security_opt:
-      - seccomp:unconfined
-  prometheus:
-    image: prom/prometheus
-    container_name: prometheus
-    command:
-      - '--config.file=/etc/prometheus/prometheus.yml'
-    ports:
-      - 9090:9090
-    restart: unless-stopped
-    volumes:
-      - ./prometheus:/etc/prometheus
-      - prom_data:/prometheus
-  grafana:
-    image: grafana/grafana
-    container_name: grafana
-    ports:
-      - 3000:3000
-    restart: unless-stopped
-    environment:
-      - GF_SECURITY_ADMIN_USER=admin
-      - GF_SECURITY_ADMIN_PASSWORD=grafana
-    volumes:
-      - ./grafana:/etc/grafana/provisioning/datasources
-volumes:
-  prom_data:
-  localai_workspace:
--- a/.devcontainer/grafana/datasource.yml
+++ b/.devcontainer/grafana/datasource.yml
@@ -1,10 +0,0 @@
-
-apiVersion: 1
-
-datasources:
- name: Prometheus
-  type: prometheus
-  url: http://prometheus:9090 
-  isDefault: true
-  access: proxy
-  editable: true
--- a/.devcontainer/prometheus/prometheus.yml
+++ b/.devcontainer/prometheus/prometheus.yml
@@ -1,21 +0,0 @@
-global:
-  scrape_interval: 15s
-  scrape_timeout: 10s
-  evaluation_interval: 15s
-alerting:
-  alertmanagers:
-  - static_configs:
-    - targets: []
-    scheme: http
-    timeout: 10s
-    api_version: v1
-scrape_configs:
- job_name: prometheus
-  honor_timestamps: true
-  scrape_interval: 15s
-  scrape_timeout: 10s
-  metrics_path: /metrics
-  scheme: http
-  static_configs:
-  - targets:
-    - localhost:9090
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,7 +1,6 @@
 .idea
 .github
 .vscode
-.devcontainer
 models
 examples/chatbot-ui/models
 examples/rwkv/models
--- a/.env
+++ b/.env
@@ -79,9 +79,6 @@
 ### Enable to run parallel requests
 # LOCALAI_PARALLEL_REQUESTS=true

-# Enable to allow p2p mode
-# LOCALAI_P2P=true
-
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time
--- a/.github/bump_deps.sh
+++ b/.github/bump_deps.sh
@@ -6,17 +6,4 @@ VAR=$3

 LAST_COMMIT=$(curl -s -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")

-# Read $VAR from Makefile (only first match)
-set +e
-CURRENT_COMMIT="$(grep -m1 "^$VAR?=" Makefile | cut -d'=' -f2)"
-set -e
-
 sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
-
-if [ -z "$CURRENT_COMMIT" ]; then
-    echo "Could not find $VAR in Makefile."
-    exit 0
-fi
-
-echo "Changes: https://github.com/$REPO/compare/${CURRENT_COMMIT}..${LAST_COMMIT}" >> "${VAR}_message.txt"
-echo "${LAST_COMMIT}" >> "${VAR}_commit.txt"
--- a/.github/check_and_update.py
+++ b/.github/check_and_update.py
@@ -29,14 +29,9 @@ def calculate_sha256(file_path):
 def manual_safety_check_hf(repo_id):
    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
    scan = scanResponse.json()
-    # Check if 'hasUnsafeFile' exists in the response
-    if 'hasUnsafeFile' in scan:
-        if scan['hasUnsafeFile']:
-            return scan
-        else:
-            return None
-    else:
-        return None
+    if scan.get('hasUnsafeFile'):  # key may be absent from the scan response; a missing key means "not flagged"
+        return scan
+    return None

 download_type, repo_id_or_url = parse_uri(uri)

--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -67,6 +67,10 @@ updates:
    directory: "/backend/python/parler-tts"
    schedule:
      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/petals"
+    schedule:
+      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/rerankers"
    schedule:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -40,30 +40,17 @@ jobs:
    steps:
      - uses: actions/checkout@v4
      - name: Bump dependencies 🔧
-        id: bump
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
-          {
-            echo 'message<<EOF'
-            cat "${{ matrix.variable }}_message.txt"
-            echo EOF
-          } >> "$GITHUB_OUTPUT"
-          {
-            echo 'commit<<EOF'
-            cat "${{ matrix.variable }}_commit.txt"
-            echo EOF
-          } >> "$GITHUB_OUTPUT"
-          rm -rfv ${{ matrix.variable }}_message.txt
-          rm -rfv ${{ matrix.variable }}_commit.txt
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Update ${{ matrix.repository }}'
-          title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
+          title: 'chore: :arrow_up: Update ${{ matrix.repository }}'
          branch: "update/${{ matrix.variable }}"
-          body: ${{ steps.bump.outputs.message }}
+          body: Bump of ${{ matrix.repository }} version
          signoff: true


--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -17,7 +17,7 @@ jobs:
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -36,12 +36,12 @@ jobs:
          sudo chmod 777 /hf_cache
          bash .github/checksum_checker.sh gallery/index.yaml
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Checksum updates in gallery/index.yaml'
-          title: 'chore(model-gallery): :arrow_up: update checksum'
+          title: 'models(gallery): :arrow_up: update checksum'
          branch: "update/checksum"
          body: Updating checksums in gallery/index.yaml
          signoff: true
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -1,64 +0,0 @@
-name: Explorer deployment
-
-on:
-  push:
-    branches:
-      - master
-    tags:
-      - 'v*'
-
-concurrency:
-  group: ci-deploy-${{ github.head_ref || github.ref }}-${{ github.repository }}
-
-jobs:
-  build-linux:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21.x'
-          cache: false
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-          make protogen-go
-      - name: Build api
-        run: |
-          CGO_ENABLED=0 make build-api
-      - name: rm
-        uses: appleboy/ssh-action@v1.0.3
-        with:
-            host: ${{ secrets.EXPLORER_SSH_HOST }}
-            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
-            key: ${{ secrets.EXPLORER_SSH_KEY }}
-            port: ${{ secrets.EXPLORER_SSH_PORT }}
-            script: |
-                sudo rm -rf local-ai/ || true
-      - name: copy file via ssh
-        uses: appleboy/scp-action@v0.1.7
-        with:
-            host: ${{ secrets.EXPLORER_SSH_HOST }}
-            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
-            key: ${{ secrets.EXPLORER_SSH_KEY }}
-            port: ${{ secrets.EXPLORER_SSH_PORT }}
-            source: "local-ai"
-            overwrite: true
-            rm: true
-            target: ./local-ai
-      - name: restarting
-        uses: appleboy/ssh-action@v1.0.3
-        with:
-            host: ${{ secrets.EXPLORER_SSH_HOST }}
-            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
-            key: ${{ secrets.EXPLORER_SSH_KEY }}
-            port: ${{ secrets.EXPLORER_SSH_PORT }}
-            script: |
-                sudo cp -rfv local-ai/local-ai /usr/bin/local-ai
-                sudo systemctl restart local-ai
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -13,78 +13,6 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  hipblas-jobs:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-      base-image: ${{ matrix.base-image }}
-      grpc-base-image: ${{ matrix.grpc-base-image }}
-      aio: ${{ matrix.aio }}
-      makeflags: ${{ matrix.makeflags }}
-      latest-image: ${{ matrix.latest-image }}
-      latest-image-aio: ${{ matrix.latest-image-aio }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      # Pushing with all jobs in parallel
-      # eats the bandwidth of all the nodes
-      max-parallel: 2
-      matrix:
-        include:
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: '-hipblas-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            aio: "-aio-gpu-hipblas"
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            latest-image: 'latest-gpu-hipblas'
-            latest-image-aio: 'latest-aio-gpu-hipblas'
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -111,7 +39,7 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
      matrix:
        include:
          # Extra images
@@ -194,6 +122,29 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-hipblas-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            aio: "-aio-gpu-hipblas"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            latest-image: 'latest-gpu-hipblas'
+            latest-image-aio: 'latest-aio-gpu-hipblas'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
@@ -261,6 +212,26 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"

  core-image-build:
    uses: ./.github/workflows/image_build.yml
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -294,7 +294,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          export SKIP_GRPC_BACKEND=backend-assets/grpc/whisper
+
          make dist
      - uses: actions/upload-artifact@v4
        with:
@@ -327,7 +327,7 @@ jobs:
          cache: false
      - name: Dependencies
        run: |
-          brew install protobuf grpc libomp llvm
+          brew install protobuf grpc
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build
@@ -336,7 +336,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          export CC=/opt/homebrew/opt/llvm/bin/clang
+
          make dist
      - uses: actions/upload-artifact@v4
        with:
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.21.0
+        uses: securego/gosec@master
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -168,6 +168,32 @@ jobs:
           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test

+
+
+  # tests-petals:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v4
+  #       with:
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install -y build-essential ffmpeg
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user --no-cache-dir grpcio-tools==1.64.1
+
+  #     - name: Test petals
+  #       run: |
+  #          make --jobs=5 --output-sync=target -C backend/python/petals
+  #          make --jobs=5 --output-sync=target -C backend/python/petals test
+
+
+
  # tests-bark:
  #   runs-on: ubuntu-latest
  #   steps:
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -214,13 +214,12 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
-          export CC=/opt/homebrew/opt/llvm/bin/clang
          # Used to run the newer GNUMake version from brew that supports --output-sync
          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -25,7 +25,7 @@ jobs:
        run: |
          make protogen-go swagger
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,3 @@ docs/static/gallery.html

 # backend virtual environments
 **/venv
-
-# per-developer customization files for the development container
-.devcontainer/customization/*
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -3,12 +3,12 @@
    "configurations": [
        {
            "name": "Python: Current File",
-            "type": "debugpy",
+            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false,
-            "cwd": "${fileDirname}",
+            "cwd": "${workspaceFolder}/examples/langchain-chroma",
            "env": {
                "OPENAI_API_BASE": "http://localhost:8080/v1",
                "OPENAI_API_KEY": "abc"
@@ -19,16 +19,15 @@
            "type": "go",
            "request": "launch",
            "mode": "debug",
-            "program": "${workspaceRoot}",
-            "args": [],
+            "program": "${workspaceFolder}/main.go",
+            "args": [
+                "api"
+            ],
            "env": {
-                "LOCALAI_LOG_LEVEL": "debug",
-                "LOCALAI_P2P": "true",
-                "LOCALAI_FEDERATED": "true"
-            },
-            "buildFlags": ["-tags", "stablediffusion p2p tts", "-v"],
-            "envFile": "${workspaceFolder}/.env",
-            "cwd": "${workspaceRoot}"
+                "C_INCLUDE_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
+                "LIBRARY_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
+                "DEBUG": "true"
+            }
        }
    ]
 }
--- a/120
+++ b/120
@@ -8,12 +8,12 @@ FROM ${BASE_IMAGE} AS requirements-core

 USER root

-ARG GO_VERSION=1.22.6
+ARG GO_VERSION=1.22.5
 ARG TARGETARCH
 ARG TARGETVARIANT

 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"


 RUN apt-get update && \
@@ -30,7 +30,7 @@ RUN apt-get update && \

 # Install Go
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
-ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
+ENV PATH $PATH:/root/go/bin:/usr/local/go/bin

 # Install grpc compilers
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
@@ -39,18 +39,15 @@ RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates

-RUN test -n "$TARGETARCH" \
-    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
-
 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
 RUN echo "Target Variant: $TARGETVARIANT"

 # Cuda
-ENV PATH=/usr/local/cuda/bin:${PATH}
+ENV PATH /usr/local/cuda/bin:${PATH}

 # HipBLAS requirements
-ENV PATH=/opt/rocm/bin:${PATH}
+ENV PATH /opt/rocm/bin:${PATH}

 # OpenBLAS requirements and stable diffusion
 RUN apt-get update && \
@@ -65,6 +62,9 @@ RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

 WORKDIR /build

+RUN test -n "$TARGETARCH" \
+    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
+
 ###################################
 ###################################

@@ -81,7 +81,7 @@ RUN apt-get update && \
        espeak \
        python3-pip \
        python-is-python3 \
-        python3-dev llvm \
+        python3-dev \
        python3-venv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
@@ -217,14 +217,13 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
 ###################################
 ###################################

-# The builder-base target has the arguments, variables, and copies shared between full builder images and the uncompiled devcontainer
-
-FROM requirements-drivers AS builder-base
+# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
+# Adjustments to the build process should likely be made here.
+FROM requirements-drivers AS builder

 ARG GO_TAGS="stablediffusion tts p2p"
 ARG GRPC_BACKENDS
 ARG MAKEFLAGS
-ARG LD_FLAGS="-s -w"

 ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
@@ -232,12 +231,14 @@ ENV MAKEFLAGS=${MAKEFLAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
-ENV LD_FLAGS=${LD_FLAGS}
-
-RUN echo "GO_TAGS: $GO_TAGS" && echo "TARGETARCH: $TARGETARCH"

 WORKDIR /build

+COPY . .
+COPY .git .
+RUN echo "GO_TAGS: $GO_TAGS"
+
+RUN make prepare

 # We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
 # but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
@@ -255,35 +256,8 @@ RUN <<EOT bash
    fi
 EOT

-
-###################################
-###################################
-
-# This first portion of builder holds the layers specifically used to build backend-assets/grpc/stablediffusion
-# In most cases, builder is the image you should be using - however, this can save build time if one just needs to copy backend-assets/grpc/stablediffusion and nothing else.
-FROM builder-base AS builder-sd
-
-# stablediffusion does not tolerate a newer version of abseil, copy only over enough elements to build it
-COPY Makefile .
-COPY go.mod .
-COPY go.sum .
-COPY backend/backend.proto ./backend/backend.proto
-COPY backend/go/image/stablediffusion ./backend/go/image/stablediffusion
-COPY pkg/grpc ./pkg/grpc
-COPY pkg/stablediffusion ./pkg/stablediffusion
-RUN git init
-RUN make sources/go-stable-diffusion
-RUN touch prepare-sources
-
-# Actually build the backend
-RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make backend-assets/grpc/stablediffusion
-
-###################################
-###################################
-
-# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
-# Adjustments to the build process should likely be made here.
-FROM builder-sd AS builder
+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

 # Install the pre-built GRPC
 COPY --from=grpc /opt/grpc /usr/local
@@ -291,20 +265,8 @@ COPY --from=grpc /opt/grpc /usr/local
 # Rebuild with defaults backends
 WORKDIR /build

-COPY . .
-COPY .git .
-
-RUN make prepare
-
 ## Build the binary
-## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
-## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
-## (both will use CUDA or hipblas for the actual computation)
-RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
-    else \
-        make build; \
-    fi
+RUN make build

 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
        mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
@@ -314,40 +276,6 @@ RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
 ###################################
 ###################################

-# The devcontainer target is not used on CI. It is a target for developers to use locally -
-# rather than copying files it mounts them locally and leaves building to the developer
-
-FROM builder-base AS devcontainer
-
-ARG FFMPEG
-
-COPY --from=grpc /opt/grpc /usr/local
-
-COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion /build/backend-assets/grpc/stablediffusion
-
-COPY .devcontainer-scripts /.devcontainer-scripts
-
-# Add FFmpeg
-RUN if [ "${FFMPEG}" = "true" ]; then \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            ffmpeg && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* \
-    ; fi
-
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        ssh less wget
-# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
-
-RUN go install github.com/go-delve/delve/cmd/dlv@latest
-
-RUN go install github.com/mikefarah/yq/v4@latest
-
-###################################
-###################################
-
 # This is the final target. The result of this target will be the image uploaded to the registry.
 # If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
 FROM requirements-drivers
@@ -398,7 +326,7 @@ COPY --from=builder /build/local-ai ./
 COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/

 # do not let stablediffusion rebuild (requires an older version of absl)
-COPY --from=builder-sd /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion

 # Change the shell to bash so we can use [[ tests below
 SHELL ["/bin/bash", "-c"]
@@ -417,6 +345,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/transformers-musicgen \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/exllama \
    ; fi

 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
@@ -425,6 +356,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$I
    if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/openvoice \
    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "petals" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/petals \
+    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/sentencetransformers \
    ; fi && \
--- a/83
+++ b/83
@@ -8,7 +8,11 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=f0c7b5edf82aa200656fd88c11ae3a805d7130bf
+CPPLLAMA_VERSION?=081fe431aa8fb6307145c4feb3eed4f48cab19f8
+
+# gpt4all version
+GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
+GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8

 # go-rwkv version
 RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
@@ -16,7 +20,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=34972dbe221709323714fc8402f2e24041d48213
+WHISPER_CPP_VERSION?=f68298ce06ca3edd6e6f3f21c3d0bb5f073942c3

 # bert.cpp version
 BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
@@ -186,6 +190,7 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
+ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
 ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
 ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
@@ -248,6 +253,18 @@ sources/go-piper:
 sources/go-piper/libpiper_binding.a: sources/go-piper
 	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o

+## GPT4ALL
+sources/gpt4all:
+	mkdir -p sources/gpt4all
+	cd sources/gpt4all && \
+	git init && \
+	git remote add origin $(GPT4ALL_REPO) && \
+	git fetch origin && \
+	git checkout $(GPT4ALL_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
+	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

 ## RWKV
 sources/go-rwkv.cpp:
@@ -301,7 +318,7 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a

-get-sources: sources/go-llama.cpp sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
+get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp

 replace:
 	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
@@ -311,6 +328,7 @@ replace:
 	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
+	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp

 dropreplace:
@@ -321,6 +339,7 @@ dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
+	$(GOCMD) mod edit -dropreplace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp

 prepare-sources: get-sources replace
@@ -330,6 +349,7 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C sources/go-llama.cpp clean
+	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
 	$(MAKE) -C sources/go-rwkv.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-stable-diffusion clean
@@ -359,9 +379,6 @@ clean-tests:
 	rm -rf test-dir
 	rm -rf core/http/backend-assets

-clean-dc: clean
-	cp -r /build/backend-assets /workspace/backend-assets
-
 ## Build:
 build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
@@ -379,7 +396,7 @@ build-minimal:
 	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build

 build-api:
-	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=p2p $(MAKE) build
+	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build

 backend-assets/lib:
 	mkdir -p backend-assets/lib
@@ -390,7 +407,7 @@ ifeq ($(DETECT_LIBS),true)
 	scripts/prepare-libs.sh backend-assets/grpc/llama-cpp-avx2
 endif
 ifeq ($(OS),Darwin)
-	BUILD_TYPE=none $(MAKE) backend-assets/grpc/llama-cpp-fallback
+	$(info ${GREEN}I Skip CUDA/hipblas build on MacOS${RESET})
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
@@ -452,7 +469,8 @@ test: prepare test-models/testmodel.ggml grpcs
 	export GO_TAGS="tts stablediffusion debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
@@ -482,6 +500,10 @@ teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)

+test-gpt4all: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)
+
 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
@@ -537,10 +559,10 @@ protogen-go-clean:
 	$(RM) bin/*

 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean

 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -574,6 +596,14 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean

+.PHONY: exllama-protogen
+exllama-protogen:
+	$(MAKE) -C backend/python/exllama protogen
+
+.PHONY: exllama-protogen-clean
+exllama-protogen-clean:
+	$(MAKE) -C backend/python/exllama protogen-clean
+
 .PHONY: exllama2-protogen
 exllama2-protogen:
 	$(MAKE) -C backend/python/exllama2 protogen
@@ -590,6 +620,14 @@ mamba-protogen:
 mamba-protogen-clean:
 	$(MAKE) -C backend/python/mamba protogen-clean

+.PHONY: petals-protogen
+petals-protogen:
+	$(MAKE) -C backend/python/petals protogen
+
+.PHONY: petals-protogen-clean
+petals-protogen-clean:
+	$(MAKE) -C backend/python/petals protogen-clean
+
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
@@ -670,6 +708,8 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/parler-tts
 	$(MAKE) -C backend/python/vall-e-x
 	$(MAKE) -C backend/python/openvoice
+	$(MAKE) -C backend/python/exllama
+	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2

 prepare-test-extra: protogen-python
@@ -690,6 +730,12 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
 	mkdir -p backend-assets/espeak-ng-data
 	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data

+backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
+	mkdir -p backend-assets/gpt4all
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
+
 backend-assets/grpc: protogen-go replace
 	mkdir -p backend-assets/grpc

@@ -700,6 +746,13 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/bert-embeddings
 endif

+backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
+ifneq ($(UPX),)
+	$(UPX) backend-assets/grpc/gpt4all
+endif
+
 backend-assets/grpc/huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
 ifneq ($(UPX),)
@@ -730,6 +783,9 @@ else
 	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
 endif
+ifneq ($(UPX),)
+	$(UPX) backend/cpp/${VARIANT}/grpc-server
+endif

 # This target is for manually building a variant with-auto detected flags
 backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/llama.cpp
@@ -802,6 +858,9 @@ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc backend/cpp/llama/llama.
 backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
+ifneq ($(UPX),)
+	$(UPX) backend-assets/util/llama-cpp-rpc-server
+endif

 backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
@@ -840,7 +899,7 @@ endif

 backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/whisper
 endif
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@

 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

@@ -72,7 +72,6 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

- Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
 - July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
 - June 2024: 🆕 You can browse now the model gallery without LocalAI! Check out https://models.localai.io
 - June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
@@ -85,7 +84,6 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

 Hot topics (looking for contributors):

- 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
 - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
@@ -152,7 +150,6 @@ Other:

 ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

- [Run Visual studio code with LocalAI (SUSE)](https://www.suse.com/c/running-ai-locally/)
 - 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/)
 - [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
 - [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4o
+name: gpt-4-vision-preview

 roles:
  user: "USER:"
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4o
+name: gpt-4-vision-preview

 roles:
  user: "USER:"
--- a/aio/intel/image-gen.yaml
+++ b/aio/intel/image-gen.yaml
@@ -1,6 +1,6 @@
 name: stablediffusion
 parameters:
-  model: Lykon/dreamshaper-8
+  model: runwayml/stable-diffusion-v1-5
 backend: diffusers
 step: 25
 f16: true
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 mmap: false
 f16: false
-name: gpt-4o
+name: gpt-4-vision-preview

 roles:
  user: "USER:"
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -16,7 +16,6 @@ service Backend {
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
-  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}

@@ -134,8 +133,6 @@ message PredictOptions {
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
-  repeated string Videos = 45;
-  repeated string Audios = 46;
 }

 // The response message containing the result
@@ -273,17 +270,6 @@ message TTSRequest {
  optional string language = 5;
 }

-message SoundGenerationRequest {
-  string text = 1;
-  string model = 2;
-  string dst = 3;
-  optional float duration = 4;
-  optional float temperature = 5;
-  optional bool sample = 6;
-  optional string src = 7;
-  optional int32 src_divisor = 8;
-}
-
 message TokenizationResponse {
  int32 length = 1;
  repeated int32 tokens = 2;
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -13,15 +13,15 @@
 #include <getopt.h>
 #include "clip.h"
 #include "llava.h"
-#include "log.h"
 #include "stb_image.h"
 #include "common.h"
 #include "json.hpp"
 #include "llama.h"
+#include "grammar-parser.h"
 #include "backend.pb.h"
 #include "backend.grpc.pb.h"
 #include "utils.hpp"
-#include "sampling.h"
+
 // include std::regex
 #include <cstddef>
 #include <thread>
@@ -203,8 +203,8 @@ struct llama_client_slot
    std::string stopping_word;

    // sampling
-    struct gpt_sampler_params sparams;
-    gpt_sampler *ctx_sampling = nullptr;
+    struct llama_sampling_params sparams;
+    llama_sampling_context *ctx_sampling = nullptr;

    int32_t ga_i = 0;   // group-attention state
    int32_t ga_n = 1;   // group-attention factor
@@ -449,7 +449,7 @@ struct llama_server_context
            LOG_INFO("Multi Modal Mode Enabled", {});
            clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
            if(clp_ctx == nullptr) {
-                LOG_ERR("unable to load clip model: %s", params.mmproj.c_str());
+                LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
                return false;
            }

@@ -458,12 +458,10 @@ struct llama_server_context
            }
        }

-        llama_init_result llama_init = llama_init_from_gpt_params(params);
-        model = llama_init.model;
-        ctx = llama_init.context;
+        std::tie(model, ctx) = llama_init_from_gpt_params(params);
        if (model == nullptr)
        {
-            LOG_ERR("unable to load model: %s", params.model.c_str());
+            LOG_ERROR("unable to load model", {{"model", params.model}});
            return false;
        }

@@ -471,7 +469,7 @@ struct llama_server_context
            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
            const int n_embd_llm  = llama_n_embd(model);
            if (n_embd_clip != n_embd_llm) {
-                LOG("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
                llama_free(ctx);
                llama_free_model(model);
                return false;
@@ -480,7 +478,7 @@ struct llama_server_context

        n_ctx = llama_n_ctx(ctx);

-        add_bos_token = llama_add_bos_token(model);
+        add_bos_token = llama_should_add_bos_token(model);

        return true;
    }
@@ -490,7 +488,7 @@ struct llama_server_context
        std::vector<char> buf(1);
        int res = llama_chat_apply_template(model, nullptr, chat, 1, true, buf.data(), buf.size());
        if (res < 0) {
-            LOG_ERR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", __func__);
+            LOG_ERROR("The chat template comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
            sparams.chat_template = "<|im_start|>"; // llama_chat_apply_template only checks if <|im_start|> exist in the template
        }
    }
@@ -619,7 +617,7 @@ struct llama_server_context

    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
        slot_params default_params;
-        gpt_sampler_params default_sparams;
+        llama_sampling_params default_sparams;
 
        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
@@ -628,7 +626,7 @@ struct llama_server_context
        slot->sparams.top_p             = json_value(data, "top_p",             default_sparams.top_p);
        slot->sparams.min_p             = json_value(data, "min_p",             default_sparams.min_p);
        slot->sparams.tfs_z             = json_value(data, "tfs_z",             default_sparams.tfs_z);
-        slot->sparams.typ_p             = json_value(data, "typical_p",         default_sparams.typ_p);
+        slot->sparams.typical_p         = json_value(data, "typical_p",         default_sparams.typical_p);
        slot->sparams.temp              = json_value(data, "temperature",       default_sparams.temp);
        slot->sparams.dynatemp_range    = json_value(data, "dynatemp_range",    default_sparams.dynatemp_range);
        slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
@@ -641,7 +639,7 @@ struct llama_server_context
        slot->sparams.mirostat_eta      = json_value(data, "mirostat_eta",      default_sparams.mirostat_eta);
        slot->sparams.penalize_nl       = json_value(data, "penalize_nl",       default_sparams.penalize_nl);
        slot->params.n_keep             = json_value(data, "n_keep",            slot->params.n_keep);
-        slot->sparams.seed               = json_value(data, "seed",              default_sparams.seed);
+        slot->params.seed               = json_value(data, "seed",              default_params.seed);
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
@@ -665,7 +663,6 @@ struct llama_server_context
            slot->params.input_prefix = "";
        }

-
        if (data.count("input_suffix") != 0)
        {
            slot->params.input_suffix = data["input_suffix"];
@@ -684,10 +681,6 @@ struct llama_server_context
            slot->prompt = "";
        }

-        if (json_value(data, "ignore_eos", false)) {
-                slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
-        }
-        /*
        slot->sparams.penalty_prompt_tokens.clear();
        slot->sparams.use_penalty_prompt_tokens = false;
        const auto &penalty_prompt = data.find("penalty_prompt");
@@ -723,10 +716,14 @@ struct llama_server_context
                slot->sparams.use_penalty_prompt_tokens = true;
            }
        }
-      */

        slot->sparams.logit_bias.clear();

+        if (json_value(data, "ignore_eos", false))
+        {
+            slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+        }
+
        const auto &logit_bias = data.find("logit_bias");
        if (logit_bias != data.end() && logit_bias->is_array())
        {
@@ -754,7 +751,7 @@ struct llama_server_context
                        llama_token tok = el[0].get<llama_token>();
                        if (tok >= 0 && tok < n_vocab)
                        {
-                            slot->sparams.logit_bias.push_back({tok, bias});
+                            slot->sparams.logit_bias[tok] = bias;
                        }
                    }
                    else if (el[0].is_string())
@@ -762,13 +759,13 @@ struct llama_server_context
                        auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
                        for (auto tok : toks)
                        {
-                            slot->sparams.logit_bias.push_back({tok, bias});
+                            slot->sparams.logit_bias[tok] = bias;
                        }
                    }
                }
            }
        }
-        
+
        slot->params.antiprompt.clear();

        const auto &stop = data.find("stop");
@@ -782,22 +779,24 @@ struct llama_server_context
                }
            }
        }
-        
-        const auto & samplers = data.find("samplers");
-        if (samplers != data.end() && samplers->is_array()) {
+
+        const auto &samplers_sequence = data.find("samplers");
+        if (samplers_sequence != data.end() && samplers_sequence->is_array())
+        {
            std::vector<std::string> sampler_names;
-                for (const auto & name : *samplers) {
-                    if (name.is_string()) {
-                        sampler_names.emplace_back(name);
-                    }
+            for (const auto &sampler_name : *samplers_sequence)
+            {
+                if (sampler_name.is_string())
+                {
+                    sampler_names.emplace_back(sampler_name);
                }
-                slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+            }
+            slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
        }
        else
        {
-                slot->sparams.samplers = default_sparams.samplers;
+            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
        }
-        

        if (multimodal)
        {
@@ -813,11 +812,10 @@ struct llama_server_context
                    img_sl.img_data = clip_image_u8_init();
                    if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                    {
-                        LOG_ERR("%s: failed to load image, slot_id: %d, img_sl_id: %d", 
-                             __func__,
-                             slot->id,
-                             img_sl.id
-                        );
+                        LOG_ERROR("failed to load image", {
+                            {"slot_id",   slot->id},
+                            {"img_sl_id", img_sl.id}
+                        });
                        return false;
                    }
                    LOG_VERBOSE("image loaded", {
@@ -855,12 +853,12 @@ struct llama_server_context
                                    }
                                }
                                if (!found) {
-                                    LOG("ERROR: Image with id: %i, not found.\n", img_id);
+                                    LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id);
                                    slot->images.clear();
                                    return false;
                                }
                            } catch (const std::invalid_argument& e) {
-                                LOG("Invalid image number id in prompt\n");
+                                LOG_TEE("Invalid image number id in prompt\n");
                                slot->images.clear();
                                return false;
                            }
@@ -875,10 +873,10 @@ struct llama_server_context

        if (slot->ctx_sampling != nullptr)
        {
-            gpt_sampler_free(slot->ctx_sampling);
+            llama_sampling_free(slot->ctx_sampling);
        }
-        slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
-        //llama_set_rng_seed(ctx, slot->params.seed);
+        slot->ctx_sampling = llama_sampling_init(slot->sparams);
+        llama_set_rng_seed(ctx, slot->params.seed);
        slot->command = LOAD_PROMPT;

        all_slots_are_idle = false;
@@ -888,7 +886,7 @@ struct llama_server_context
            {"task_id", slot->task_id},
        });

-      //  LOG("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+        LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

        return true;
    }
@@ -928,7 +926,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view) != 0)
                {
-                    LOG("%s: llama_decode() failed\n", __func__);
+                    LOG_TEE("%s: llama_decode() failed\n", __func__);
                    return;
                }
            }
@@ -940,7 +938,7 @@ struct llama_server_context
            }
        }

-        LOG("system prompt updated\n");
+        LOG_TEE("system prompt updated\n");
        system_need_update = false;
    }

@@ -1006,13 +1004,11 @@ struct llama_server_context
        slot.generated_text += token_str;
        slot.has_next_token = true;

-/*
        if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
        {
            // we can change penalty_prompt_tokens because it is always created from scratch each request
            slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
        }
-        */

        // check if there is incomplete UTF-8 character at the end
        bool incomplete = false;
@@ -1121,8 +1117,8 @@ struct llama_server_context
                continue;
            }

-            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.cpuparams.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
-                LOG("Error processing the given image");
+            if (!llava_image_embed_make_with_clip_img(clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) {
+                LOG_TEE("Error processing the given image");
                return false;
            }

@@ -1134,7 +1130,7 @@ struct llama_server_context

    void send_error(task_server& task, const std::string &error)
    {
-        LOG("task %i - error: %s\n", task.id, error.c_str());
+        LOG_TEE("task %i - error: %s\n", task.id, error.c_str());
        task_result res;
        res.id = task.id;
        res.multitask_id = task.multitask_id;
@@ -1146,11 +1142,13 @@ struct llama_server_context

    json get_formated_generation(llama_client_slot &slot)
    {
-        std::vector<std::string> samplers;
-        samplers.reserve(slot.sparams.samplers.size());
-        for (const auto & sampler : slot.sparams.samplers)
+        const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
+        const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
+                                eos_bias->second < 0.0f && std::isinf(eos_bias->second);
+        std::vector<std::string> samplers_sequence;
+        for (const auto &sampler_type : slot.sparams.samplers_sequence)
        {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
        }

        return json {
@@ -1165,11 +1163,13 @@ struct llama_server_context
            {"top_p",             slot.sparams.top_p},
            {"min_p",             slot.sparams.min_p},
            {"tfs_z",             slot.sparams.tfs_z},
-            {"typical_p",         slot.sparams.typ_p},
+            {"typical_p",         slot.sparams.typical_p},
            {"repeat_last_n",     slot.sparams.penalty_last_n},
            {"repeat_penalty",    slot.sparams.penalty_repeat},
            {"presence_penalty",  slot.sparams.penalty_present},
            {"frequency_penalty", slot.sparams.penalty_freq},
+            {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
+            {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
            {"mirostat",          slot.sparams.mirostat},
            {"mirostat_tau",      slot.sparams.mirostat_tau},
            {"mirostat_eta",      slot.sparams.mirostat_eta},
@@ -1177,13 +1177,13 @@ struct llama_server_context
            {"stop",              slot.params.antiprompt},
            {"n_predict",         slot.params.n_predict},
            {"n_keep",            params.n_keep},
-            {"ignore_eos",        slot.sparams.ignore_eos},
+            {"ignore_eos",        ignore_eos},
            {"stream",            slot.params.stream},
-      //      {"logit_bias",        slot.sparams.logit_bias},
+            {"logit_bias",        slot.sparams.logit_bias},
            {"n_probs",           slot.sparams.n_probs},
            {"min_keep",          slot.sparams.min_keep},
            {"grammar",           slot.sparams.grammar},
-            {"samplers",          samplers}
+            {"samplers",          samplers_sequence}
        };
    }

@@ -1373,7 +1373,7 @@ struct llama_server_context
                };
                if (llama_decode(ctx, batch_view))
                {
-                    LOG("%s : failed to eval\n", __func__);
+                    LOG_TEE("%s : failed to eval\n", __func__);
                    return false;
                }
            }
@@ -1391,7 +1391,7 @@ struct llama_server_context
                llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
                if (llama_decode(ctx, batch_img))
                {
-                    LOG("%s : failed to eval image\n", __func__);
+                    LOG_TEE("%s : failed to eval image\n", __func__);
                    return false;
                }
                slot.n_past += n_eval;
@@ -1574,7 +1574,7 @@ struct llama_server_context
                    slot.n_past = 0;
                    slot.truncated = false;
                    slot.has_next_token = true;
-                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
+                    LOG_TEE("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());

                    continue;
                    // END LOCALAI changes
@@ -1712,7 +1712,7 @@ struct llama_server_context

                    if (!slot.params.cache_prompt)
                    {
-                        gpt_sampler_reset(slot.ctx_sampling);
+                        llama_sampling_reset(slot.ctx_sampling);

                        slot.n_past = 0;
                        slot.n_past_se = 0;
@@ -1724,7 +1724,7 @@ struct llama_server_context
                        // push the prompt into the sampling context (do not apply grammar)
                        for (auto &token : prompt_tokens)
                        {
-                            gpt_sampler_accept(slot.ctx_sampling, token, false);
+                            llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
                        }

                        slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
@@ -1822,11 +1822,10 @@ struct llama_server_context

                    if (has_images && !ingest_images(slot, n_batch))
                    {
-                        LOG_ERR("%s: failed processing images Slot id : %d, Task id: %d", 
-                            __func__,
-                            slot.id,
-                            slot.task_id
-                        );
+                        LOG_ERROR("failed processing images", { // each field is its own {key, value} pair so the extra data is a JSON object, not a flat array
+                            {"slot_id", slot.id},
+                            {"task_id", slot.task_id},
+                        });
                        // FIXME @phymbert: to be properly tested
                        //  early returning without changing the slot state will block the slot for ever
                        // no one at the moment is checking the return value
@@ -1866,10 +1865,10 @@ struct llama_server_context
                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;

-                        LOG("\n");
-                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        LOG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
+                        LOG_TEE("\n");
+                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
+                        LOG_TEE("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
+                        LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);

                        llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd);
                        llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n);
@@ -1879,7 +1878,7 @@ struct llama_server_context

                        slot.ga_i += slot.ga_w / slot.ga_n;

-                        LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
+                        LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
                    }
                    slot.n_past_se += n_tokens;
                }
@@ -1904,11 +1903,11 @@ struct llama_server_context
                if (n_batch == 1 || ret < 0)
                {
                    // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
                    return false;
                }

-                LOG("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
+                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

                // retry with half the batch size to try to find a free slot in the KV cache
                n_batch /= 2;
@@ -1933,9 +1932,9 @@ struct llama_server_context
                }

                completion_token_output result;
-                const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
+                const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);

-                gpt_sampler_accept(slot.ctx_sampling, id, true);
+                llama_sampling_accept(slot.ctx_sampling, ctx, id, true);

                slot.n_decoded += 1;
                if (slot.n_decoded == 1)
@@ -1945,14 +1944,19 @@ struct llama_server_context
                    metrics.on_prompt_eval(slot);
                }

+                llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
                result.tok = id;
-                const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);

-                for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
-                    result.probs.push_back({
-                        cur_p->data[i].id,
-                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                    });
+                const int32_t n_probs = slot.sparams.n_probs;
+                if (slot.sparams.temp <= 0 && n_probs > 0)
+                {
+                    // for llama_sample_token_greedy we need to sort candidates
+                    llama_sample_softmax(ctx, &cur_p);
+                }
+
+                for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
+                {
+                    result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
                }

                if (!process_token(result, slot))
@@ -2204,7 +2208,7 @@ static void params_parse(const backend::ModelOptions* request,
    params.model_alias =  request->modelfile();
    params.n_ctx = request->contextsize();
    //params.memory_f16 = request->f16memory();
-    params.cpuparams.n_threads = request->threads();
+    params.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();
    // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
@@ -2254,7 +2258,7 @@ static void params_parse(const backend::ModelOptions* request,
     }
     // get the directory of modelfile
     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
-     params.lora_adapters.push_back({ model_dir + "/"+request->loraadapter(), scale_factor });
+     params.lora_adapter.push_back(std::make_tuple(model_dir + "/"+request->loraadapter(), scale_factor));
    }
    params.use_mlock = request->mlock();
    params.use_mmap = request->mmap();
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -1,13 +0,0 @@
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 342042ff..224db9b5 100644
--- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
-             struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-             int* patches_data = (int*)malloc(ggml_nbytes(patches));
-             for (int i = 0; i < num_patches; i++) {
-                patches_data[i] = i + 1;
-+                patches_data[i] = i;
-             }
-             ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-             free(patches_data);
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -1,12 +1,5 @@
 #!/bin/bash

-## Patches
-## Apply patches from the `patches` directory
-for patch in $(ls patches); do
-    echo "Applying patch $patch"
-    patch -d llama.cpp/ -p1 < patches/$patch
-done 
-
 cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
 cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
 cp -rfv json.hpp llama.cpp/examples/grpc-server/
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -480,4 +480,31 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
    }

    return ret;
+}
+
+//
+// random string / id
+//
+
+// Returns a 32-character string of random characters drawn from [0-9A-Za-z].
+static std::string random_string()
+{
+    static const std::string alphabet("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    // a fresh Mersenne Twister is seeded from std::random_device on every call
+    std::random_device seed_source;
+    std::mt19937 engine(seed_source());
+
+    std::string id(32, ' ');
+    for (char & ch : id) {
+        ch = alphabet[engine() % alphabet.size()];
+    }
+    return id;
+}
+
+// Builds a chat completion id: "chatcmpl-" followed by random_string().
+static std::string gen_chatcmplid()
+{
+    const std::string prefix("chatcmpl-");
+    return prefix + random_string();
 }
--- a/backend/go/llm/gpt4all/gpt4all.go
+++ b/backend/go/llm/gpt4all/gpt4all.go
@@ -0,0 +1,80 @@
+package main
+
+// This is a wrapper to satisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
+)
+
+// LLM is the gpt4all backend type; it embeds base.SingleThread and keeps the
+// model created by Load.
+type LLM struct {
+	base.SingleThread
+
+	gpt4all *gpt4all.Model
+}
+
+// Load creates the gpt4all model from opts.ModelFile, passing along the thread
+// count and library search path, and stores it on the receiver. The error from
+// gpt4all.New is returned as-is.
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	var err error
+	llm.gpt4all, err = gpt4all.New(
+		opts.ModelFile,
+		gpt4all.SetThreads(int(opts.Threads)),
+		gpt4all.SetLibrarySearchPath(opts.LibrarySearchPath),
+	)
+	return err
+}
+
+// buildPredictOptions maps the request's temperature, top-p, top-k and token
+// count to gpt4all predict options; the batch size is forwarded only when it
+// is non-zero.
+func buildPredictOptions(opts *pb.PredictOptions) []gpt4all.PredictOption {
+	options := []gpt4all.PredictOption{
+		gpt4all.SetTemperature(float64(opts.Temperature)),
+		gpt4all.SetTopP(float64(opts.TopP)),
+		gpt4all.SetTopK(int(opts.TopK)),
+		gpt4all.SetTokens(int(opts.Tokens)),
+	}
+
+	if opts.Batch == 0 {
+		return options
+	}
+	return append(options, gpt4all.SetBatch(int(opts.Batch)))
+}
+
+// Predict runs a prediction for opts.Prompt and returns the model's result.
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	predictOptions := buildPredictOptions(opts)
+	return llm.gpt4all.Predict(opts.Prompt, predictOptions...)
+}
+
+// PredictStream starts a prediction in a background goroutine and returns nil
+// immediately. Each token reported by the model is sent on results; a
+// prediction error is only printed. When the prediction finishes the token
+// callback is cleared and results is closed.
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	predictOptions := buildPredictOptions(opts)
+
+	// forward sends each token on results and always returns true.
+	forward := func(token string) bool {
+		results <- token
+		return true
+	}
+
+	go func() {
+		llm.gpt4all.SetTokenCallback(forward)
+		if _, err := llm.gpt4all.Predict(opts.Prompt, predictOptions...); err != nil {
+			fmt.Println("err: ", err)
+		}
+		llm.gpt4all.SetTokenCallback(nil)
+		close(results)
+	}()
+
+	return nil
+}
--- a/backend/go/llm/gpt4all/main.go
+++ b/backend/go/llm/gpt4all/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+// addr is the address passed to grpc.StartServer; it is set with the -addr flag.
+var addr = flag.String("addr", "localhost:50051", "the address to connect to")
+
+// main parses the command-line flags and starts the GRPC server for the LLM
+// backend, panicking if the server returns an error.
+func main() {
+	flag.Parse()
+
+	err := grpc.StartServer(*addr, &LLM{})
+	if err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/transcribe/whisper/main.go
+++ b/backend/go/transcribe/whisper/main.go
--- a/backend/go/transcribe/transcript.go
+++ b/backend/go/transcribe/transcript.go
@@ -0,0 +1,125 @@
+package main
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"path/filepath"
+
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/go-audio/wav"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+// ffmpegCommand runs ffmpeg with the given arguments, passing the current
+// process environment, and returns its combined stdout and stderr output.
+func ffmpegCommand(args []string) (string, error) {
+	cmd := exec.Command("ffmpeg", args...) // Constrain this to ffmpeg to permit security scanner to see that the command is safe.
+	cmd.Env = os.Environ()
+	out, err := cmd.CombinedOutput()
+	return string(out), err
+}
+
+// audioToWav converts the audio file at src into a 16 kHz, mono, 16-bit PCM
+// WAV file at dst by invoking ffmpeg. On failure the returned error wraps the
+// ffmpeg error and includes ffmpeg's output.
+// TODO: use https://github.com/mccoyst/ogg?
+func audioToWav(src, dst string) error {
+	commandArgs := []string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
+	out, err := ffmpegCommand(commandArgs)
+	if err != nil {
+		return fmt.Errorf("error: %w out: %s", err, out)
+	}
+	return nil
+}
+
+// Transcript transcribes the audio file at audiopath with the given whisper
+// model.
+//
+// The input is first converted to WAV inside a temporary directory (removed
+// on return), decoded to float32 samples and processed by a new context
+// configured with threads, language ("auto" when language is empty) and,
+// when translate is true, translation enabled.
+//
+// The result holds one schema.Segment per segment returned by the context and
+// Text set to the concatenation of the segment texts. Any failure, including
+// a NextSegment error other than io.EOF, is returned as the error.
+func Transcript(model whisper.Model, audiopath, language string, translate bool, threads uint) (schema.TranscriptionResult, error) {
+	res := schema.TranscriptionResult{}
+
+	dir, err := os.MkdirTemp("", "whisper")
+	if err != nil {
+		return res, err
+	}
+	defer os.RemoveAll(dir)
+
+	convertedPath := filepath.Join(dir, "converted.wav")
+
+	if err := audioToWav(audiopath, convertedPath); err != nil {
+		return res, err
+	}
+
+	// Open samples
+	fh, err := os.Open(convertedPath)
+	if err != nil {
+		return res, err
+	}
+	defer fh.Close()
+
+	// Read samples
+	d := wav.NewDecoder(fh)
+	buf, err := d.FullPCMBuffer()
+	if err != nil {
+		return res, err
+	}
+
+	data := buf.AsFloat32Buffer().Data
+
+	// Process samples
+	ctx, err := model.NewContext()
+	if err != nil {
+		return res, err
+	}
+
+	ctx.SetThreads(threads)
+
+	if language != "" {
+		ctx.SetLanguage(language)
+	} else {
+		ctx.SetLanguage("auto")
+	}
+
+	if translate {
+		ctx.SetTranslate(true)
+	}
+
+	if err := ctx.Process(data, nil, nil); err != nil {
+		return res, err
+	}
+
+	// Collect segments until the context reports io.EOF; any other error is a
+	// real failure and must not be mistaken for the end of the stream.
+	for {
+		s, err := ctx.NextSegment()
+		if errors.Is(err, io.EOF) {
+			break
+		}
+		if err != nil {
+			return res, err
+		}
+
+		var tokens []int
+		for _, t := range s.Tokens {
+			tokens = append(tokens, t.Id)
+		}
+
+		segment := schema.Segment{Id: s.Num, Text: s.Text, Start: s.Start, End: s.End, Tokens: tokens}
+		res.Segments = append(res.Segments, segment)
+
+		res.Text += s.Text
+	}
+
+	return res, nil
+}
--- a/backend/go/transcribe/whisper.go
+++ b/backend/go/transcribe/whisper.go
@@ -0,0 +1,33 @@
+package main
+
+// This is a wrapper to satisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// Whisper is the whisper.cpp transcription backend type; it embeds
+// base.SingleThread and keeps the model created by Load.
+type Whisper struct {
+	base.SingleThread
+	whisper whisper.Model
+}
+
+// Load creates the whisper model from opts.ModelFile and stores it on the
+// receiver. The error from whisper.New is returned as-is.
+func (sd *Whisper) Load(opts *pb.ModelOptions) error {
+	// Note: the Model here is a path to a directory containing the model files
+	var err error
+	sd.whisper, err = whisper.New(opts.ModelFile)
+	return err
+}
+
+// AudioTranscription transcribes the audio file at opts.Dst by delegating to
+// Transcript with the request's language, translate flag and thread count.
+func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
+	threads := uint(opts.Threads)
+	return Transcript(sd.whisper, opts.Dst, opts.Language, opts.Translate, threads)
+}
--- a/backend/go/transcribe/whisper/whisper.go
+++ b/backend/go/transcribe/whisper/whisper.go
@@ -1,105 +0,0 @@
-package main
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"os"
-	"path/filepath"
-
-	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
-	"github.com/go-audio/wav"
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/utils"
-)
-
-type Whisper struct {
-	base.SingleThread
-	whisper whisper.Model
-}
-
-func (sd *Whisper) Load(opts *pb.ModelOptions) error {
-	// Note: the Model here is a path to a directory containing the model files
-	w, err := whisper.New(opts.ModelFile)
-	sd.whisper = w
-	return err
-}
-
-func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
-
-	dir, err := os.MkdirTemp("", "whisper")
-	if err != nil {
-		return pb.TranscriptResult{}, err
-	}
-	defer os.RemoveAll(dir)
-
-	convertedPath := filepath.Join(dir, "converted.wav")
-
-	if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil {
-		return pb.TranscriptResult{}, err
-	}
-
-	// Open samples
-	fh, err := os.Open(convertedPath)
-	if err != nil {
-		return pb.TranscriptResult{}, err
-	}
-	defer fh.Close()
-
-	// Read samples
-	d := wav.NewDecoder(fh)
-	buf, err := d.FullPCMBuffer()
-	if err != nil {
-		return pb.TranscriptResult{}, err
-	}
-
-	data := buf.AsFloat32Buffer().Data
-
-	// Process samples
-	context, err := sd.whisper.NewContext()
-	if err != nil {
-		return pb.TranscriptResult{}, err
-
-	}
-
-	context.SetThreads(uint(opts.Threads))
-
-	if opts.Language != "" {
-		context.SetLanguage(opts.Language)
-	} else {
-		context.SetLanguage("auto")
-	}
-
-	if opts.Translate {
-		context.SetTranslate(true)
-	}
-
-	if err := context.Process(data, nil, nil); err != nil {
-		return pb.TranscriptResult{}, err
-	}
-
-	segments := []*pb.TranscriptSegment{}
-	text := ""
-	for {
-		s, err := context.NextSegment()
-		if err != nil {
-			break
-		}
-
-		var tokens []int32
-		for _, t := range s.Tokens {
-			tokens = append(tokens, int32(t.Id))
-		}
-
-		segment := &pb.TranscriptSegment{Id: int32(s.Num), Text: s.Text, Start: int64(s.Start), End: int64(s.End), Tokens: tokens}
-		segments = append(segments, segment)
-
-		text += s.Text
-	}
-
-	return pb.TranscriptResult{
-		Segments: segments,
-		Text:     text,
-	}, nil
-
-}
--- a/backend/python/autogptq/requirements-cublas11.txt
+++ b/backend/python/autogptq/requirements-cublas11.txt
@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch
--- a/backend/python/autogptq/requirements-cublas12.txt
+++ b/backend/python/autogptq/requirements-cublas12.txt
@@ -1 +0,0 @@
-torch
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,7 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.66.1
+grpcio==1.65.1
 protobuf
+torch
 certifi
 transformers
--- a/backend/python/bark/requirements-cpu.txt
+++ b/backend/python/bark/requirements-cpu.txt
@@ -1,4 +0,0 @@
-transformers
-accelerate
-torch
-torchaudio
--- a/backend/python/bark/requirements-cublas11.txt
+++ b/backend/python/bark/requirements-cublas11.txt
@@ -1,5 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch
-torchaudio
-transformers
-accelerate
--- a/backend/python/bark/requirements-cublas12.txt
+++ b/backend/python/bark/requirements-cublas12.txt
@@ -1,4 +0,0 @@
-torch
-torchaudio
-transformers
-accelerate
--- a/backend/python/bark/requirements-hipblas.txt
+++ b/backend/python/bark/requirements-hipblas.txt
@@ -1,5 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch
-torchaudio
-transformers
-accelerate
+torchaudio
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -3,6 +3,4 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
-transformers
-accelerate
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,6 @@
+accelerate
 bark==0.1.5
-grpcio==1.66.1
+grpcio==1.65.1
 protobuf
-certifi
+certifi
+transformers
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -18,23 +18,10 @@
 # source $(dirname $0)/../common/libbackend.sh
 #
 function init() {
-    # Name of the backend (directory name)
    BACKEND_NAME=${PWD##*/}
-
-    # Path where all backends files are
    MY_DIR=$(realpath `dirname $0`)
-
-    # Build type
    BUILD_PROFILE=$(getBuildProfile)

-    # Environment directory
-    EDIR=${MY_DIR}
-
-    # Allow to specify a custom env dir for shared environments
-    if [ "x${ENV_DIR}" != "x" ]; then
-        EDIR=${ENV_DIR}
-    fi
-
    # If a backend has defined a list of valid build profiles...
    if [ ! -z "${LIMIT_TARGETS}" ]; then
        isValidTarget=$(checkTargets ${LIMIT_TARGETS})
@@ -87,14 +74,13 @@ function getBuildProfile() {
 # This function is idempotent, so you can call it as many times as you want and it will
 # always result in an activated virtual environment
 function ensureVenv() {
-    if [ ! -d "${EDIR}/venv" ]; then
-        uv venv ${EDIR}/venv
+    if [ ! -d "${MY_DIR}/venv" ]; then
+        uv venv ${MY_DIR}/venv
        echo "virtualenv created"
    fi
-
-    # Source if we are not already in a Virtual env
-    if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
-        source ${EDIR}/venv/bin/activate
+    
+    if [ "x${VIRTUAL_ENV}" != "x${MY_DIR}/venv" ]; then
+        source ${MY_DIR}/venv/bin/activate
        echo "virtualenv activated"
    fi

@@ -127,24 +113,13 @@ function installRequirements() {

    # These are the requirements files we will attempt to install, in order
    declare -a requirementFiles=(
-        "${EDIR}/requirements-install.txt"
-        "${EDIR}/requirements.txt"
-        "${EDIR}/requirements-${BUILD_TYPE}.txt"
+        "${MY_DIR}/requirements-install.txt"
+        "${MY_DIR}/requirements.txt"
+        "${MY_DIR}/requirements-${BUILD_TYPE}.txt"
    )

    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
-        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
-    fi
-
-    # if BUILD_TYPE is empty, we are a CPU build, so we should try to install the CPU requirements
-    if [ "x${BUILD_TYPE}" == "x" ]; then
-        requirementFiles+=("${EDIR}/requirements-cpu.txt")
-    fi
-
-    requirementFiles+=("${EDIR}/requirements-after.txt")
-
-    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
-        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
+        requirementFiles+=("${MY_DIR}/requirements-${BUILD_PROFILE}.txt")
    fi

    for reqFile in ${requirementFiles[@]}; do
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.66.1
+grpcio==1.65.1
 protobuf
--- a/backend/python/coqui/requirements-cpu.txt
+++ b/backend/python/coqui/requirements-cpu.txt
@@ -1,3 +0,0 @@
-transformers
-accelerate
-torch
--- a/backend/python/coqui/requirements-cublas11.txt
+++ b/backend/python/coqui/requirements-cublas11.txt
@@ -1,5 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch
-torchaudio
-transformers
-accelerate
--- a/backend/python/coqui/requirements-cublas12.txt
+++ b/backend/python/coqui/requirements-cublas12.txt
@@ -1,4 +0,0 @@
-torch
-torchaudio
-transformers
-accelerate
--- a/backend/python/coqui/requirements-hipblas.txt
+++ b/backend/python/coqui/requirements-hipblas.txt
@@ -1,5 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
 torch
-torchaudio
-transformers
-accelerate
+torchaudio
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -3,6 +3,4 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
-transformers
-accelerate
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,6 @@
-coqui-tts
-grpcio==1.66.1
+accelerate
+TTS==0.22.0
+grpcio==1.65.1
 protobuf
-certifi
+certifi
+transformers
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -18,13 +18,13 @@ import backend_pb2_grpc
 import grpc

 from diffusers import StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, \
-    EulerAncestralDiscreteScheduler, FluxPipeline, FluxTransformer2DModel
+    EulerAncestralDiscreteScheduler
 from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image, export_to_video
 from compel import Compel, ReturnedEmbeddingsType
-from optimum.quanto import freeze, qfloat8, quantize
-from transformers import CLIPTextModel, T5EncoderModel
+
+from transformers import CLIPTextModel
 from safetensors.torch import load_file

 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -163,12 +163,10 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            modelFile = request.Model

            self.cfg_scale = 7
-            self.PipelineType = request.PipelineType
-
            if request.CFGScale != 0:
                self.cfg_scale = request.CFGScale

-            clipmodel = "Lykon/dreamshaper-8"
+            clipmodel = "runwayml/stable-diffusion-v1-5"
            if request.CLIPModel != "":
                clipmodel = request.CLIPModel
            clipsubfolder = "text_encoder"
@@ -246,30 +244,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        torch_dtype=torchType,
                        use_safetensors=True,
                        variant=variant)
-            elif request.PipelineType == "FluxPipeline":
-                    self.pipe = FluxPipeline.from_pretrained(
-                        request.Model,
-                        torch_dtype=torch.bfloat16)
-                    if request.LowVRAM:
-                        self.pipe.enable_model_cpu_offload()
-            elif request.PipelineType == "FluxTransformer2DModel":
-                    dtype = torch.bfloat16
-                    # specify from environment or default to "ChuckMcSneed/FLUX.1-dev"
-                    bfl_repo = os.environ.get("BFL_REPO", "ChuckMcSneed/FLUX.1-dev")
-
-                    transformer = FluxTransformer2DModel.from_single_file(modelFile, torch_dtype=dtype)
-                    quantize(transformer, weights=qfloat8)
-                    freeze(transformer)
-                    text_encoder_2 = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype)
-                    quantize(text_encoder_2, weights=qfloat8)
-                    freeze(text_encoder_2)
-
-                    self.pipe = FluxPipeline.from_pretrained(bfl_repo, transformer=None, text_encoder_2=None, torch_dtype=dtype)
-                    self.pipe.transformer = transformer
-                    self.pipe.text_encoder_2 = text_encoder_2
-
-                    if request.LowVRAM:
-                        self.pipe.enable_model_cpu_offload()

            if CLIPSKIP and request.CLIPSkip != 0:
                self.clip_skip = request.CLIPSkip
@@ -425,13 +399,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                request.seed
            )

-        if self.PipelineType == "FluxPipeline":
-            kwargs["max_sequence_length"] = 256
-
-        if self.PipelineType == "FluxTransformer2DModel":
-            kwargs["output_type"] = "pil"
-            kwargs["generator"] = torch.Generator("cpu").manual_seed(0)
-
        if self.img2vid:
            # Load the conditioning image
            image = load_image(request.src)
--- a/backend/python/diffusers/requirements-cpu.txt
+++ b/backend/python/diffusers/requirements-cpu.txt
@@ -1,9 +0,0 @@
-diffusers
-opencv-python
-transformers
-accelerate
-compel
-peft
-sentencepiece
-torch
-optimum-quanto
--- a/backend/python/diffusers/requirements-cublas11.txt
+++ b/backend/python/diffusers/requirements-cublas11.txt
@@ -1,10 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch
-diffusers
-opencv-python
-transformers
-accelerate
-compel
-peft
-sentencepiece
-optimum-quanto
--- a/backend/python/diffusers/requirements-cublas12.txt
+++ b/backend/python/diffusers/requirements-cublas12.txt
@@ -1,9 +0,0 @@
-torch
-diffusers
-opencv-python
-transformers
-accelerate
-compel
-peft
-sentencepiece
-optimum-quanto
--- a/backend/python/diffusers/requirements-hipblas.txt
+++ b/backend/python/diffusers/requirements-hipblas.txt
@@ -1,11 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.3.1+rocm6.0
-torchvision==0.18.1+rocm6.0
-diffusers
-opencv-python
-transformers
-accelerate
-compel
-peft
-sentencepiece
-optimum-quanto
+torch
+torchvision
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,12 +3,4 @@ intel-extension-for-pytorch
 torch
 torchvision
 optimum[openvino]
-setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
-diffusers
-opencv-python
-transformers
-accelerate
-compel
-peft
-sentencepiece
-optimum-quanto
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,13 @@
 setuptools
-grpcio==1.66.1
+accelerate
+compel
+peft
+diffusers
+grpcio==1.65.1
+opencv-python
 pillow
 protobuf
+sentencepiece
+torch
+transformers
 certifi
--- a/backend/python/diffusers/test.py
+++ b/backend/python/diffusers/test.py
@@ -53,7 +53,7 @@ class TestBackendServicer(unittest.TestCase):
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
@@ -71,7 +71,7 @@ class TestBackendServicer(unittest.TestCase):
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
-                response = stub.LoadModel(backend_pb2.ModelOptions(Model="Lykon/dreamshaper-8"))
+                response = stub.LoadModel(backend_pb2.ModelOptions(Model="runwayml/stable-diffusion-v1-5"))
                print(response.message)
                self.assertTrue(response.success)
                image_req = backend_pb2.GenerateImageRequest(positive_prompt="cat", width=16,height=16, dst="test.jpg")
@@ -81,4 +81,4 @@ class TestBackendServicer(unittest.TestCase):
            print(err)
            self.fail("Image gen service failed")
        finally:
-            self.tearDown()
+            self.tearDown()
--- a/backend/python/exllama/.gitignore
+++ b/backend/python/exllama/.gitignore
@@ -0,0 +1 @@
+source
--- a/backend/python/exllama/Makefile
+++ b/backend/python/exllama/Makefile
@@ -0,0 +1,28 @@
+# Path handed to install.sh. Make keeps quotes literally in variable values,
+# so the value is left unquoted here and quoted where the recipe uses it.
+export CONDA_ENV_PATH = exllama.yml
+
+.PHONY: exllama
+exllama: protogen
+	bash install.sh "${CONDA_ENV_PATH}"
+
+.PHONY: run
+run: protogen
+	@echo "Running exllama..."
+	bash run.sh
+	@echo "exllama run."
+
+.PHONY: protogen
+protogen: backend_pb2_grpc.py backend_pb2.py
+
+.PHONY: protogen-clean
+protogen-clean:
+	$(RM) backend_pb2_grpc.py backend_pb2.py
+
+# Generate the gRPC Python stubs from backend.proto (include path ../..).
+backend_pb2_grpc.py backend_pb2.py:
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
+
+.PHONY: clean
+clean: protogen-clean
+	$(RM) -r venv source __pycache__
--- a/backend/python/exllama/README.md
+++ b/backend/python/exllama/README.md
@@ -0,0 +1,5 @@
+# Creating a separate environment for the exllama project
+
+```
+make exllama
+```
--- a/backend/python/exllama/backend.py
+++ b/backend/python/exllama/backend.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+import grpc
+from concurrent import futures
+import time
+import backend_pb2
+import backend_pb2_grpc
+import argparse
+import signal
+import sys
+import os, glob
+
+from pathlib import Path
+import torch
+import torch.nn.functional as F
+from torch import version as torch_version
+
+from source.tokenizer import ExLlamaTokenizer
+from source.generator import ExLlamaGenerator
+from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
+
+_ONE_DAY_IN_SECONDS = 60 * 60 * 24
+
+# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
+MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
+
+# Implement the BackendServicer class with the service methods
+class BackendServicer(backend_pb2_grpc.BackendServicer):
+    """gRPC servicer that loads an ExLlama model and generates text with it."""
+
+    def generate(self, prompt, max_new_tokens):
+        """Generate a continuation of ``prompt``.
+
+        Args:
+            prompt: Text that is tokenized and fed to the generator.
+            max_new_tokens: Maximum number of tokens to sample.
+
+        Returns:
+            The decoded newly generated tokens (prompt tokens are sliced off),
+            or '' when ``max_new_tokens`` is 0.
+        """
+        self.generator.end_beam_search()
+
+        # Tokenizing the input
+        ids = self.generator.tokenizer.encode(prompt)
+
+        self.generator.gen_begin_reuse(ids)
+        # Tokens past this index are the ones produced by the loop below.
+        initial_len = self.generator.sequence[0].shape[0]
+        has_leading_space = False
+        decoded_text = ''
+        for i in range(max_new_tokens):
+            token = self.generator.gen_single_token()
+            # Remember whether the first generated piece starts with '▁' so a
+            # leading space can be prepended to the decoded text.
+            if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
+                has_leading_space = True
+
+            decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
+            if has_leading_space:
+                decoded_text = ' ' + decoded_text
+
+            if token.item() == self.generator.tokenizer.eos_token_id:
+                break
+        return decoded_text
+
+    def Health(self, request, context):
+        """Health check: always replies with b'OK'."""
+        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
+
+    def LoadModel(self, request, context):
+        """Build the model, tokenizer, cache and generator for later requests.
+
+        ``request.ModelFile`` is used as a directory in which tokenizer.model,
+        config.json and the first ``*.safetensors`` match are looked up.
+        ``request.ContextSize`` and ``request.RopeFreqScale`` are applied to the
+        config when they are non-zero.
+
+        Returns:
+            ``backend_pb2.Result`` with ``success=True`` once everything is
+            loaded, or ``success=False`` and the error text if anything raised.
+        """
+        try:
+            # https://github.com/turboderp/exllama/blob/master/example_cfg.py
+            model_directory = request.ModelFile
+
+            # Locate files we need within that directory
+            tokenizer_path = os.path.join(model_directory, "tokenizer.model")
+            model_config_path = os.path.join(model_directory, "config.json")
+            st_pattern = os.path.join(model_directory, "*.safetensors")
+            weight_files = glob.glob(st_pattern)
+            if not weight_files:
+                # Report the missing weights explicitly instead of a bare IndexError.
+                raise FileNotFoundError(f"no *.safetensors file found in {model_directory}")
+            model_path = weight_files[0]
+
+            # Create config, model, tokenizer and generator
+
+            config = ExLlamaConfig(model_config_path)               # create config from config.json
+            config.model_path = model_path                          # supply path to model weights file
+            if (request.ContextSize):
+                config.max_seq_len = request.ContextSize            # override max sequence length
+                config.max_attention_size = request.ContextSize**2  # Should be set to context_size^2.
+                # https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
+
+            # Set Rope scaling.
+            if (request.RopeFreqScale):
+                # Alpha value for Rope scaling.
+                # Higher value increases context but adds perplexity.
+                # alpha_value and compress_pos_emb are mutually exclusive.
+                # https://github.com/turboderp/exllama/issues/115
+                config.alpha_value = request.RopeFreqScale
+                config.calculate_rotary_embedding_base()
+
+            model = ExLlama(config)                                 # create ExLlama instance and load the weights
+            tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file
+
+            cache = ExLlamaCache(model, batch_size = 2)             # create cache for inference
+            generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator
+
+            self.generator= generator
+            self.model = model
+            self.tokenizer = tokenizer
+            self.cache = cache
+        except Exception as err:
+            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
+        return backend_pb2.Result(message="Model loaded successfully", success=True)
+
+    def Predict(self, request, context):
+        """Generate a completion for ``request.Prompt``.
+
+        Temperature, TopK and TopP are copied from the request; Penalty falls
+        back to 1.15 and Tokens to 512 when the request leaves them at 0.
+
+        Returns:
+            ``backend_pb2.Result`` whose message is the UTF-8 encoded completion.
+        """
+        penalty = 1.15
+        if request.Penalty != 0.0:
+            penalty = request.Penalty
+        self.generator.settings.token_repetition_penalty_max = penalty
+        self.generator.settings.temperature = request.Temperature
+        self.generator.settings.top_k = request.TopK
+        self.generator.settings.top_p = request.TopP
+
+        tokens = 512
+        if request.Tokens != 0:
+            tokens = request.Tokens
+
+        # LoadModel creates the cache with batch_size=2; rebuild cache and
+        # generator if the cache is found with batch_size 1.
+        if self.cache.batch_size == 1:
+            del self.cache
+            self.cache = ExLlamaCache(self.model, batch_size=2)
+            self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
+
+        t = self.generate(request.Prompt, tokens)
+
+        # Remove prompt from response if present
+        if request.Prompt in t:
+            t = t.replace(request.Prompt, "")
+
+        # NOTE(review): Health answers with backend_pb2.Reply while this returns
+        # backend_pb2.Result carrying bytes - confirm against backend.proto.
+        return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
+
+    def PredictStream(self, request, context):
+        """Streaming variant of Predict.
+
+        Incremental streaming is not implemented yet: the whole completion is
+        computed by Predict and yielded as a single message, so this handler
+        is a generator, as a response-streaming RPC method has to be.
+        """
+        yield self.Predict(request, context)
+
+
+def serve(address):
+    """Start the gRPC server on ``address`` and block until it is stopped.
+
+    Args:
+        address: Address string handed to ``add_insecure_port``.
+    """
+    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
+    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
+    server.add_insecure_port(address)
+    server.start()
+    print("Server started. Listening on: " + address, file=sys.stderr)
+
+    # Define the signal handler function
+    def signal_handler(sig, frame):
+        print("Received termination signal. Shutting down...")
+        server.stop(0)
+        sys.exit(0)
+
+    # Set the signal handlers for SIGINT and SIGTERM
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
+    try:
+        while True:
+            time.sleep(_ONE_DAY_IN_SECONDS)
+    except KeyboardInterrupt:
+        server.stop(0)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run the gRPC server.")
+    parser.add_argument(
+        "--addr", default="localhost:50051", help="The address to bind the server to."
+    )
+    args = parser.parse_args()
+
+    serve(args.addr)
--- a/backend/python/exllama/install.sh
+++ b/backend/python/exllama/install.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Installs the exllama backend: Python requirements, the upstream exllama
+# sources (cloned into ${MY_DIR}/source) and this backend's *.py files.
+set -e
+
+LIMIT_TARGETS="cublas"
+
+source "$(dirname "$0")/../common/libbackend.sh"
+
+installRequirements
+
+# Re-running the installer must not abort because the checkout already exists.
+if [ ! -d "${MY_DIR}/source/.git" ]; then
+    git clone https://github.com/turboderp/exllama "${MY_DIR}/source"
+fi
+# BUILD_ISOLATION_FLAG is left unquoted on purpose so an empty value adds no argument.
+uv pip install ${BUILD_ISOLATION_FLAG} --requirement "${MY_DIR}/source/requirements.txt"
+
+cp -v ./*py "${MY_DIR}/source/"
--- a/backend/python/exllama/requirements.txt
+++ b/backend/python/exllama/requirements.txt
@@ -0,0 +1,6 @@
+grpcio==1.65.0
+protobuf
+torch
+transformers
+certifi
+setuptools
--- a/backend/python/exllama/run.sh
+++ b/backend/python/exllama/run.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Starts the exllama gRPC backend from the cloned source tree.
+LIMIT_TARGETS="cublas"
+# MY_DIR is not assigned by this script before this point, so fall back to the
+# script's own directory instead of resolving to "/source/backend.py".
+BACKEND_FILE="${MY_DIR:-$(cd "$(dirname "$0")" && pwd)}/source/backend.py"
+
+source "$(dirname "$0")/../common/libbackend.sh"
+
+startBackend "$@"
--- a/backend/python/exllama/test.sh
+++ b/backend/python/exllama/test.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Sources the shared libbackend.sh helpers and calls runUnittests.
+set -e
+
+source "$(dirname "$0")/../common/libbackend.sh"
+
+runUnittests
--- a/backend/python/exllama2/requirements-cpu.txt
+++ b/backend/python/exllama2/requirements-cpu.txt
@@ -1,3 +0,0 @@
-transformers
-accelerate
-torch
--- a/backend/python/exllama2/requirements-cublas11.txt
+++ b/backend/python/exllama2/requirements-cublas11.txt
@@ -1,4 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch
-transformers
-accelerate
--- a/backend/python/exllama2/requirements-cublas12.txt
+++ b/backend/python/exllama2/requirements-cublas12.txt
@@ -1,3 +0,0 @@
-torch
-transformers
-accelerate
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,5 +1,7 @@
-grpcio==1.66.1
+accelerate
+grpcio==1.65.1
 protobuf
 certifi
+torch
 wheel
 setuptools
--- a/backend/python/mamba/requirements-after.txt
+++ b/backend/python/mamba/requirements-after.txt
@@ -1,2 +0,0 @@
-causal-conv1d==1.4.0
-mamba-ssm==2.2.2
--- a/backend/python/mamba/requirements-cpu.txt
+++ b/backend/python/mamba/requirements-cpu.txt
@@ -1,2 +0,0 @@
-torch
-transformers
--- a/backend/python/mamba/requirements-cublas11.txt
+++ b/backend/python/mamba/requirements-cublas11.txt
@@ -1,3 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch
-transformers
--- a/backend/python/mamba/requirements-cublas12.txt
+++ b/backend/python/mamba/requirements-cublas12.txt
@@ -1,2 +0,0 @@
-torch
-transformers
--- a/backend/python/mamba/requirements-install.txt
+++ b/backend/python/mamba/requirements-install.txt
@@ -3,4 +3,5 @@
 # https://github.com/Dao-AILab/causal-conv1d/issues/24
 packaging
 setuptools
-wheel
+wheel
+torch==2.3.1
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,3 +1,6 @@
-grpcio==1.66.1
+causal-conv1d==1.4.0
+mamba-ssm==2.2.2
+grpcio==1.65.1
 protobuf
-certifi
+certifi
+transformers
--- a/backend/python/openvoice/requirements-cpu.txt
+++ b/backend/python/openvoice/requirements-cpu.txt
@@ -1 +0,0 @@
-torch
--- a/backend/python/openvoice/requirements-cublas11.txt
+++ b/backend/python/openvoice/requirements-cublas11.txt
@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch
--- a/backend/python/openvoice/requirements-cublas12.txt
+++ b/backend/python/openvoice/requirements-cublas12.txt
@@ -1 +0,0 @@
-torch
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.66.1
+grpcio==1.65.1
 protobuf
 librosa==0.9.1
 faster-whisper==1.0.3
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.66.1
+grpcio==1.65.1
 protobuf
 librosa
 faster-whisper
--- a/backend/python/openvoice/test.sh
+++ b/backend/python/openvoice/test.sh
@@ -5,7 +5,7 @@ source $(dirname $0)/../common/libbackend.sh

 # Download checkpoints if not present
 if [ ! -d "checkpoints_v2" ]; then
-    wget https://myshell-public-repo-host.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip -O checkpoints_v2.zip
+    wget https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip -O checkpoints_v2.zip
    unzip checkpoints_v2.zip
 fi

--- a/backend/python/parler-tts/install.sh
+++ b/backend/python/parler-tts/install.sh
@@ -15,12 +15,17 @@ installRequirements
 
 # https://github.com/descriptinc/audiotools/issues/101
 # incompatible protobuf versions.
-PYDIR=python3.10
-pyenv="${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/"
-
-if [ ! -d ${pyenv} ]; then
-    echo "(parler-tts/install.sh): Error: ${pyenv} does not exist"
-    exit 1
-fi
-
-curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o ${pyenv}/builder.py
+# Resolve the venv's python directory with a glob rather than parsing `ls`,
+# and stop with a clear error when the protobuf package is missing.
+PYDIR=""
+for candidate in "${MY_DIR}"/venv/lib/python*; do
+    if [ -d "${candidate}/site-packages/google/protobuf/internal" ]; then
+        PYDIR="${candidate##*/}"
+        break
+    fi
+done
+if [ -z "${PYDIR}" ]; then
+    echo "(parler-tts/install.sh): Error: no protobuf package found under ${MY_DIR}/venv/lib" >&2
+    exit 1
+fi
+curl -L https://raw.githubusercontent.com/protocolbuffers/protobuf/main/python/google/protobuf/internal/builder.py -o "${MY_DIR}/venv/lib/${PYDIR}/site-packages/google/protobuf/internal/builder.py"
--- a/backend/python/parler-tts/requirements-after.txt
+++ b/backend/python/parler-tts/requirements-after.txt
@@ -1,3 +0,0 @@
-git+https://github.com/huggingface/parler-tts.git@8e465f1b5fcd223478e07175cb40494d19ffbe17
-llvmlite==0.43.0
-numba==0.60.0
--- a/backend/python/parler-tts/requirements-cpu.txt
+++ b/backend/python/parler-tts/requirements-cpu.txt
@@ -1,3 +0,0 @@
-transformers
-accelerate
-torch
--- a/backend/python/parler-tts/requirements-cublas11.txt
+++ b/backend/python/parler-tts/requirements-cublas11.txt
@@ -1,5 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch
-torchaudio
-transformers
-accelerate
--- a/backend/python/parler-tts/requirements-cublas12.txt
+++ b/backend/python/parler-tts/requirements-cublas12.txt
@@ -1,4 +0,0 @@
-torch
-torchaudio
-transformers
-accelerate
--- a/backend/python/parler-tts/requirements-hipblas.txt
+++ b/backend/python/parler-tts/requirements-hipblas.txt
@@ -1,5 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.3.0+rocm6.0
-torchaudio==2.3.0+rocm6.0
-transformers
-accelerate
+torch
+torchaudio
--- a/backend/python/parler-tts/requirements-intel.txt
+++ b/backend/python/parler-tts/requirements-intel.txt
@@ -3,6 +3,4 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
-transformers
-accelerate
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/Show More
+++ b/Show More