workaround upstream issue

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
test
2026-06-11 10:19:33 -04:00 · 2024-07-24 11:02:58 +02:00 · 2024-07-24 10:23:41 +02:00 · 2024-07-24 08:59:18 +02:00
672 changed files with 18983 additions and 264269 deletions
--- a/.devcontainer-scripts/postcreate.sh
+++ b/.devcontainer-scripts/postcreate.sh
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-cd /workspace
-
-# Get the files into the volume without a bind mount
-if [ ! -d ".git" ]; then
-    git clone https://github.com/mudler/LocalAI.git .
-else
-    git fetch
-fi
-
-echo "Standard Post-Create script completed."
-
-if [ -f "/devcontainer-customization/postcreate.sh" ]; then
-    echo "Launching customization postcreate.sh"
-    bash "/devcontainer-customization/postcreate.sh"
-fi
--- a/.devcontainer-scripts/poststart.sh
+++ b/.devcontainer-scripts/poststart.sh
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-cd /workspace
-
-# Grab the pre-stashed backend assets to avoid build issues
-cp -r /build/backend-assets /workspace/backend-assets
-
-# Ensures generated source files are present upon load
-make prepare
-
-echo "Standard Post-Start script completed."
-
-if [ -f "/devcontainer-customization/poststart.sh" ]; then
-    echo "Launching customization poststart.sh"
-    bash "/devcontainer-customization/poststart.sh"
-fi
--- a/.devcontainer-scripts/utils.sh
+++ b/.devcontainer-scripts/utils.sh
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-# This file contains some really simple functions that are useful when building up customization scripts.
-
-
-# Checks if the git config has a user registered - and sets it up if not.
-#
-# Param 1: name
-# Param 2: email
-#
-config_user() {
-    echo "Configuring git for $1 <$2>"
-    local gcn=$(git config --global user.name)
-    if [ -z "${gcn}" ]; then
-        echo "Setting up git user / remote"
-        git config --global user.name "$1"
-        git config --global user.email "$2"
-        
-    fi
-}
-
-# Checks if the git remote is configured - and sets it up if not. Fetches either way.
-#
-# Param 1: remote name
-# Param 2: remote url
-#
-config_remote() {
-    echo "Adding git remote and fetching $2 as $1"
-    local gr=$(git remote -v | grep $1)
-    if [ -z "${gr}" ]; then
-        git remote add $1 $2
-    fi
-    git fetch $1
-}
-
-# Setup special .ssh files
-# Prints out lines of text to make things pretty
-# Param 1: bash array, filenames relative to the customization directory that should be copied to ~/.ssh
-setup_ssh() {
-    echo "starting ~/.ssh directory setup..."
-    mkdir -p "${HOME}.ssh"
-    chmod 0700 "${HOME}/.ssh"
-    echo "-----"
-    local files=("$@")
-    for file in "${files[@]}" ; do
-        local cfile="/devcontainer-customization/${file}"
-        local hfile="${HOME}/.ssh/${file}"
-        if [ ! -f "${hfile}" ]; then
-            echo "copying \"${file}\""
-            cp "${cfile}" "${hfile}"
-            chmod 600 "${hfile}"
-        fi
-    done
-    echo "~/.ssh directory setup complete!"
-}
--- a/.devcontainer/customization/README.md
+++ b/.devcontainer/customization/README.md
@@ -1,25 +0,0 @@
-Place any additional resources your environment requires in this directory
-
-Script hooks are currently called for:
-`postcreate.sh` and `poststart.sh`
-
-If files with those names exist here, they will be called at the end of the normal script.
-
-This is a good place to set things like `git config --global user.name` are set - and to handle any other files that are mounted via this directory.
-
-To assist in doing so, `source /.devcontainer-scripts/utils.sh` will provide utility functions that may be useful - for example:
-
-```
-#!/bin/bash
-
-source "/.devcontainer-scripts/utils.sh"
-
-sshfiles=("config", "key.pub")
-
-setup_ssh "${sshfiles[@]}"
-
-config_user "YOUR NAME" "YOUR EMAIL"
-
-config_remote "REMOTE NAME" "REMOTE URL"
-
-```
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,24 +0,0 @@
-{
-    "$schema": "https://raw.githubusercontent.com/devcontainers/spec/main/schemas/devContainer.schema.json",
-    "name": "LocalAI",
-    "workspaceFolder": "/workspace",
-    "dockerComposeFile": [ "./docker-compose-devcontainer.yml" ],
-    "service": "api",
-    "shutdownAction": "stopCompose",
-    "customizations": {
-        "vscode": {
-            "extensions": [
-                "golang.go",
-                "ms-vscode.makefile-tools",
-                "ms-azuretools.vscode-docker",
-                "ms-python.python",
-                "ms-python.debugpy",
-                "wayou.vscode-todo-highlight",
-                "waderyan.gitblame"
-            ]
-        }
-    },
-    "forwardPorts": [8080, 3000],
-    "postCreateCommand": "bash /.devcontainer-scripts/postcreate.sh",
-    "postStartCommand": "bash /.devcontainer-scripts/poststart.sh"
-}
--- a/.devcontainer/docker-compose-devcontainer.yml
+++ b/.devcontainer/docker-compose-devcontainer.yml
@@ -1,48 +0,0 @@
-services:
-  api:
-    build:
-      context: ..
-      dockerfile: Dockerfile
-      target: devcontainer
-      args:
-      - FFMPEG=true
-      - IMAGE_TYPE=extras
-      - GO_TAGS=p2p tts
-    env_file:
-      - ../.env
-    ports:
-      - 8080:8080
-    volumes:
-      - localai_workspace:/workspace
-      - ../models:/host-models
-      - ./customization:/devcontainer-customization
-    command: /bin/sh -c "while sleep 1000; do :; done"
-    cap_add:
-      - SYS_PTRACE
-    security_opt:
-      - seccomp:unconfined
-  prometheus:
-    image: prom/prometheus
-    container_name: prometheus
-    command:
-      - '--config.file=/etc/prometheus/prometheus.yml'
-    ports:
-      - 9090:9090
-    restart: unless-stopped
-    volumes:
-      - ./prometheus:/etc/prometheus
-      - prom_data:/prometheus
-  grafana:
-    image: grafana/grafana
-    container_name: grafana
-    ports:
-      - 3000:3000
-    restart: unless-stopped
-    environment:
-      - GF_SECURITY_ADMIN_USER=admin
-      - GF_SECURITY_ADMIN_PASSWORD=grafana
-    volumes:
-      - ./grafana:/etc/grafana/provisioning/datasources
-volumes:
-  prom_data:
-  localai_workspace:
--- a/.devcontainer/grafana/datasource.yml
+++ b/.devcontainer/grafana/datasource.yml
@@ -1,10 +0,0 @@
-
-apiVersion: 1
-
-datasources:
- name: Prometheus
-  type: prometheus
-  url: http://prometheus:9090 
-  isDefault: true
-  access: proxy
-  editable: true
--- a/.devcontainer/prometheus/prometheus.yml
+++ b/.devcontainer/prometheus/prometheus.yml
@@ -1,21 +0,0 @@
-global:
-  scrape_interval: 15s
-  scrape_timeout: 10s
-  evaluation_interval: 15s
-alerting:
-  alertmanagers:
-  - static_configs:
-    - targets: []
-    scheme: http
-    timeout: 10s
-    api_version: v1
-scrape_configs:
- job_name: prometheus
-  honor_timestamps: true
-  scrape_interval: 15s
-  scrape_timeout: 10s
-  metrics_path: /metrics
-  scheme: http
-  static_configs:
-  - targets:
-    - localhost:9090
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,7 +1,6 @@
 .idea
 .github
 .vscode
-.devcontainer
 models
 examples/chatbot-ui/models
 examples/rwkv/models
--- a/.env
+++ b/.env
@@ -38,12 +38,12 @@
 ## Uncomment and set to true to enable rebuilding from source
 # REBUILD=true

-## Enable go tags, available: p2p, tts
-## p2p: enable distributed inferencing
+## Enable go tags, available: stablediffusion, tts
+## stablediffusion: image generation with stablediffusion
 ## tts: enables text-to-speech with go-piper 
 ## (requires REBUILD=true)
 #
-# GO_TAGS=p2p
+# GO_TAGS=stablediffusion

 ## Path where to store generated images
 # LOCALAI_IMAGE_PATH=/tmp/generated/images
@@ -79,18 +79,6 @@
 ### Enable to run parallel requests
 # LOCALAI_PARALLEL_REQUESTS=true

-# Enable to allow p2p mode
-# LOCALAI_P2P=true
-
-# Enable to use federated mode
-# LOCALAI_FEDERATED=true
-
-# Enable to start federation server
-# FEDERATED_SERVER=true
-
-# Define to use federation token
-# TOKEN=""
-
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1 @@
 *.sh text eol=lf
-backend/cpp/llama/*.hpp linguist-vendored
--- a/.github/bump_deps.sh
+++ b/.github/bump_deps.sh
@@ -6,17 +6,4 @@ VAR=$3

 LAST_COMMIT=$(curl -s -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")

-# Read $VAR from Makefile (only first match)
-set +e
-CURRENT_COMMIT="$(grep -m1 "^$VAR?=" Makefile | cut -d'=' -f2)"
-set -e
-
 sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
-
-if [ -z "$CURRENT_COMMIT" ]; then
-    echo "Could not find $VAR in Makefile."
-    exit 0
-fi
-
-echo "Changes: https://github.com/$REPO/compare/${CURRENT_COMMIT}..${LAST_COMMIT}" >> "${VAR}_message.txt"
-echo "${LAST_COMMIT}" >> "${VAR}_commit.txt"
--- a/.github/check_and_update.py
+++ b/.github/check_and_update.py
@@ -29,14 +29,9 @@ def calculate_sha256(file_path):
 def manual_safety_check_hf(repo_id):
    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
    scan = scanResponse.json()
-    # Check if 'hasUnsafeFile' exists in the response
-    if 'hasUnsafeFile' in scan:
-        if scan['hasUnsafeFile']:
-            return scan
-        else:
-            return None
-    else:
-        return None
+    if scan['hasUnsafeFile']:
+        return scan
+    return None

 download_type, repo_id_or_url = parse_uri(uri)

--- a/.github/ci/modelslist.go
+++ b/.github/ci/modelslist.go
@@ -6,7 +6,6 @@ import (
 	"io/ioutil"
 	"os"

-	"github.com/microcosm-cc/bluemonday"
 	"gopkg.in/yaml.v3"
 )

@@ -280,12 +279,6 @@ func main() {
 		return
 	}

-	// Ensure that all arbitrary text content is sanitized before display
-	for i, m := range models {
-		models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
-		models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
-	}
-
 	// render the template
 	data := struct {
 		Models          []*GalleryModel
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -9,8 +9,6 @@ updates:
    directory: "/"
    schedule:
      interval: "weekly"
-    ignore:
-    - dependency-name: "github.com/mudler/LocalAI/pkg/grpc/proto"
  - package-ecosystem: "github-actions"
    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
    directory: "/"
@@ -69,6 +67,10 @@ updates:
    directory: "/backend/python/parler-tts"
    schedule:
      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/petals"
+    schedule:
+      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/rerankers"
    schedule:
@@ -81,6 +83,14 @@ updates:
    directory: "/backend/python/transformers"
    schedule:
      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/transformers-musicgen"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/vall-e-x"
+    schedule:
+      interval: "weekly"
  - package-ecosystem: "pip"
    directory: "/backend/python/vllm"
    schedule:
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,15 +1,6 @@
-enhancement:
+enhancements:
 - head-branch: ['^feature', 'feature']

-dependencies:
- any:
-  - changed-files:
-    - any-glob-to-any-file: 'Makefile'
-  - changed-files:
-    - any-glob-to-any-file: '*.mod'
-  - changed-files:
-    - any-glob-to-any-file: '*.sum'
-
 kind/documentation:
 - any:
  - changed-files:
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,14 +12,23 @@ jobs:
          - repository: "ggerganov/llama.cpp"
            variable: "CPPLLAMA_VERSION"
            branch: "master"
+          - repository: "go-skynet/go-ggml-transformers.cpp"
+            variable: "GOGGMLTRANSFORMERS_VERSION"
+            branch: "master"
+          - repository: "donomii/go-rwkv.cpp"
+            variable: "RWKV_VERSION"
+            branch: "main"
          - repository: "ggerganov/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
-          - repository: "PABannier/bark.cpp"
-            variable: "BARKCPP_VERSION"
+          - repository: "go-skynet/go-bert.cpp"
+            variable: "BERT_VERSION"
+            branch: "master"
+          - repository: "go-skynet/bloomz.cpp"
+            variable: "BLOOMZ_VERSION"
            branch: "main"
-          - repository: "leejet/stable-diffusion.cpp"
-            variable: "STABLEDIFFUSION_GGML_VERSION"
+          - repository: "mudler/go-ggllm.cpp"
+            variable: "GOGGLLM_VERSION"
            branch: "master"
          - repository: "mudler/go-stable-diffusion"
            variable: "STABLEDIFFUSION_VERSION"
@@ -31,30 +40,17 @@ jobs:
    steps:
      - uses: actions/checkout@v4
      - name: Bump dependencies 🔧
-        id: bump
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
-          {
-            echo 'message<<EOF'
-            cat "${{ matrix.variable }}_message.txt"
-            echo EOF
-          } >> "$GITHUB_OUTPUT"
-          {
-            echo 'commit<<EOF'
-            cat "${{ matrix.variable }}_commit.txt"
-            echo EOF
-          } >> "$GITHUB_OUTPUT"
-          rm -rfv ${{ matrix.variable }}_message.txt
-          rm -rfv ${{ matrix.variable }}_commit.txt
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Update ${{ matrix.repository }}'
-          title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
+          title: 'chore: :arrow_up: Update ${{ matrix.repository }}'
          branch: "update/${{ matrix.variable }}"
-          body: ${{ steps.bump.outputs.message }}
+          body: Bump of ${{ matrix.repository }} version
          signoff: true


--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -17,7 +17,7 @@ jobs:
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -23,7 +23,7 @@ jobs:
          sudo pip install --upgrade pip
          pip install huggingface_hub
      - name: 'Setup yq'
-        uses: dcarbone/install-yq-action@v1.3.1
+        uses: dcarbone/install-yq-action@v1.1.1
        with:
          version: 'v4.44.2'
          download-compressed: true
@@ -36,12 +36,12 @@ jobs:
          sudo chmod 777 /hf_cache
          bash .github/checksum_checker.sh gallery/index.yaml
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Checksum updates in gallery/index.yaml'
-          title: 'chore(model-gallery): :arrow_up: update checksum'
+          title: 'models(gallery): :arrow_up: update checksum'
          branch: "update/checksum"
          body: Updating checksums in gallery/index.yaml
          signoff: true
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -14,7 +14,7 @@ jobs:
    steps:
      - name: Dependabot metadata
        id: metadata
-        uses: dependabot/fetch-metadata@v2.3.0
+        uses: dependabot/fetch-metadata@v2.2.0
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          skip-commit-verification: true
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -1,64 +0,0 @@
-name: Explorer deployment
-
-on:
-  push:
-    branches:
-      - master
-    tags:
-      - 'v*'
-
-concurrency:
-  group: ci-deploy-${{ github.head_ref || github.ref }}-${{ github.repository }}
-
-jobs:
-  build-linux:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-      - uses: actions/setup-go@v5
-        with:
-          go-version: '1.21.x'
-          cache: false
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-          make protogen-go
-      - name: Build api
-        run: |
-          CGO_ENABLED=0 make build-api
-      - name: rm
-        uses: appleboy/ssh-action@v1.2.0
-        with:
-            host: ${{ secrets.EXPLORER_SSH_HOST }}
-            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
-            key: ${{ secrets.EXPLORER_SSH_KEY }}
-            port: ${{ secrets.EXPLORER_SSH_PORT }}
-            script: |
-                sudo rm -rf local-ai/ || true
-      - name: copy file via ssh
-        uses: appleboy/scp-action@v0.1.7
-        with:
-            host: ${{ secrets.EXPLORER_SSH_HOST }}
-            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
-            key: ${{ secrets.EXPLORER_SSH_KEY }}
-            port: ${{ secrets.EXPLORER_SSH_PORT }}
-            source: "local-ai"
-            overwrite: true
-            rm: true
-            target: ./local-ai
-      - name: restarting
-        uses: appleboy/ssh-action@v1.2.0
-        with:
-            host: ${{ secrets.EXPLORER_SSH_HOST }}
-            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
-            key: ${{ secrets.EXPLORER_SSH_KEY }}
-            port: ${{ secrets.EXPLORER_SSH_PORT }}
-            script: |
-                sudo cp -rfv local-ai/local-ai /usr/bin/local-ai
-                sudo systemctl restart local-ai
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -2,10 +2,9 @@ name: 'generate and publish GRPC docker caches'

 on:
  workflow_dispatch:
-
-  schedule:
-    # daily at midnight
-    - cron: '0 0 * * *'
+  push:
+    branches:
+      - master

 concurrency:
  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -17,7 +16,7 @@ jobs:
      matrix:
        include:
          - grpc-base-image: ubuntu:22.04
-            runs-on: 'arc-runner-set'
+            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64,linux/arm64'
    runs-on: ${{matrix.runs-on}}
    steps:
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -15,7 +15,7 @@ jobs:
    strategy:
      matrix:
        include:
-          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -13,78 +13,6 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  hipblas-jobs:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-      base-image: ${{ matrix.base-image }}
-      grpc-base-image: ${{ matrix.grpc-base-image }}
-      aio: ${{ matrix.aio }}
-      makeflags: ${{ matrix.makeflags }}
-      latest-image: ${{ matrix.latest-image }}
-      latest-image-aio: ${{ matrix.latest-image-aio }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      # Pushing with all jobs in parallel
-      # eats the bandwidth of all the nodes
-      max-parallel: 2
-      matrix:
-        include:
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: '-hipblas-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            aio: "-aio-gpu-hipblas"
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            latest-image: 'latest-gpu-hipblas'
-            latest-image-aio: 'latest-aio-gpu-hipblas'
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
  self-hosted-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -111,7 +39,7 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
      matrix:
        include:
          # Extra images
@@ -194,6 +122,29 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-hipblas-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            aio: "-aio-gpu-hipblas"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            latest-image: 'latest-gpu-hipblas'
+            latest-image-aio: 'latest-aio-gpu-hipblas'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'sycl_f16'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
@@ -261,6 +212,26 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"

  core-image-build:
    uses: ./.github/workflows/image_build.yml
@@ -280,7 +251,6 @@ jobs:
      makeflags: ${{ matrix.makeflags }}
      latest-image: ${{ matrix.latest-image }}
      latest-image-aio: ${{ matrix.latest-image-aio }}
-      skip-drivers: ${{ matrix.skip-drivers }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -302,7 +272,6 @@ jobs:
            latest-image: 'latest-cpu'
            latest-image-aio: 'latest-aio-cpu'
            makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -314,7 +283,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -326,7 +294,6 @@ jobs:
            base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "11"
            cuda-minor-version: "7"
@@ -338,7 +305,6 @@ jobs:
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'false'
          - build-type: 'cublas'
            cuda-major-version: "12"
            cuda-minor-version: "0"
@@ -349,7 +315,6 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
-            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'vulkan'
            platforms: 'linux/amd64'
@@ -360,45 +325,4 @@ jobs:
            image-type: 'core'
            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
-            skip-drivers: 'false'
            makeflags: "--jobs=4 --output-sync=target"
-  gh-runner:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-      aio: ${{ matrix.aio }}
-      base-image: ${{ matrix.base-image }}
-      grpc-base-image: ${{ matrix.grpc-base-image }}
-      makeflags: ${{ matrix.makeflags }}
-      latest-image: ${{ matrix.latest-image }}
-      latest-image-aio: ${{ matrix.latest-image-aio }}
-      skip-drivers: ${{ matrix.skip-drivers }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      matrix:
-        include:
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "0"
-            platforms: 'linux/arm64'
-            tag-latest: 'false'
-            tag-suffix: '-nvidia-l4t-arm64-core'
-            latest-image: 'latest-nvidia-l4t-arm64-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-            runs-on: 'ubuntu-24.04-arm'
-            makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'true'
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -49,10 +49,6 @@ on:
        description: 'FFMPEG'
        default: ''
        type: string
-      skip-drivers:
-        description: 'Skip drivers by default'
-        default: 'false'
-        type: string
      image-type:
        description: 'Image type'
        default: ''
@@ -238,7 +234,6 @@ jobs:
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.65.0
            MAKEFLAGS=${{ inputs.makeflags }}
-            SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
@@ -267,7 +262,6 @@ jobs:
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
            GRPC_VERSION=v1.65.0
            MAKEFLAGS=${{ inputs.makeflags }}
-            SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -18,7 +18,7 @@ jobs:
      with:
        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
        # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.8.0
+    - uses: GrantBirki/git-diff-action@v2.7.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
@@ -79,7 +79,7 @@ jobs:
        args: ${{ steps.summarize.outputs.message }}
    - name: Setup tmate session if fails
      if: ${{ failure() }}
-      uses: mxschmitt/action-tmate@v3.19
+      uses: mxschmitt/action-tmate@v3.18
      with:
        detached: true
        connect-timeout-seconds: 180
@@ -99,7 +99,7 @@ jobs:
        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
      # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.8.0
+    - uses: GrantBirki/git-diff-action@v2.7.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
@@ -161,7 +161,7 @@ jobs:
        TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
    - name: Setup tmate session if fails
      if: ${{ failure() }}
-      uses: mxschmitt/action-tmate@v3.19
+      uses: mxschmitt/action-tmate@v3.18
      with:
        detached: true
        connect-timeout-seconds: 180
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -123,7 +123,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.19
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -232,12 +232,45 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.19
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
-
+  build-stablediffusion:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+      - name: Build stablediffusion
+        run: |
+          export PATH=$PATH:$GOPATH/bin
+          make backend-assets/grpc/stablediffusion
+          mkdir -p release && cp backend-assets/grpc/stablediffusion release
+        env:
+          GO_TAGS: stablediffusion
+      - uses: actions/upload-artifact@v4
+        with:
+          name: stablediffusion
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*

  build-macOS-x86_64:
    runs-on: macos-13
@@ -261,7 +294,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          export SKIP_GRPC_BACKEND=backend-assets/grpc/whisper
+
          make dist
      - uses: actions/upload-artifact@v4
        with:
@@ -275,7 +308,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.19
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -294,7 +327,7 @@ jobs:
          cache: false
      - name: Dependencies
        run: |
-          brew install protobuf grpc libomp llvm
+          brew install protobuf grpc
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build
@@ -303,7 +336,7 @@ jobs:
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          export CC=/opt/homebrew/opt/llvm/bin/clang
+
          make dist
      - uses: actions/upload-artifact@v4
        with:
@@ -317,7 +350,7 @@ jobs:
            release/*
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.19
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -18,7 +18,7 @@ jobs:
        if: ${{ github.actor != 'dependabot[bot]' }}
      - name: Run Gosec Security Scanner
        if: ${{ github.actor != 'dependabot[bot]' }}
-        uses: securego/gosec@v2.22.0
+        uses: securego/gosec@master
        with:
          # we let the report trigger content trigger a failure using the GitHub Security features.
          args: '-no-fail -fmt sarif -out results.sarif ./...'
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -35,6 +35,30 @@ jobs:
        run: |
           make --jobs=5 --output-sync=target -C backend/python/transformers
           make --jobs=5 --output-sync=target -C backend/python/transformers test
+
+  tests-sentencetransformers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
+
+      - name: Test sentencetransformers
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
+           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
+
+
  tests-rerankers:
    runs-on: ubuntu-latest
    steps:
@@ -78,7 +102,75 @@ jobs:
          make --jobs=5 --output-sync=target -C backend/python/diffusers
          make --jobs=5 --output-sync=target -C backend/python/diffusers test

-  # tests-transformers-musicgen:
+  tests-parler-tts:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
+
+      - name: Test parler-tts
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts test
+
+  tests-openvoice:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
+
+      - name: Test openvoice
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/openvoice
+           make --jobs=5 --output-sync=target -C backend/python/openvoice test
+
+  tests-transformers-musicgen:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
+
+      - name: Test transformers-musicgen
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
+           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
+
+
+
+  # tests-petals:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
@@ -95,10 +187,12 @@ jobs:
  #         sudo apt-get install -y libopencv-dev
  #         pip install --user --no-cache-dir grpcio-tools==1.64.1

-  #     - name: Test transformers-musicgen
+  #     - name: Test petals
  #       run: |
-  #          make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
-  #          make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
+  #          make --jobs=5 --output-sync=target -C backend/python/petals
+  #          make --jobs=5 --output-sync=target -C backend/python/petals test
+
+

  # tests-bark:
  #   runs-on: ubuntu-latest
@@ -185,6 +279,26 @@ jobs:
  #       run: |
  #          make --jobs=5 --output-sync=target -C backend/python/vllm
  #          make --jobs=5 --output-sync=target -C backend/python/vllm test
+  tests-vallex:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
+      - name: Test vall-e-x
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/vall-e-x
+           make --jobs=5 --output-sync=target -C backend/python/vall-e-x test

  tests-coqui:
    runs-on: ubuntu-latest
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -100,12 +100,15 @@ jobs:
          # The python3-grpc-tools package in 22.04 is too old
          pip install --user grpcio-tools

-          make -C backend/python/transformers
+          sudo rm -rfv /usr/bin/conda || true
+          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers

          # Pre-build piper before we start tests in order to have shared libraries in place
          make sources/go-piper && \
          GO_TAGS="tts" make -C sources/go-piper piper.o && \
-          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
+          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
+          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
        env:
          CUDA_VERSION: 12-4
      - name: Cache grpc
@@ -127,10 +130,10 @@ jobs:
          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
-          PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.19
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -175,26 +178,17 @@ jobs:
        uses: actions/checkout@v4
        with:
          submodules: true
-      - name: Dependencies
-        run: |
-          # Install protoc
-          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
-          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-          rm protoc.zip
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Build images
        run: |
          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
      - name: Test
        run: |
-            PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
+          LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
            make run-e2e-aio
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.19
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
@@ -220,19 +214,18 @@ jobs:
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
-          pip install --user --no-cache-dir grpcio-tools
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
-          export CC=/opt/homebrew/opt/llvm/bin/clang
          # Used to run the newer GNUMake version from brew that supports --output-sync
          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
-        uses: mxschmitt/action-tmate@v3.19
+        uses: mxschmitt/action-tmate@v3.18
        with:
          detached: true
          connect-timeout-seconds: 180
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -25,7 +25,7 @@ jobs:
        run: |
          make protogen-go swagger
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
 /sources/
 __pycache__/
 *.a
-*.o
 get-sources
 prepare-sources
 /backend/cpp/llama/grpc-server
@@ -13,6 +12,7 @@ prepare-sources

 go-ggml-transformers
 go-gpt2
+go-rwkv
 whisper.cpp
 /bloomz
 go-bert
@@ -54,6 +54,3 @@ docs/static/gallery.html

 # backend virtual environments
 **/venv
-
-# per-developer customization files for the development container
-.devcontainer/customization/*
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -3,12 +3,12 @@
    "configurations": [
        {
            "name": "Python: Current File",
-            "type": "debugpy",
+            "type": "python",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false,
-            "cwd": "${fileDirname}",
+            "cwd": "${workspaceFolder}/examples/langchain-chroma",
            "env": {
                "OPENAI_API_BASE": "http://localhost:8080/v1",
                "OPENAI_API_KEY": "abc"
@@ -19,16 +19,15 @@
            "type": "go",
            "request": "launch",
            "mode": "debug",
-            "program": "${workspaceRoot}",
-            "args": [],
+            "program": "${workspaceFolder}/main.go",
+            "args": [
+                "api"
+            ],
            "env": {
-                "LOCALAI_LOG_LEVEL": "debug",
-                "LOCALAI_P2P": "true",
-                "LOCALAI_FEDERATED": "true"
-            },
-            "buildFlags": ["-tags", "p2p tts", "-v"],
-            "envFile": "${workspaceFolder}/.env",
-            "cwd": "${workspaceRoot}"
+                "C_INCLUDE_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
+                "LIBRARY_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
+                "DEBUG": "true"
+            }
        }
    ]
 }
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -15,6 +15,8 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
 - [Documentation](#documentation)
 - [Community and Communication](#community-and-communication)

+
+
 ## Getting Started

 ### Prerequisites
@@ -52,7 +54,7 @@ If you find a bug, have a feature request, or encounter any issues, please check

 ## Coding Guidelines

- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.

 ## Testing

@@ -82,3 +84,5 @@ We are welcome the contribution of the documents, please open new PR or create a
 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
+
+---
--- a/177
+++ b/177
@@ -8,42 +8,29 @@ FROM ${BASE_IMAGE} AS requirements-core

 USER root

-ARG GO_VERSION=1.22.6
-ARG CMAKE_VERSION=3.26.4
-ARG CMAKE_FROM_SOURCE=false
+ARG GO_VERSION=1.22.5
 ARG TARGETARCH
 ARG TARGETVARIANT

 ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+

 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ccache \
        ca-certificates \
-        curl libssl-dev \
+        cmake \
+        curl \
        git \
        unzip upx-ucl && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
 # Install Go
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
-ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
+ENV PATH $PATH:/root/go/bin:/usr/local/go/bin

 # Install grpc compilers
 RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
@@ -52,36 +39,39 @@ RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates

-RUN test -n "$TARGETARCH" \
-    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
-
 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
 RUN echo "Target Variant: $TARGETVARIANT"

 # Cuda
-ENV PATH=/usr/local/cuda/bin:${PATH}
+ENV PATH /usr/local/cuda/bin:${PATH}

 # HipBLAS requirements
-ENV PATH=/opt/rocm/bin:${PATH}
+ENV PATH /opt/rocm/bin:${PATH}

 # OpenBLAS requirements and stable diffusion
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        libopenblas-dev && \
+        libopenblas-dev \
+        libopencv-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

+# Set up OpenCV
+RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+
 WORKDIR /build

+RUN test -n "$TARGETARCH" \
+    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
+
 ###################################
 ###################################

 # The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
 FROM requirements-core AS requirements-extras

-# Install uv as a system package
-RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
 ENV PATH="/root/.cargo/bin:${PATH}"

 RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
@@ -91,7 +81,7 @@ RUN apt-get update && \
        espeak \
        python3-pip \
        python-is-python3 \
-        python3-dev llvm \
+        python3-dev \
        python3-venv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
@@ -110,13 +100,12 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers
 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=12
 ARG CUDA_MINOR_VERSION=0
-ARG SKIP_DRIVERS=false

 ENV BUILD_TYPE=${BUILD_TYPE}

 # Vulkan requirements
 RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+    if [ "${BUILD_TYPE}" = "vulkan" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils wget gpg-agent && \
@@ -132,7 +121,7 @@ EOT

 # CuBLAS requirements
 RUN <<EOT bash
-    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+    if [ "${BUILD_TYPE}" = "cublas" ]; then
        apt-get update && \
        apt-get install -y  --no-install-recommends \
            software-properties-common pciutils
@@ -158,7 +147,7 @@ RUN <<EOT bash
 EOT

 # If we are building with clblas support, we need the libraries for the builds
-RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            libclblast-dev && \
@@ -166,7 +155,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
        rm -rf /var/lib/apt/lists/* \
    ; fi

-RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            hipblas-dev \
@@ -199,8 +188,6 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
 # This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
 ARG GRPC_MAKEFLAGS="-j4 -Otarget"
 ARG GRPC_VERSION=v1.65.0
-ARG CMAKE_FROM_SOURCE=false
-ARG CMAKE_VERSION=3.26.4

 ENV MAKEFLAGS=${GRPC_MAKEFLAGS}

@@ -209,24 +196,12 @@ WORKDIR /build
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
-        build-essential curl libssl-dev \
+        build-essential \
+        cmake \
        git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

-# Install CMake (the version in 22.04 is too old)
-RUN <<EOT bash
-    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
-        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-    else
-        apt-get update && \
-        apt-get install -y \
-            cmake && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/*
-    fi
-EOT
-
 # We install GRPC to a different prefix here so that we can copy in only the build artifacts later
 # saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
 # and running make install in the target container
@@ -242,14 +217,13 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
 ###################################
 ###################################

-# The builder-base target has the arguments, variables, and copies shared between full builder images and the uncompiled devcontainer
+# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
+# Adjustments to the build process should likely be made here.
+FROM requirements-drivers AS builder

-FROM requirements-drivers AS builder-base
-
-ARG GO_TAGS="tts p2p"
+ARG GO_TAGS="stablediffusion tts p2p"
 ARG GRPC_BACKENDS
 ARG MAKEFLAGS
-ARG LD_FLAGS="-s -w"

 ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
@@ -257,12 +231,14 @@ ENV MAKEFLAGS=${MAKEFLAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
-ENV LD_FLAGS=${LD_FLAGS}
-
-RUN echo "GO_TAGS: $GO_TAGS" && echo "TARGETARCH: $TARGETARCH"

 WORKDIR /build

+COPY . .
+COPY .git .
+RUN echo "GO_TAGS: $GO_TAGS"
+
+RUN make prepare

 # We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
 # but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
@@ -280,12 +256,8 @@ RUN <<EOT bash
    fi
 EOT

-###################################
-###################################
-
-# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
-# Adjustments to the build process should likely be made here.
-FROM builder-base AS builder
+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build

 # Install the pre-built GRPC
 COPY --from=grpc /opt/grpc /usr/local
@@ -293,20 +265,8 @@ COPY --from=grpc /opt/grpc /usr/local
 # Rebuild with defaults backends
 WORKDIR /build

-COPY . .
-COPY .git .
-
-RUN make prepare
-
 ## Build the binary
-## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
-## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
-## (both will use CUDA or hipblas for the actual computation)
-RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-        SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
-    else \
-        make build; \
-    fi
+RUN make build

 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
        mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
@@ -316,38 +276,6 @@ RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
 ###################################
 ###################################

-# The devcontainer target is not used on CI. It is a target for developers to use locally -
-# rather than copying files it mounts them locally and leaves building to the developer
-
-FROM builder-base AS devcontainer
-
-ARG FFMPEG
-
-COPY --from=grpc /opt/grpc /usr/local
-
-COPY .devcontainer-scripts /.devcontainer-scripts
-
-# Add FFmpeg
-RUN if [ "${FFMPEG}" = "true" ]; then \
-        apt-get update && \
-        apt-get install -y --no-install-recommends \
-            ffmpeg && \
-        apt-get clean && \
-        rm -rf /var/lib/apt/lists/* \
-    ; fi
-
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-        ssh less wget
-# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
-
-RUN go install github.com/go-delve/delve/cmd/dlv@latest
-
-RUN go install github.com/mikefarah/yq/v4@latest
-
-###################################
-###################################
-
 # This is the final target. The result of this target will be the image uploaded to the registry.
 # If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
 FROM requirements-drivers
@@ -397,28 +325,42 @@ COPY --from=builder /build/local-ai ./
 # Copy shared libraries for piper
 COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/

+# do not let stablediffusion rebuild (requires an older version of absl)
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
+
 # Change the shell to bash so we can use [[ tests below
 SHELL ["/bin/bash", "-c"]
 # We try to strike a balance between individual layer size (as that affects total push time) and total image size
 # Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
 # Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer

-RUN if [[ ( "${IMAGE_TYPE}" == "extras ")]]; then \
-        apt-get -qq -y install espeak-ng \
-    ; fi
-
 RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/coqui \
    ; fi && \
-    if [[ ( "${EXTRA_BACKENDS}" =~ "faster-whisper" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/faster-whisper \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/parler-tts \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/diffusers \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/transformers-musicgen \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/exllama \
    ; fi

-RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-        make -C backend/python/kokoro \
+RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/vall-e-x \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/openvoice \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "petals" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/petals \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/sentencetransformers \
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/exllama2 \
@@ -438,6 +380,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE
    ; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
        make -C backend/python/rerankers \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/mamba \
    ; fi

 # Make sure the models directory exists
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)
+Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/419
+++ b/419
@@ -6,27 +6,37 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true

 # llama.cpp versions
-CPPLLAMA_VERSION?=300907b2110cc17b4337334dc397e05de2d8f5e0
+GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
+GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
+CPPLLAMA_VERSION?=b841d0740855c5af1344a81f261139a45a2b39ee
+
+# gpt4all version
+GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
+GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
+
+# go-rwkv version
+RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
+RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
+WHISPER_CPP_VERSION?=f68298ce06ca3edd6e6f3f21c3d0bb5f073942c3
+
+# bert.cpp version
+BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
+BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4

 # go-piper version
 PIPER_REPO?=https://github.com/mudler/go-piper
-PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
+PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759

-# bark.cpp
-BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
-BARKCPP_VERSION?=v1.0.0
+# stablediffusion version
+STABLEDIFFUSION_REPO?=https://github.com/mudler/go-stable-diffusion
+STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f

-# stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
-
-ONNX_VERSION?=1.20.0
-ONNX_ARCH?=x64
-ONNX_OS?=linux
+# tinydream version
+TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream
+TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057

 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
@@ -39,7 +49,6 @@ CGO_LDFLAGS_WHISPER+=-lggml
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=
-NATIVE?=false

 TEST_DIR=/tmp/test

@@ -78,25 +87,7 @@ ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif

-# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
-ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
-endif
-
-# Detect if we are running on arm64
-ifneq (,$(findstring aarch64,$(shell uname -m)))
-	ONNX_ARCH=aarch64
-endif
-
 ifeq ($(OS),Darwin)
-	ONNX_OS=osx
-	ifneq (,$(findstring aarch64,$(shell uname -m)))
-		ONNX_ARCH=arm64
-	else ifneq (,$(findstring arm64,$(shell uname -m)))
-		ONNX_ARCH=arm64
-	else
-		ONNX_ARCH=x86_64
-	endif

 	ifeq ($(OSX_SIGNING_IDENTITY),)
 		OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
@@ -149,11 +140,12 @@ ifeq ($(BUILD_TYPE),hipblas)
 	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
+	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
-	export GGML_HIP=1
+	export GGML_HIPBLAS=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif

@@ -172,6 +164,16 @@ ifeq ($(STATIC),true)
 	LD_FLAGS+=-linkmode external -extldflags -static
 endif

+ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
+#	OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
+	OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
+endif
+
+ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
+#	OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
+	OPTIONAL_GRPC+=backend-assets/grpc/tinydream
+endif
+
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
 #	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
 #	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
@@ -181,23 +183,17 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 endif

 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
+ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
+ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
+ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
+ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
-
-ifeq ($(ONNX_OS),linux)
-ifeq ($(ONNX_ARCH),x64)
-	ALL_GRPC_BACKENDS+=backend-assets/grpc/bark-cpp
-	ALL_GRPC_BACKENDS+=backend-assets/grpc/stablediffusion-ggml
-endif
-endif
-
 ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
-ALL_GRPC_BACKENDS+=backend-assets/grpc/silero-vad
 ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
 # Use filter-out to remove the specified backends
 ALL_GRPC_BACKENDS := $(filter-out $(SKIP_GRPC_BACKEND),$(ALL_GRPC_BACKENDS))
@@ -218,22 +214,31 @@ endif

 all: help

-## bark.cpp
-sources/bark.cpp:
-	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
-	cd sources/bark.cpp && \
-	git checkout $(BARKCPP_VERSION) && \
+## BERT embeddings
+sources/go-bert.cpp:
+	mkdir -p sources/go-bert.cpp
+	cd sources/go-bert.cpp && \
+	git init && \
+	git remote add origin $(BERT_REPO) && \
+	git fetch origin && \
+	git checkout $(BERT_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch

-sources/bark.cpp/build/libbark.a: sources/bark.cpp
-	cd sources/bark.cpp && \
-	mkdir -p build && \
-	cd build && \
-	cmake $(CMAKE_ARGS) .. && \
-	cmake --build . --config Release
+sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
+	$(MAKE) -C sources/go-bert.cpp libgobert.a

-backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a
-	$(MAKE) -C backend/go/bark libbark.a
+## go-llama.cpp
+sources/go-llama.cpp:
+	mkdir -p sources/go-llama.cpp
+	cd sources/go-llama.cpp && \
+	git init && \
+	git remote add origin $(GOLLAMA_REPO) && \
+	git fetch origin && \
+	git checkout $(GOLLAMA_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
+	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a

 ## go-piper
 sources/go-piper:
@@ -248,37 +253,57 @@ sources/go-piper:
 sources/go-piper/libpiper_binding.a: sources/go-piper
 	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o

-## stablediffusion (ggml)
-sources/stablediffusion-ggml.cpp:
-	git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
-	cd sources/stablediffusion-ggml.cpp && \
-	git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
+## GPT4ALL
+sources/gpt4all:
+	mkdir -p sources/gpt4all
+	cd sources/gpt4all && \
+	git init && \
+	git remote add origin $(GPT4ALL_REPO) && \
+	git fetch origin && \
+	git checkout $(GPT4ALL_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch

-backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
-	$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a
-	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
+sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
+	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a

-backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/stablediffusion-ggml
-endif
+## RWKV
+sources/go-rwkv.cpp:
+	mkdir -p sources/go-rwkv.cpp
+	cd sources/go-rwkv.cpp && \
+	git init && \
+	git remote add origin $(RWKV_REPO) && \
+	git fetch origin && \
+	git checkout $(RWKV_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch

-sources/onnxruntime:
-	mkdir -p sources/onnxruntime
-	curl -L https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
-	cd sources/onnxruntime && tar -xvf onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz && rm onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
-	cd sources/onnxruntime && mv onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION)/* ./
+sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
+	cd sources/go-rwkv.cpp && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..

-backend-assets/lib/libonnxruntime.so.1: backend-assets/lib sources/onnxruntime
-	cp -rfv sources/onnxruntime/lib/* backend-assets/lib/
-ifeq ($(OS),Darwin)
-	mv backend-assets/lib/libonnxruntime.$(ONNX_VERSION).dylib backend-assets/lib/libonnxruntime.dylib
-else
-	mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1
-endif
+## stable diffusion
+sources/go-stable-diffusion:
+	mkdir -p sources/go-stable-diffusion
+	cd sources/go-stable-diffusion && \
+	git init && \
+	git remote add origin $(STABLEDIFFUSION_REPO) && \
+	git fetch origin && \
+	git checkout $(STABLEDIFFUSION_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
+	CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
+
+## tiny-dream
+sources/go-tiny-dream:
+	mkdir -p sources/go-tiny-dream
+	cd sources/go-tiny-dream && \
+	git init && \
+	git remote add origin $(TINYDREAM_REPO) && \
+	git fetch origin && \
+	git checkout $(TINYDREAM_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
+	$(MAKE) -C sources/go-tiny-dream libtinydream.a

 ## whisper
 sources/whisper.cpp:
@@ -293,17 +318,29 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a

-get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp

 replace:
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
+	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
+	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
+	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp

 dropreplace:
+	$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
+	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
+	$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
+	$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
+	$(GOCMD) mod edit -dropreplace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang
+	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp

 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -311,8 +348,14 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
+	$(MAKE) -C sources/go-llama.cpp clean
+	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
+	$(MAKE) -C sources/go-rwkv.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
+	$(MAKE) -C sources/go-stable-diffusion clean
+	$(MAKE) -C sources/go-bert.cpp clean
 	$(MAKE) -C sources/go-piper clean
+	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build

 prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -325,9 +368,7 @@ clean: ## Remove build related file
 	rm -rf release/
 	rm -rf backend-assets/*
 	$(MAKE) -C backend/cpp/grpc clean
-	$(MAKE) -C backend/go/bark clean
 	$(MAKE) -C backend/cpp/llama clean
-	$(MAKE) -C backend/go/image/stablediffusion-ggml clean
 	rm -rf backend/cpp/llama-* || true
 	$(MAKE) dropreplace
 	$(MAKE) protogen-clean
@@ -338,9 +379,6 @@ clean-tests:
 	rm -rf test-dir
 	rm -rf core/http/backend-assets

-clean-dc: clean
-	cp -r /build/backend-assets /workspace/backend-assets
-
 ## Build:
 build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
@@ -358,7 +396,7 @@ build-minimal:
 	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build

 build-api:
-	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=p2p $(MAKE) build
+	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build

 backend-assets/lib:
 	mkdir -p backend-assets/lib
@@ -369,7 +407,7 @@ ifeq ($(DETECT_LIBS),true)
 	scripts/prepare-libs.sh backend-assets/grpc/llama-cpp-avx2
 endif
 ifeq ($(OS),Darwin)
-	BUILD_TYPE=none $(MAKE) backend-assets/grpc/llama-cpp-fallback
+	$(info ${GREEN}I Skip CUDA/hipblas build on MacOS${RESET})
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
@@ -414,10 +452,12 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
+	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
+	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
+	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
 	cp tests/models_fixtures/* test-models

 prepare-test: grpcs
@@ -426,10 +466,12 @@ prepare-test: grpcs

 test: prepare test-models/testmodel.ggml grpcs
 	@echo 'Running tests'
-	export GO_TAGS="tts debug"
+	export GO_TAGS="tts stablediffusion debug"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(MAKE) test-gpt4all
+	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -444,36 +486,44 @@ run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests

-run-e2e-aio: protogen-go
+run-e2e-aio:
 	@echo 'Running e2e AIO tests'
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio

 test-e2e:
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
 	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e

 teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)

+test-gpt4all: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)
+
+test-llama: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
+
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)

 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)

 test-stablediffusion: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)

 test-stores: backend-assets/grpc/local-store
 	mkdir -p tests/integration/backend-assets/grpc
 	cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration

 test-container:
 	docker build --target requirements -t local-ai-test-container .
@@ -509,10 +559,10 @@ protogen-go-clean:
 	$(RM) bin/*

 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen

 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean  exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean

 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -546,13 +596,13 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean

-.PHONY: faster-whisper-protogen
-faster-whisper-protogen:
-	$(MAKE) -C backend/python/faster-whisper protogen
+.PHONY: exllama-protogen
+exllama-protogen:
+	$(MAKE) -C backend/python/exllama protogen

-.PHONY: faster-whisper-protogen-clean
-faster-whisper-protogen-clean:
-	$(MAKE) -C backend/python/faster-whisper protogen-clean
+.PHONY: exllama-protogen-clean
+exllama-protogen-clean:
+	$(MAKE) -C backend/python/exllama protogen-clean

 .PHONY: exllama2-protogen
 exllama2-protogen:
@@ -562,6 +612,22 @@ exllama2-protogen:
 exllama2-protogen-clean:
 	$(MAKE) -C backend/python/exllama2 protogen-clean

+.PHONY: mamba-protogen
+mamba-protogen:
+	$(MAKE) -C backend/python/mamba protogen
+
+.PHONY: mamba-protogen-clean
+mamba-protogen-clean:
+	$(MAKE) -C backend/python/mamba protogen-clean
+
+.PHONY: petals-protogen
+petals-protogen:
+	$(MAKE) -C backend/python/petals protogen
+
+.PHONY: petals-protogen-clean
+petals-protogen-clean:
+	$(MAKE) -C backend/python/petals protogen-clean
+
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
@@ -570,6 +636,14 @@ rerankers-protogen:
 rerankers-protogen-clean:
 	$(MAKE) -C backend/python/rerankers protogen-clean

+.PHONY: sentencetransformers-protogen
+sentencetransformers-protogen:
+	$(MAKE) -C backend/python/sentencetransformers protogen
+
+.PHONY: sentencetransformers-protogen-clean
+sentencetransformers-protogen-clean:
+	$(MAKE) -C backend/python/sentencetransformers protogen-clean
+
 .PHONY: transformers-protogen
 transformers-protogen:
 	$(MAKE) -C backend/python/transformers protogen
@@ -578,13 +652,37 @@ transformers-protogen:
 transformers-protogen-clean:
 	$(MAKE) -C backend/python/transformers protogen-clean

-.PHONY: kokoro-protogen
-kokoro-protogen:
-	$(MAKE) -C backend/python/kokoro protogen
+.PHONY: parler-tts-protogen
+parler-tts-protogen:
+	$(MAKE) -C backend/python/parler-tts protogen

-.PHONY: kokoro-protogen-clean
-kokoro-protogen-clean:
-	$(MAKE) -C backend/python/kokoro protogen-clean
+.PHONY: parler-tts-protogen-clean
+parler-tts-protogen-clean:
+	$(MAKE) -C backend/python/parler-tts protogen-clean
+
+.PHONY: transformers-musicgen-protogen
+transformers-musicgen-protogen:
+	$(MAKE) -C backend/python/transformers-musicgen protogen
+
+.PHONY: transformers-musicgen-protogen-clean
+transformers-musicgen-protogen-clean:
+	$(MAKE) -C backend/python/transformers-musicgen protogen-clean
+
+.PHONY: vall-e-x-protogen
+vall-e-x-protogen:
+	$(MAKE) -C backend/python/vall-e-x protogen
+
+.PHONY: vall-e-x-protogen-clean
+vall-e-x-protogen-clean:
+	$(MAKE) -C backend/python/vall-e-x protogen-clean
+
+.PHONY: openvoice-protogen
+openvoice-protogen:
+	$(MAKE) -C backend/python/openvoice protogen
+
+.PHONY: openvoice-protogen-clean
+openvoice-protogen-clean:
+	$(MAKE) -C backend/python/openvoice protogen-clean

 .PHONY: vllm-protogen
 vllm-protogen:
@@ -601,11 +699,17 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
-	$(MAKE) -C backend/python/faster-whisper
 	$(MAKE) -C backend/python/vllm
+	$(MAKE) -C backend/python/mamba
+	$(MAKE) -C backend/python/sentencetransformers
 	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
-	$(MAKE) -C backend/python/kokoro
+	$(MAKE) -C backend/python/transformers-musicgen
+	$(MAKE) -C backend/python/parler-tts
+	$(MAKE) -C backend/python/vall-e-x
+	$(MAKE) -C backend/python/openvoice
+	$(MAKE) -C backend/python/exllama
+	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2

 prepare-test-extra: protogen-python
@@ -626,9 +730,29 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
 	mkdir -p backend-assets/espeak-ng-data
 	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data

+backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
+	mkdir -p $@
+	@cp $</gpt4all-bindings/golang/buildllm/*.so $@/ || true
+	@cp $</gpt4all-bindings/golang/buildllm/*.dylib $@/ || true
+	@cp $</gpt4all-bindings/golang/buildllm/*.dll $@/ || true
+
 backend-assets/grpc: protogen-go replace
 	mkdir -p backend-assets/grpc

+backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/$< LIBRARY_PATH=$(CURDIR)/$< \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $@ ./backend/go/llm/bert/
+ifneq ($(UPX),)
+	$(UPX) $@
+endif
+
+backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/$</gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/$</gpt4all-bindings/golang/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $@ ./backend/go/llm/gpt4all/
+ifneq ($(UPX),)
+	$(UPX) $@
+endif
+
 backend-assets/grpc/huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
 ifneq ($(UPX),)
@@ -659,6 +783,9 @@ else
 	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
 endif
+ifneq ($(UPX),)
+	$(UPX) backend/cpp/${VARIANT}/grpc-server
+endif

 # This target is for manually building a variant with-auto detected flags
 backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/llama.cpp
@@ -675,13 +802,6 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2

-backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
-	cp -rf backend/cpp/llama backend/cpp/llama-avx512
-	$(MAKE) -C backend/cpp/llama-avx512 purge
-	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
-	cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
-
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
@@ -695,6 +815,10 @@ backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/ll
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
+# TODO: every binary should have its own folder instead, so can have different metal implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
+endif

 backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
 	cp -rf backend/cpp/llama backend/cpp/llama-cuda
@@ -707,7 +831,7 @@ backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc backend/cpp/llama/lla
 	cp -rf backend/cpp/llama backend/cpp/llama-hipblas
 	$(MAKE) -C backend/cpp/llama-hipblas purge
 	$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
+	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas

 backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc backend/cpp/llama/llama.cpp
@@ -734,12 +858,15 @@ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc backend/cpp/llama/llama.
 backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
-
-backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
 ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/bark-cpp
+	$(UPX) backend-assets/util/llama-cpp-rpc-server
+endif
+
+backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/$< LIBRARY_PATH=$(CURDIR)/$< \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $@ ./backend/go/llm/llama-ggml/
+ifneq ($(UPX),)
+	$(UPX) $@
 endif

 backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
@@ -749,16 +876,30 @@ ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/piper
 endif

-backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero
+backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
 ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/silero-vad
+	$(UPX) backend-assets/grpc/rwkv
+endif
+
+backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
+ifneq ($(UPX),)
+	$(UPX) backend-assets/grpc/stablediffusion
+endif
+
+backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-tiny-dream \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
+ifneq ($(UPX),)
+	$(UPX) backend-assets/grpc/tinydream
+endif

 backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/whisper
 endif
@@ -809,7 +950,7 @@ docker-aio-all:

 docker-image-intel:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -817,7 +958,7 @@ docker-image-intel:

 docker-image-intel-xpu:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.2.0-devel-ubuntu22.04 \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
 		--build-arg GO_TAGS="none" \
 		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -829,7 +970,7 @@ swagger:

 .PHONY: gen-assets
 gen-assets:
-	$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
+	$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets

 ## Documentation
 docs/layouts/_default:
--- a/README.md
+++ b/README.md
@@ -38,13 +38,9 @@
 </a>
 </p>

-<p align="center">
-<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
-</p>
-
 > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
 >
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

 [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

@@ -60,59 +56,39 @@ curl https://localai.io/install.sh | sh

 Or run with docker:
 ```bash
-# CPU only image:
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
-
-# Nvidia GPU:
-docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
-
-# CPU and GPU image (bigger size):
-docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
-
-# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
-```
-
-To load models:
-
-```bash
-# From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io)
-local-ai run llama-3.2-1b-instruct:q4_k_m
-# Start LocalAI with the phi-2 model directly from huggingface
-local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
-# Install and run a model from the Ollama OCI registry
-local-ai run ollama://gemma:2b
-# Run a model from a configuration file
-local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
-# Install and run a model from a standard OCI registry (e.g., Docker Hub)
-local-ai run oci://localai/phi-2:latest
+# Alternative images:
+# - if you have an Nvidia GPU:
+# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
+# - without preconfigured models
+# docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
+# - without preconfigured models for Nvidia GPUs
+# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12 
 ```

 [💻 Getting started](https://localai.io/basics/getting_started/index.html)

-## 📰 Latest project news
+## 🔥🔥 Hot topics / Roadmap

- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
- Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
- Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
- Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723. P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
+[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
+
+- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
+- June 2024: 🆕 You can now browse the model gallery without LocalAI! Check out https://models.localai.io
+- June 2024: Support for models from OCI registries: https://github.com/mudler/LocalAI/pull/2628
 - May 2024: 🔥🔥 Decentralized P2P llama.cpp:  https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs  https://localai.io/features/distribute/
+- May 2024: 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
+- May 2024: 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
 - May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
+- May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
 - April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121

-Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
+Hot topics (looking for contributors):

-## 🔥🔥 Hot topics (looking for help):
-
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
 - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
 - Backends v2: https://github.com/mudler/LocalAI/issues/1126
 - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
 - Assistant API: https://github.com/mudler/LocalAI/issues/1273
+- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
 - Vulkan: https://github.com/mudler/LocalAI/issues/1647
 - Anthropic API: https://github.com/mudler/LocalAI/issues/1808

@@ -120,10 +96,10 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl

 ## 🚀 [Features](https://localai.io/features/)

- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
 - 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
 - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation](https://localai.io/features/image-generation)
+- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
 - 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/) 
 - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
@@ -131,7 +107,6 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
 - 📈 [Reranker API](https://localai.io/features/reranker/)
 - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- 🔊 Voice activity detection (Silero-VAD support)
 - 🌍 Integrated WebUI!

 ## 💻 Usage
@@ -154,7 +129,6 @@ Model galleries
 Other:
 - Helm chart https://github.com/go-skynet/helm-charts
 - VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
 - Terminal utility https://github.com/djcopley/ShellOracle
 - Local Smart assistant https://github.com/mudler/LocalAGI
 - Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
@@ -162,9 +136,6 @@ Other:
 - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
 - Shell-Pilot(Interact with LLM using LocalAI models via pure shell scripts on your Linux or MacOS system) https://github.com/reid41/shell-pilot
 - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
- Another Telegram Bot https://github.com/JackBekket/Hellper
- Auto-documentation https://github.com/JackBekket/Reflexia
- Github bot which answer on issues, with code and documentation as context https://github.com/JackBekket/GitHelper
 - Github Actions: https://github.com/marketplace/actions/start-localai
 - Examples: https://github.com/mudler/LocalAI/tree/master/examples/
  
@@ -179,7 +150,6 @@ Other:

 ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

- [Run Visual studio code with LocalAI (SUSE)](https://www.suse.com/c/running-ai-locally/)
 - 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/)
 - [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
 - [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
@@ -239,6 +209,7 @@ LocalAI couldn't have been built without the help of great software already avai
 - https://github.com/antimatter15/alpaca.cpp
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/ggerganov/whisper.cpp
+- https://github.com/saharNooby/rwkv.cpp
 - https://github.com/rhasspy/piper

 ## 🤗 Contributors
--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -1,7 +1,7 @@
 name: text-embedding-ada-002
-embeddings: true
+backend: bert-embeddings
 parameters:
-  model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
+  model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin

 usage: |
    You can test this model with curl like this:
--- a/aio/cpu/image-gen.yaml
+++ b/aio/cpu/image-gen.yaml
@@ -1,17 +1,56 @@
 name: stablediffusion
-backend: stablediffusion-ggml
-cfg_scale: 4.5
-
-options:
- sampler:euler
+backend: stablediffusion
 parameters:
-  model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
-step: 25
+  model: stablediffusion_assets
+
+license: "BSD-3"
+urls:
+- https://github.com/EdVince/Stable-Diffusion-NCNN
+- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
+
+description: |
+     Stable Diffusion in NCNN with c++, supported txt2img and img2img

 download_files:
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
-  sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
-  uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
+- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
+  sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
+- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
+  sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
+- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
+  sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
+- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
+  sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
+- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
+  sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
+- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
+  sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
+- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
+  sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
+- filename: "stablediffusion_assets/log_sigmas.bin"
+  sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
+- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
+  sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
+- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
+  sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
+- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
+  sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
+- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
+  sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
+- filename: "stablediffusion_assets/vocab.txt"
+  sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"

 usage: |
        curl http://localhost:8080/v1/images/generations \
--- a/aio/cpu/vad.yaml
+++ b/aio/cpu/vad.yaml
@@ -1,8 +0,0 @@
-backend: silero-vad
-name: silero-vad
-parameters:
-  model: silero-vad.onnx
-download_files:
- filename: silero-vad.onnx
-  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
-  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4o
+name: gpt-4-vision-preview

 roles:
  user: "USER:"
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -129,7 +129,7 @@ detect_gpu
 detect_gpu_size

 PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
-export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"

 check_vars

--- a/aio/gpu-8g/vad.yaml
+++ b/aio/gpu-8g/vad.yaml
@@ -1,8 +0,0 @@
-backend: silero-vad
-name: silero-vad
-parameters:
-  model: silero-vad.onnx
-download_files:
- filename: silero-vad.onnx
-  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
-  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 f16: true
 mmap: true
-name: gpt-4o
+name: gpt-4-vision-preview

 roles:
  user: "USER:"
--- a/aio/intel/image-gen.yaml
+++ b/aio/intel/image-gen.yaml
@@ -1,6 +1,6 @@
 name: stablediffusion
 parameters:
-  model: Lykon/dreamshaper-8
+  model: runwayml/stable-diffusion-v1-5
 backend: diffusers
 step: 25
 f16: true
--- a/aio/intel/vad.yaml
+++ b/aio/intel/vad.yaml
@@ -1,8 +0,0 @@
-backend: silero-vad
-name: silero-vad
-parameters:
-  model: silero-vad.onnx
-download_files:
- filename: silero-vad.onnx
-  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
-  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -2,7 +2,7 @@ backend: llama-cpp
 context_size: 4096
 mmap: false
 f16: false
-name: gpt-4o
+name: gpt-4-vision-preview

 roles:
  user: "USER:"
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -16,7 +16,6 @@ service Backend {
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
-  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}

@@ -26,21 +25,6 @@ service Backend {
  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}

  rpc Rerank(RerankRequest) returns (RerankResult) {}
-
-  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
-
-  rpc VAD(VADRequest) returns (VADResponse) {}
-}
-
-// Define the empty request
-message MetricsRequest {}
-
-message MetricsResponse {
-  int32 slot_id = 1;
-  string prompt_json_for_slot = 2;  // Stores the prompt as a JSON string.
-  float tokens_per_second = 3;
-  int32 tokens_generated = 4;
-  int32 prompt_tokens_processed = 5;
 }

 message RerankRequest {
@@ -149,9 +133,6 @@ message PredictOptions {
  repeated string Images = 42;
  bool UseTokenizerTemplate = 43;
  repeated Message Messages = 44;
-  repeated string Videos = 45;
-  repeated string Audios = 46;
-  string CorrelationId = 47;
 }

 // The response message containing the result
@@ -159,13 +140,6 @@ message Reply {
  bytes message = 1;
  int32 tokens = 2;
  int32 prompt_tokens = 3;
-  double timing_prompt_processing = 4;
-  double timing_token_generation = 5;
-}
-
-message GrammarTrigger {
-  string word = 1;
-  bool at_start = 2; 
 }

 message ModelOptions {
@@ -228,7 +202,6 @@ message ModelOptions {
  int32  SwapSpace = 53;
  int32  MaxModelLen = 54;
  int32  TensorParallelSize = 55;
-  string LoadFormat = 58;

  string MMProj = 41;

@@ -242,18 +215,6 @@ message ModelOptions {

  bool FlashAttention = 56;
  bool NoKVOffload = 57;
-
-  string ModelPath = 59;
-
-  repeated string LoraAdapters = 60;
-  repeated float LoraScales = 61;
-
-  repeated string Options = 62;
-
-  string CacheTypeKey = 63;
-  string CacheTypeValue = 64;
-
-  repeated GrammarTrigger GrammarTriggers = 65;
 }

 message Result {
@@ -309,30 +270,6 @@ message TTSRequest {
  optional string language = 5;
 }

-message VADRequest {
-  repeated float audio = 1;
-}
-
-message VADSegment {
-  float start = 1;
-  float end = 2;
-}
-
-message VADResponse {
-  repeated VADSegment segments = 1;
-}
-
-message SoundGenerationRequest {
-  string text = 1;
-  string model = 2;
-  string dst = 3;
-  optional float duration = 4;
-  optional float temperature = 5;
-  optional bool sample = 6;
-  optional string src = 7;
-  optional int32 src_divisor = 8;
-}
-
 message TokenizationResponse {
  int32 length = 1;
  repeated int32 tokens = 2;
@@ -357,4 +294,4 @@ message StatusResponse {
 message Message {
  string role = 1;
  string content = 2;
-}
+}
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -75,11 +75,24 @@ add_library(hw_grpc_proto
  ${hw_proto_hdrs} )

 add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
-target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
-  absl::flags_parse
-  gRPC::${_REFLECTION}
-  gRPC::${_GRPC_GRPCPP}
-  protobuf::${_PROTOBUF_LIBPROTOBUF})
+
+# Conditionally link SYCL to grpc-server
+# https://github.com/ggerganov/llama.cpp/issues/8665
+if ( DEFINED ENV{ONEAPI_ROOT})
+    target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+      absl::flags_parse
+      gRPC::${_REFLECTION}
+      gRPC::${_GRPC_GRPCPP}
+      protobuf::${_PROTOBUF_LIBPROTOBUF}
+      sycl)
+else()
+    target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+      absl::flags_parse
+      gRPC::${_REFLECTION}
+      gRPC::${_GRPC_GRPCPP}
+      protobuf::${_PROTOBUF_LIBPROTOBUF})
+endif()
+
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
--- a/backend/cpp/llama/CMakeLists.txt.rpc-8662
+++ b/backend/cpp/llama/CMakeLists.txt.rpc-8662
@@ -0,0 +1,8 @@
+# https://github.com/ggerganov/llama.cpp/issues/8665
+
+add_executable(rpc-server rpc-server.cpp)
+if ( DEFINED ENV{ONEAPI_ROOT})
+target_link_libraries(rpc-server PRIVATE ggml llama sycl)
+else()
+target_link_libraries(rpc-server PRIVATE ggml llama)
+endif()
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -22,7 +22,7 @@ else ifeq ($(BUILD_TYPE),clblas)
 	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIP=ON
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
 # If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
 else ifeq ($(OS),Darwin)
@@ -30,7 +30,9 @@ else ifeq ($(OS),Darwin)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 	else
 		CMAKE_ARGS+=-DGGML_METAL=ON
-		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+# Until this is tested properly, we disable the embedded metal file,
+# as we already embed it as part of the LocalAI assets
+		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
 		TARGET+=--target ggml-metal
 	endif
 endif
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
--- a/backend/cpp/llama/patches/01-llava.patch
+++ b/backend/cpp/llama/patches/01-llava.patch
@@ -1,13 +0,0 @@
-diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
-index 3cd0d2fa..6c5e811a 100644
--- a/examples/llava/clip.cpp
-+++ b/examples/llava/clip.cpp
-@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
-                 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
-                 int* patches_data = (int*)malloc(ggml_nbytes(patches));
-                 for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + 1;
-+                    patches_data[i] = i;
-                 }
-                 ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-                 free(patches_data);
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -1,12 +1,5 @@
 #!/bin/bash

-## Patches
-## Apply patches from the `patches` directory
-for patch in $(ls patches); do
-    echo "Applying patch $patch"
-    patch -d llama.cpp/ -p1 < patches/$patch
-done 
-
 cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
 cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
 cp -rfv json.hpp llama.cpp/examples/grpc-server/
@@ -24,4 +17,7 @@ cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
 cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
 echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
 cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
-cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+
+# https://github.com/ggerganov/llama.cpp/issues/8665
+cp -rfv CMakeLists.txt.rpc-8662 llama.cpp/examples/rpc/CMakeLists.txt
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -480,4 +480,31 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
    }

    return ret;
+}
+
+//
+// random string / id
+//
+
+static std::string random_string()
+{
+    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    std::random_device rd;
+    std::mt19937 generator(rd());
+
+    std::string result(32, ' ');
+
+    for (int i = 0; i < 32; ++i) {
+        result[i] = str[generator() % str.size()];
+    }
+
+    return result;
+}
+
+static std::string gen_chatcmplid()
+{
+    std::stringstream chatcmplid;
+    chatcmplid << "chatcmpl-" << random_string();
+    return chatcmplid.str();
 }
--- a/backend/go/bark/Makefile
+++ b/backend/go/bark/Makefile
@@ -1,25 +0,0 @@
-INCLUDE_PATH := $(abspath ./)
-LIBRARY_PATH := $(abspath ./)
-
-AR?=ar
-
-BUILD_TYPE?=
-# keep standard at C11 and C++11
-CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
-LDFLAGS  = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
-
-# warnings
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
-
-gobark.o:
-	$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
-
-libbark.a: gobark.o
-	cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
-	$(AR) rcs libbark.a gobark.o
-	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
-	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
-	$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
-
-clean:
-	rm -f gobark.o libbark.a
--- a/backend/go/bark/gobark.cpp
+++ b/backend/go/bark/gobark.cpp
@@ -1,85 +0,0 @@
-#include <iostream>
-#include <tuple>
-
-#include "bark.h"
-#include "gobark.h"
-#include "common.h"
-#include "ggml.h"
-
-struct bark_context *c;
-
-void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) {
-    if (step == bark_encoding_step::SEMANTIC) {
-        printf("\rGenerating semantic tokens... %d%%", progress);
-    } else if (step == bark_encoding_step::COARSE) {
-        printf("\rGenerating coarse tokens... %d%%", progress);
-    } else if (step == bark_encoding_step::FINE) {
-        printf("\rGenerating fine tokens... %d%%", progress);
-    }
-    fflush(stdout);
-}
-
-int load_model(char *model) {
-    // initialize bark context
-    struct bark_context_params ctx_params = bark_context_default_params();
-    bark_params params;
-
-    params.model_path = model;
-
-   // ctx_params.verbosity = verbosity;
-    ctx_params.progress_callback = bark_print_progress_callback;
-    ctx_params.progress_callback_user_data = nullptr;
-
-    struct bark_context *bctx = bark_load_model(params.model_path.c_str(), ctx_params, params.seed);
-    if (!bctx) {
-        fprintf(stderr, "%s: Could not load model\n", __func__);
-        return 1;
-    }
-
-    c = bctx;
-
-    return 0;
-}
-
-int tts(char *text,int  threads, char *dst ) {
-
-    ggml_time_init();
-    const int64_t t_main_start_us = ggml_time_us();
-
-    // generate audio
-    if (!bark_generate_audio(c, text, threads)) {
-        fprintf(stderr, "%s: An error occured. If the problem persists, feel free to open an issue to report it.\n", __func__);
-        return 1;
-    }
-
-    const float *audio_data = bark_get_audio_data(c);
-    if (audio_data == NULL) {
-        fprintf(stderr, "%s: Could not get audio data\n", __func__);
-        return 1;
-    }
-
-    const int audio_arr_size = bark_get_audio_data_size(c);
-
-    std::vector<float> audio_arr(audio_data, audio_data + audio_arr_size);
-
-    write_wav_on_disk(audio_arr, dst);
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-        const int64_t t_load_us = bark_get_load_time(c);
-        const int64_t t_eval_us = bark_get_eval_time(c);
-
-        printf("\n\n");
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
-        printf("%s:     eval time = %8.2f ms\n", __func__, t_eval_us / 1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
-    }
-    
-    return 0;
-}
-
-int unload() {
-    bark_free(c);
-}
-
--- a/backend/go/bark/gobark.go
+++ b/backend/go/bark/gobark.go
@@ -1,52 +0,0 @@
-package main
-
-// #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
-// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
-// #include <gobark.h>
-// #include <stdlib.h>
-import "C"
-
-import (
-	"fmt"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-)
-
-type Bark struct {
-	base.SingleThread
-	threads int
-}
-
-func (sd *Bark) Load(opts *pb.ModelOptions) error {
-
-	sd.threads = int(opts.Threads)
-
-	modelFile := C.CString(opts.ModelFile)
-	defer C.free(unsafe.Pointer(modelFile))
-
-	ret := C.load_model(modelFile)
-	if ret != 0 {
-		return fmt.Errorf("inference failed")
-	}
-
-	return nil
-}
-
-func (sd *Bark) TTS(opts *pb.TTSRequest) error {
-	t := C.CString(opts.Text)
-	defer C.free(unsafe.Pointer(t))
-
-	dst := C.CString(opts.Dst)
-	defer C.free(unsafe.Pointer(dst))
-
-	threads := C.int(sd.threads)
-
-	ret := C.tts(t, threads, dst)
-	if ret != 0 {
-		return fmt.Errorf("inference failed")
-	}
-
-	return nil
-}
--- a/backend/go/bark/gobark.h
+++ b/backend/go/bark/gobark.h
@@ -1,8 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-int load_model(char *model);
-int tts(char *text,int  threads, char *dst );
-#ifdef __cplusplus
-}
-#endif
--- a/backend/go/image/stablediffusion-ggml/Makefile
+++ b/backend/go/image/stablediffusion-ggml/Makefile
@@ -1,96 +0,0 @@
-INCLUDE_PATH := $(abspath ./)
-LIBRARY_PATH := $(abspath ./)
-
-AR?=ar
-CMAKE_ARGS?=
-BUILD_TYPE?=
-ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
-# keep standard at C11 and C++11
-CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
-
-# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
-CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
-
-# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
-ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DGGML_CUDA=ON
-# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-# to CMAKE_ARGS automatically
-else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
-# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-else ifeq ($(BUILD_TYPE),clblas)
-	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
-# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
-else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIP=ON
-# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
-# But if it's OSX without metal, disable it here
-else ifeq ($(OS),Darwin)
-	ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DGGML_METAL=OFF
-	else
-		CMAKE_ARGS+=-DGGML_METAL=ON
-		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
-		TARGET+=--target ggml-metal
-	endif
-endif
-
-# ifeq ($(BUILD_TYPE),sycl_f16)
-# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
-# endif
-
-# ifeq ($(BUILD_TYPE),sycl_f32)
-# 	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
-# endif
-
-# warnings
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
-
-# Find all .a archives in ARCHIVE_DIR
-# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
-GGML_ARCHIVE_DIR := build/ggml/src/
-ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
-
-# Name of the single merged library
-COMBINED_LIB := libggmlall.a
-
-# Rule to merge all the .a files into one
-$(COMBINED_LIB): $(ALL_ARCHIVES)
-	@echo "Merging all .a into $(COMBINED_LIB)"
-	rm -f $@
-	mkdir -p merge-tmp
-	for a in $(ALL_ARCHIVES); do \
-		( cd merge-tmp && ar x ../$$a ); \
-	done
-	( cd merge-tmp && ar rcs ../$@ *.o )
-	# Ensure we have a proper index
-	ranlib $@
-	# Clean up
-	rm -rf merge-tmp
-
-build/libstable-diffusion.a:
-	@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
-ifneq (,$(findstring sycl,$(BUILD_TYPE)))
-	+bash -c "source $(ONEAPI_VARS); \
-	mkdir -p build && \
-	cd build && \
-	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
-	cmake --build . --config Release"
-else
-	mkdir -p build && \
-	cd build && \
-	cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
-	cmake --build . --config Release
-endif
-	$(MAKE) $(COMBINED_LIB)
-
-gosd.o:
-	$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
-
-libsd.a: gosd.o
-	cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
-	$(AR) rcs libsd.a gosd.o
-
-clean:
-	rm -rf gosd.o libsd.a build $(COMBINED_LIB)
--- a/backend/go/image/stablediffusion-ggml/gosd.cpp
+++ b/backend/go/image/stablediffusion-ggml/gosd.cpp
@@ -1,228 +0,0 @@
-#include <stdio.h>
-#include <string.h>
-#include <time.h>
-#include <iostream>
-#include <random>
-#include <string>
-#include <vector>
-#include "gosd.h"
-
-// #include "preprocessing.hpp"
-#include "flux.hpp"
-#include "stable-diffusion.h"
-
-#define STB_IMAGE_IMPLEMENTATION
-#define STB_IMAGE_STATIC
-#include "stb_image.h"
-
-#define STB_IMAGE_WRITE_IMPLEMENTATION
-#define STB_IMAGE_WRITE_STATIC
-#include "stb_image_write.h"
-
-#define STB_IMAGE_RESIZE_IMPLEMENTATION
-#define STB_IMAGE_RESIZE_STATIC
-#include "stb_image_resize.h"
-
-// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
-const char* sample_method_str[] = {
-    "euler_a",
-    "euler",
-    "heun",
-    "dpm2",
-    "dpm++2s_a",
-    "dpm++2m",
-    "dpm++2mv2",
-    "ipndm",
-    "ipndm_v",
-    "lcm",
-};
-
-// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
-const char* schedule_str[] = {
-    "default",
-    "discrete",
-    "karras",
-    "exponential",
-    "ays",
-    "gits",
-};
-
-sd_ctx_t* sd_c;
-
-sample_method_t sample_method;
-
-int load_model(char *model, char* options[], int threads, int diff) {
-    fprintf (stderr, "Loading model!\n");
-
-    char *stableDiffusionModel = "";
-    if (diff == 1 ) {
-        stableDiffusionModel = model;
-        model = "";
-    }
-
-    // decode options. Options are in form optname:optvale, or if booleans only optname.
-    char *clip_l_path  = "";
-    char *clip_g_path  = "";
-    char *t5xxl_path  = "";
-    char *vae_path  = "";
-    char *scheduler = "";
-    char *sampler = "";
-
-    // If options is not NULL, parse options
-    for (int i = 0; options[i] != NULL; i++) {
-        char *optname = strtok(options[i], ":");
-        char *optval = strtok(NULL, ":");
-        if (optval == NULL) {
-            optval = "true";
-        }
-
-        if (!strcmp(optname, "clip_l_path")) {
-            clip_l_path = optval;
-        }
-        if (!strcmp(optname, "clip_g_path")) {
-            clip_g_path = optval;
-        }
-        if (!strcmp(optname, "t5xxl_path")) {
-            t5xxl_path = optval;
-        }
-        if (!strcmp(optname, "vae_path")) {
-            vae_path = optval;
-        }
-        if (!strcmp(optname, "scheduler")) {
-            scheduler = optval;
-        }
-        if (!strcmp(optname, "sampler")) {
-            sampler = optval;
-        }
-    }
-
-    int sample_method_found = -1;
-    for (int m = 0; m < N_SAMPLE_METHODS; m++) {
-        if (!strcmp(sampler, sample_method_str[m])) {
-            sample_method_found = m;
-        }
-    }
-    if (sample_method_found == -1) {
-        fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
-        sample_method_found = EULER_A;
-    }
-    sample_method = (sample_method_t)sample_method_found;
-
-    int schedule_found            = -1;
-    for (int d = 0; d < N_SCHEDULES; d++) {
-        if (!strcmp(scheduler, schedule_str[d])) {
-            schedule_found = d;
-                fprintf (stderr, "Found scheduler: %s\n", scheduler);
-
-        }
-    }
-
-    if (schedule_found == -1) {
-        fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
-        schedule_found = DEFAULT;
-    }
-
-    schedule_t schedule = (schedule_t)schedule_found;
-    
-    fprintf (stderr, "Creating context\n");
-    sd_ctx_t* sd_ctx = new_sd_ctx(model,
-                                  clip_l_path,
-                                  clip_g_path,
-                                  t5xxl_path,
-                                  stableDiffusionModel,
-                                  vae_path,
-                                  "",
-                                  "",
-                                  "",
-                                  "",
-                                  "",
-                                  false,
-                                  false,
-                                  false,
-                                  threads,
-                                  SD_TYPE_COUNT,
-                                  STD_DEFAULT_RNG,
-                                  schedule,
-                                  false,
-                                  false,
-                                  false,
-                                  false);
-
-    if (sd_ctx == NULL) {
-        fprintf (stderr, "failed loading model (generic error)\n");
-        return 1;
-    }
-    fprintf (stderr, "Created context: OK\n");
-
-    sd_c = sd_ctx;
-
-    return 0;
-}
-
-int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) {
-
-    sd_image_t* results;
-
-    std::vector<int> skip_layers = {7, 8, 9};
-
-    fprintf (stderr, "Generating image\n");
-
-    results = txt2img(sd_c,
-                            text,
-                            negativeText,
-                            -1, //clip_skip
-                            cfg_scale, // sfg_scale
-                            3.5f,
-                            width,
-                            height,
-                            sample_method, 
-                            steps,
-                            seed,
-                            1,
-                            NULL,
-                            0.9f,
-                            20.f,
-                            false,
-                            "",
-                            skip_layers.data(),
-                            skip_layers.size(),
-                            0,
-                            0.01,
-                            0.2);
-
-    if (results == NULL) {
-        fprintf (stderr, "NO results\n");
-        return 1;
-    }
-
-    if (results[0].data == NULL) {
-        fprintf (stderr, "Results with no data\n");
-        return 1;
-    }
-
-    fprintf (stderr, "Writing PNG\n");
-
-    fprintf (stderr, "DST: %s\n", dst);
-    fprintf (stderr, "Width: %d\n", results[0].width);
-    fprintf (stderr, "Height: %d\n", results[0].height);
-    fprintf (stderr, "Channel: %d\n", results[0].channel);
-    fprintf (stderr, "Data: %p\n", results[0].data);
-
-    stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
-                       results[0].data, 0, NULL);
-    fprintf (stderr, "Saved resulting image to '%s'\n", dst);
-
-    // TODO: free results. Why does it crash?
-
-    free(results[0].data);
-    results[0].data = NULL;
-    free(results);
-    fprintf (stderr, "gen_image is done", dst);
-
-    return 0;
-}
-
-int unload() {
-    free_sd_ctx(sd_c);
-}
-
--- a/backend/go/image/stablediffusion-ggml/gosd.go
+++ b/backend/go/image/stablediffusion-ggml/gosd.go
@@ -1,96 +0,0 @@
-package main
-
-// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
-// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
-// #include <gosd.h>
-// #include <stdlib.h>
-import "C"
-
-import (
-	"fmt"
-	"os"
-	"path/filepath"
-	"strings"
-	"unsafe"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/utils"
-)
-
-type SDGGML struct {
-	base.SingleThread
-	threads      int
-	sampleMethod string
-	cfgScale     float32
-}
-
-func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
-
-	sd.threads = int(opts.Threads)
-
-	modelFile := C.CString(opts.ModelFile)
-	defer C.free(unsafe.Pointer(modelFile))
-
-	var options **C.char
-	// prepare the options array to pass to C
-
-	size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
-	length := C.size_t(len(opts.Options))
-	options = (**C.char)(C.malloc(length * size))
-	view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0:len(opts.Options):len(opts.Options)]
-
-	var diffusionModel int
-
-	var oo []string
-	for _, op := range opts.Options {
-		if op == "diffusion_model" {
-			diffusionModel = 1
-			continue
-		}
-
-		// If it's an option path, we resolve absolute path from the model path
-		if strings.Contains(op, ":") && strings.Contains(op, "path") {
-			data := strings.Split(op, ":")
-			data[1] = filepath.Join(opts.ModelPath, data[1])
-			if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
-				oo = append(oo, strings.Join(data, ":"))
-			}
-		} else {
-			oo = append(oo, op)
-		}
-	}
-
-	fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
-
-	for i, x := range oo {
-		view[i] = C.CString(x)
-	}
-
-	sd.cfgScale = opts.CFGScale
-
-	ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel))
-	if ret != 0 {
-		return fmt.Errorf("could not load model")
-	}
-
-	return nil
-}
-
-func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
-	t := C.CString(opts.PositivePrompt)
-	defer C.free(unsafe.Pointer(t))
-
-	dst := C.CString(opts.Dst)
-	defer C.free(unsafe.Pointer(dst))
-
-	negative := C.CString(opts.NegativePrompt)
-	defer C.free(unsafe.Pointer(negative))
-
-	ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale))
-	if ret != 0 {
-		return fmt.Errorf("inference failed")
-	}
-
-	return nil
-}
--- a/backend/go/image/stablediffusion-ggml/gosd.h
+++ b/backend/go/image/stablediffusion-ggml/gosd.h
@@ -1,8 +0,0 @@
-#ifdef __cplusplus
-extern "C" {
-#endif
-int load_model(char *model, char* options[], int threads, int diffusionModel);
-int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale);
-#ifdef __cplusplus
-}
-#endif
--- a/backend/go/image/stablediffusion-ggml/main.go
+++ b/backend/go/image/stablediffusion-ggml/main.go
@@ -1,6 +1,7 @@
 package main

 // Note: this is started internally by LocalAI and a server is allocated for each model
+
 import (
 	"flag"

@@ -14,7 +15,7 @@ var (
 func main() {
 	flag.Parse()

-	if err := grpc.StartServer(*addr, &SDGGML{}); err != nil {
+	if err := grpc.StartServer(*addr, &Image{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/image/stablediffusion/stablediffusion.go
+++ b/backend/go/image/stablediffusion/stablediffusion.go
@@ -0,0 +1,33 @@
+package main
+
+// This is a wrapper to satisfy the gRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/stablediffusion"
+)
+
+type Image struct {
+	base.SingleThread
+	stablediffusion *stablediffusion.StableDiffusion
+}
+
+func (image *Image) Load(opts *pb.ModelOptions) error {
+	var err error
+	// Note: the Model here is a path to a directory containing the model files
+	image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
+	return err
+}
+
+func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
+	return image.stablediffusion.GenerateImage(
+		int(opts.Height),
+		int(opts.Width),
+		int(opts.Mode),
+		int(opts.Step),
+		int(opts.Seed),
+		opts.PositivePrompt,
+		opts.NegativePrompt,
+		opts.Dst)
+}
--- a/backend/go/image/tinydream/main.go
+++ b/backend/go/image/tinydream/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &Image{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/image/tinydream/tinydream.go
+++ b/backend/go/image/tinydream/tinydream.go
@@ -0,0 +1,32 @@
+package main
+
+// This is a wrapper to satisfy the gRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/tinydream"
+)
+
+type Image struct {
+	base.SingleThread
+	tinydream *tinydream.TinyDream
+}
+
+func (image *Image) Load(opts *pb.ModelOptions) error {
+	var err error
+	// Note: the Model here is a path to a directory containing the model files
+	image.tinydream, err = tinydream.New(opts.ModelFile)
+	return err
+}
+
+func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
+	return image.tinydream.GenerateImage(
+		int(opts.Height),
+		int(opts.Width),
+		int(opts.Step),
+		int(opts.Seed),
+		opts.PositivePrompt,
+		opts.NegativePrompt,
+		opts.Dst)
+}
--- a/backend/go/llm/bert/bert.go
+++ b/backend/go/llm/bert/bert.go
@@ -0,0 +1,34 @@
+package main
+
+// This is a wrapper to satisfy the gRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	bert "github.com/go-skynet/go-bert.cpp"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+type Embeddings struct {
+	base.SingleThread
+	bert *bert.Bert
+}
+
+func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
+	model, err := bert.New(opts.ModelFile)
+	llm.bert = model
+	return err
+}
+
+func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+
+	if len(opts.EmbeddingTokens) > 0 {
+		tokens := []int{}
+		for _, t := range opts.EmbeddingTokens {
+			tokens = append(tokens, int(t))
+		}
+		return llm.bert.TokenEmbeddings(tokens, bert.SetThreads(int(opts.Threads)))
+	}
+
+	return llm.bert.Embeddings(opts.Embeddings, bert.SetThreads(int(opts.Threads)))
+}
--- a/backend/go/llm/bert/main.go
+++ b/backend/go/llm/bert/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/llm/gpt4all/gpt4all.go
+++ b/backend/go/llm/gpt4all/gpt4all.go
@@ -0,0 +1,62 @@
+package main
+
+// This is a wrapper to satisfy the gRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
+)
+
+type LLM struct {
+	base.SingleThread
+
+	gpt4all *gpt4all.Model
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	model, err := gpt4all.New(opts.ModelFile,
+		gpt4all.SetThreads(int(opts.Threads)),
+		gpt4all.SetLibrarySearchPath(opts.LibrarySearchPath))
+	llm.gpt4all = model
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []gpt4all.PredictOption {
+	predictOptions := []gpt4all.PredictOption{
+		gpt4all.SetTemperature(float64(opts.Temperature)),
+		gpt4all.SetTopP(float64(opts.TopP)),
+		gpt4all.SetTopK(int(opts.TopK)),
+		gpt4all.SetTokens(int(opts.Tokens)),
+	}
+
+	if opts.Batch != 0 {
+		predictOptions = append(predictOptions, gpt4all.SetBatch(int(opts.Batch)))
+	}
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.gpt4all.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	predictOptions := buildPredictOptions(opts)
+
+	go func() {
+		llm.gpt4all.SetTokenCallback(func(token string) bool {
+			results <- token
+			return true
+		})
+		_, err := llm.gpt4all.Predict(opts.Prompt, predictOptions...)
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		llm.gpt4all.SetTokenCallback(nil)
+		close(results)
+	}()
+
+	return nil
+}
--- a/backend/go/llm/gpt4all/main.go
+++ b/backend/go/llm/gpt4all/main.go
@@ -15,7 +15,7 @@ var (
 func main() {
 	flag.Parse()

-	if err := grpc.StartServer(*addr, &VAD{}); err != nil {
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/llm/llama-ggml/llama.go
+++ b/backend/go/llm/llama-ggml/llama.go
@@ -0,0 +1,204 @@
+package main
+
+// This is a wrapper to satisfy the gRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/go-llama.cpp"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+type LLM struct {
+	base.SingleThread
+
+	llama *llama.LLama
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+
+	llamaOpts := []llama.ModelOption{
+		llama.WithRopeFreqBase(ropeFreqBase),
+		llama.WithRopeFreqScale(ropeFreqScale),
+	}
+
+	if opts.NGQA != 0 {
+		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
+	}
+
+	if opts.RMSNormEps != 0 {
+		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
+	}
+
+	if opts.ContextSize != 0 {
+		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
+	}
+	if opts.F16Memory {
+		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
+	}
+	if opts.Embeddings {
+		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
+	}
+	if opts.NGPULayers != 0 {
+		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
+	}
+
+	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
+	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
+	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
+	if opts.NBatch != 0 {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
+	} else {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
+	}
+
+	if opts.NUMA {
+		llamaOpts = append(llamaOpts, llama.EnableNUMA)
+	}
+
+	if opts.LowVRAM {
+		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
+	}
+
+	model, err := llama.New(opts.ModelFile, llamaOpts...)
+	llm.llama = model
+
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+	predictOptions := []llama.PredictOption{
+		llama.SetTemperature(opts.Temperature),
+		llama.SetTopP(opts.TopP),
+		llama.SetTopK(int(opts.TopK)),
+		llama.SetTokens(int(opts.Tokens)),
+		llama.SetThreads(int(opts.Threads)),
+		llama.WithGrammar(opts.Grammar),
+		llama.SetRopeFreqBase(ropeFreqBase),
+		llama.SetRopeFreqScale(ropeFreqScale),
+		llama.SetNegativePromptScale(opts.NegativePromptScale),
+		llama.SetNegativePrompt(opts.NegativePrompt),
+	}
+
+	if opts.PromptCacheAll {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
+	}
+
+	if opts.PromptCacheRO {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
+	}
+
+	// Expected absolute path
+	if opts.PromptCachePath != "" {
+		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
+	}
+
+	if opts.Mirostat != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
+	}
+
+	if opts.MirostatETA != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
+	}
+
+	if opts.MirostatTAU != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
+	}
+
+	if opts.Debug {
+		predictOptions = append(predictOptions, llama.Debug)
+	}
+
+	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
+
+	if opts.PresencePenalty != 0 {
+		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
+	}
+
+	if opts.NKeep != 0 {
+		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
+	}
+
+	if opts.Batch != 0 {
+		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
+	}
+
+	if opts.F16KV {
+		predictOptions = append(predictOptions, llama.EnableF16KV)
+	}
+
+	if opts.IgnoreEOS {
+		predictOptions = append(predictOptions, llama.IgnoreEOS)
+	}
+
+	if opts.Seed != 0 {
+		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
+	}
+
+	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
+
+	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
+	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
+	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
+	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
+	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
+	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
+	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	predictOptions := buildPredictOptions(opts)
+
+	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
+		results <- token
+		return true
+	}))
+
+	go func() {
+		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		close(results)
+	}()
+
+	return nil
+}
+
+func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+	predictOptions := buildPredictOptions(opts)
+
+	if len(opts.EmbeddingTokens) > 0 {
+		tokens := []int{}
+		for _, t := range opts.EmbeddingTokens {
+			tokens = append(tokens, int(t))
+		}
+		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
+	}
+
+	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
+}
--- a/backend/go/llm/llama-ggml/main.go
+++ b/backend/go/llm/llama-ggml/main.go
@@ -0,0 +1,19 @@
+package main
+
+import (
+	"flag"
+
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/llm/rwkv/main.go
+++ b/backend/go/llm/rwkv/main.go
@@ -1,6 +1,7 @@
 package main

 // Note: this is started internally by LocalAI and a server is allocated for each model
+
 import (
 	"flag"

@@ -14,7 +15,7 @@ var (
 func main() {
 	flag.Parse()

-	if err := grpc.StartServer(*addr, &Bark{}); err != nil {
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
 		panic(err)
 	}
 }
--- a/backend/go/llm/rwkv/rwkv.go
+++ b/backend/go/llm/rwkv/rwkv.go
@@ -0,0 +1,95 @@
+package main
+
+// This is a wrapper to satisfy the gRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+	"path/filepath"
+
+	"github.com/donomii/go-rwkv.cpp"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+const tokenizerSuffix = ".tokenizer.json"
+
+type LLM struct {
+	base.SingleThread
+
+	rwkv *rwkv.RwkvState
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	tokenizerFile := opts.Tokenizer
+	if tokenizerFile == "" {
+		modelFile := filepath.Base(opts.ModelFile)
+		tokenizerFile = modelFile + tokenizerSuffix
+	}
+	modelPath := filepath.Dir(opts.ModelFile)
+	tokenizerPath := filepath.Join(modelPath, tokenizerFile)
+
+	model := rwkv.LoadFiles(opts.ModelFile, tokenizerPath, uint32(opts.GetThreads()))
+
+	if model == nil {
+		return fmt.Errorf("rwkv could not load model")
+	}
+	llm.rwkv = model
+	return nil
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	stopWord := "\n"
+	if len(opts.StopPrompts) > 0 {
+		stopWord = opts.StopPrompts[0]
+	}
+
+	if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
+		return "", err
+	}
+
+	response := llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), nil)
+
+	return response, nil
+}
+
+// PredictStream generates a response in a goroutine and sends each piece to results, closing the channel when done.
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	go func() {
+		defer close(results) // runs on the early error return too, so a receiver is never left waiting on an open channel
+		stopWord := "\n"
+		if len(opts.StopPrompts) > 0 {
+			stopWord = opts.StopPrompts[0]
+		}
+
+		if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
+			fmt.Println("Error processing input: ", err)
+			return
+		}
+
+		llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), func(s string) bool {
+			results <- s
+			return true
+		})
+	}()
+
+	return nil
+}
+
+func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
+	tokens, err := llm.rwkv.Tokenizer.Encode(opts.Prompt)
+	if err != nil {
+		return pb.TokenizationResponse{}, err
+	}
+
+	l := len(tokens)
+	i32Tokens := make([]int32, l)
+
+	for i, t := range tokens {
+		i32Tokens[i] = int32(t.ID)
+	}
+
+	return pb.TokenizationResponse{
+		Length: int32(l),
+		Tokens: i32Tokens,
+	}, nil
+}
--- a/backend/go/stores/store.go
+++ b/backend/go/stores/store.go
@@ -311,16 +311,12 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
 }

 func isNormalized(k []float32) bool {
 	var sum float64
-
 	for _, v := range k {
-		v64 := float64(v)
-		sum += v64*v64
+		sum += float64(v) * float64(v)
 	}

-	s := math.Sqrt(sum)
-
-	return s >= 0.99 && s <= 1.01
+	return sum >= 0.99*0.99 && sum <= 1.01*1.01 // squared L2 norm within 1% of unit length
 }

 // TODO: This we could replace with handwritten SIMD code
@@ -332,7 +328,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
 		dot += k1[i] * k2[i]
 	}

-	assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot))
+	assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot)) // 1% slack: float32 rounding can push the dot of unit vectors past 1

 	// 2.0 * (1.0 - dot) would be the Euclidean distance
 	return dot
@@ -422,7 +418,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {

 	sim := float32(dot / (mag1 * math.Sqrt(mag2)))

-	assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim))
+	assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))

 	return sim
 }
--- a/backend/go/transcribe/whisper/main.go
+++ b/backend/go/transcribe/whisper/main.go
--- a/backend/go/transcribe/transcript.go
+++ b/backend/go/transcribe/transcript.go
@@ -0,0 +1,104 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/go-audio/wav"
+	"github.com/mudler/LocalAI/core/schema"
+)
+
+func ffmpegCommand(args []string) (string, error) {
+	proc := exec.Command("ffmpeg", args...) // The binary name is a fixed literal so security scanners can see that only ffmpeg is ever launched.
+	proc.Env = os.Environ()
+	combined, err := proc.CombinedOutput() // stdout and stderr together
+	return string(combined), err
+}
+
+// audioToWav has ffmpeg re-encode src into dst as 16 kHz, mono, signed 16-bit little-endian PCM for transcription.
+// TODO: use https://github.com/mccoyst/ogg?
+func audioToWav(src, dst string) error {
+	output, err := ffmpegCommand([]string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst})
+	if err == nil {
+		return nil
+	}
+	// Attach whatever ffmpeg printed to the wrapped error so the caller can see why it failed.
+	return fmt.Errorf("error: %w out: %s", err, output)
+}
+
+// Transcript converts the audio at audiopath with ffmpeg, feeds the samples to a new context of model,
+// and returns the concatenated text together with every segment the context yields.
+func Transcript(model whisper.Model, audiopath, language string, translate bool, threads uint) (schema.TranscriptionResult, error) {
+	res := schema.TranscriptionResult{}
+
+	// Scratch directory for the converted file; removed again when the function returns.
+	dir, err := os.MkdirTemp("", "whisper")
+	if err != nil {
+		return res, err
+	}
+	defer os.RemoveAll(dir)
+
+	// Convert the input first, then read the converted copy back.
+	wavPath := filepath.Join(dir, "converted.wav")
+	if err := audioToWav(audiopath, wavPath); err != nil {
+		return res, err
+	}
+
+	wavFile, err := os.Open(wavPath)
+	if err != nil {
+		return res, err
+	}
+	defer wavFile.Close()
+
+	// Decode the whole file in one go and take its samples as float32.
+	pcm, err := wav.NewDecoder(wavFile).FullPCMBuffer()
+	if err != nil {
+		return res, err
+	}
+	samples := pcm.AsFloat32Buffer().Data
+
+	// Configure a fresh context before processing.
+	wctx, err := model.NewContext()
+	if err != nil {
+		return res, err
+	}
+	wctx.SetThreads(threads)
+
+	// An empty language falls back to "auto".
+	lang := language
+	if lang == "" {
+		lang = "auto"
+	}
+	wctx.SetLanguage(lang)
+
+	if translate {
+		wctx.SetTranslate(true)
+	}
+
+	if err := wctx.Process(samples, nil, nil); err != nil {
+		return res, err
+	}
+
+	// Drain segments; the first error from NextSegment is taken as the end of the stream.
+	for seg, segErr := wctx.NextSegment(); segErr == nil; seg, segErr = wctx.NextSegment() {
+		var ids []int
+		for _, tok := range seg.Tokens {
+			ids = append(ids, tok.Id)
+		}
+
+		res.Segments = append(res.Segments, schema.Segment{
+			Id:     seg.Num,
+			Text:   seg.Text,
+			Start:  seg.Start,
+			End:    seg.End,
+			Tokens: ids,
+		})
+
+		res.Text += seg.Text
+	}
+
+	return res, nil
+}
--- a/backend/go/transcribe/whisper.go
+++ b/backend/go/transcribe/whisper.go
@@ -0,0 +1,26 @@
+package main
+
+// This is a wrapper to satisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+type Whisper struct {
+	base.SingleThread
+	whisper whisper.Model // set by Load, read by AudioTranscription
+}
+
+// Load opens opts.ModelFile with whisper.New; the returned model is stored even when the error is non-nil.
+func (sd *Whisper) Load(opts *pb.ModelOptions) (err error) {
+	sd.whisper, err = whisper.New(opts.ModelFile)
+	return err
+}
+
+// AudioTranscription transcribes the audio at opts.Dst by handing the loaded model and request options to Transcript.
+func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
+	return Transcript(sd.whisper, opts.Dst, opts.Language, opts.Translate, uint(opts.Threads))
+}
--- a/backend/go/transcribe/whisper/whisper.go
+++ b/backend/go/transcribe/whisper/whisper.go
@@ -1,105 +0,0 @@
-package main
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"os"
-	"path/filepath"
-
-	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
-	"github.com/go-audio/wav"
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/mudler/LocalAI/pkg/utils"
-)
-
-type Whisper struct {
-	base.SingleThread
-	whisper whisper.Model
-}
-
-func (sd *Whisper) Load(opts *pb.ModelOptions) error {
-	// Note: the Model here is a path to a directory containing the model files
-	w, err := whisper.New(opts.ModelFile)
-	sd.whisper = w
-	return err
-}
-
-func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
-
-	dir, err := os.MkdirTemp("", "whisper")
-	if err != nil {
-		return pb.TranscriptResult{}, err
-	}
-	defer os.RemoveAll(dir)
-
-	convertedPath := filepath.Join(dir, "converted.wav")
-
-	if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil {
-		return pb.TranscriptResult{}, err
-	}
-
-	// Open samples
-	fh, err := os.Open(convertedPath)
-	if err != nil {
-		return pb.TranscriptResult{}, err
-	}
-	defer fh.Close()
-
-	// Read samples
-	d := wav.NewDecoder(fh)
-	buf, err := d.FullPCMBuffer()
-	if err != nil {
-		return pb.TranscriptResult{}, err
-	}
-
-	data := buf.AsFloat32Buffer().Data
-
-	// Process samples
-	context, err := sd.whisper.NewContext()
-	if err != nil {
-		return pb.TranscriptResult{}, err
-
-	}
-
-	context.SetThreads(uint(opts.Threads))
-
-	if opts.Language != "" {
-		context.SetLanguage(opts.Language)
-	} else {
-		context.SetLanguage("auto")
-	}
-
-	if opts.Translate {
-		context.SetTranslate(true)
-	}
-
-	if err := context.Process(data, nil, nil); err != nil {
-		return pb.TranscriptResult{}, err
-	}
-
-	segments := []*pb.TranscriptSegment{}
-	text := ""
-	for {
-		s, err := context.NextSegment()
-		if err != nil {
-			break
-		}
-
-		var tokens []int32
-		for _, t := range s.Tokens {
-			tokens = append(tokens, int32(t.Id))
-		}
-
-		segment := &pb.TranscriptSegment{Id: int32(s.Num), Text: s.Text, Start: int64(s.Start), End: int64(s.End), Tokens: tokens}
-		segments = append(segments, segment)
-
-		text += s.Text
-	}
-
-	return pb.TranscriptResult{
-		Segments: segments,
-		Text:     text,
-	}, nil
-
-}
--- a/backend/go/vad/silero/vad.go
+++ b/backend/go/vad/silero/vad.go
@@ -1,54 +0,0 @@
-package main
-
-// This is a wrapper to statisfy the GRPC service interface
-// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
-import (
-	"fmt"
-
-	"github.com/mudler/LocalAI/pkg/grpc/base"
-	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
-	"github.com/streamer45/silero-vad-go/speech"
-)
-
-type VAD struct {
-	base.SingleThread
-	detector *speech.Detector
-}
-
-func (vad *VAD) Load(opts *pb.ModelOptions) error {
-	v, err := speech.NewDetector(speech.DetectorConfig{
-		ModelPath:  opts.ModelFile,
-		SampleRate: 16000,
-		//WindowSize:           1024,
-		Threshold:            0.5,
-		MinSilenceDurationMs: 0,
-		SpeechPadMs:          0,
-	})
-	if err != nil {
-		return fmt.Errorf("create silero detector: %w", err)
-	}
-
-	vad.detector = v
-	return err
-}
-
-func (vad *VAD) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
-	audio := req.Audio
-
-	segments, err := vad.detector.Detect(audio)
-	if err != nil {
-		return pb.VADResponse{}, fmt.Errorf("detect: %w", err)
-	}
-
-	vadSegments := []*pb.VADSegment{}
-	for _, s := range segments {
-		vadSegments = append(vadSegments, &pb.VADSegment{
-			Start: float32(s.SpeechStartAt),
-			End:   float32(s.SpeechEndAt),
-		})
-	}
-
-	return pb.VADResponse{
-		Segments: vadSegments,
-	}, nil
-}
--- a/backend/python/autogptq/requirements-cublas11.txt
+++ b/backend/python/autogptq/requirements-cublas11.txt
@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.4.1+cu118
--- a/backend/python/autogptq/requirements-cublas12.txt
+++ b/backend/python/autogptq/requirements-cublas12.txt
@@ -1 +0,0 @@
-torch==2.4.1
--- a/backend/python/autogptq/requirements-hipblas.txt
+++ b/backend/python/autogptq/requirements-hipblas.txt
@@ -1,2 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.4.1+rocm6.0
+torch
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -1,6 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
-torch==2.3.1+cxx11.abi
-oneccl_bind_pt==2.3.100+xpu
+intel-extension-for-pytorch
+torch
 optimum[openvino]
-setuptools
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,7 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.70.0
+grpcio==1.65.1
 protobuf
+torch
 certifi
 transformers
--- a/backend/python/bark/requirements-cpu.txt
+++ b/backend/python/bark/requirements-cpu.txt
@@ -1,4 +0,0 @@
-transformers
-accelerate
-torch==2.4.1
-torchaudio==2.4.1
--- a/backend/python/bark/requirements-cublas11.txt
+++ b/backend/python/bark/requirements-cublas11.txt
@@ -1,5 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.4.1+cu118
-torchaudio==2.4.1+cu118
-transformers
-accelerate
--- a/backend/python/bark/requirements-cublas12.txt
+++ b/backend/python/bark/requirements-cublas12.txt
@@ -1,4 +0,0 @@
-torch==2.4.1
-torchaudio==2.4.1
-transformers
-accelerate
--- a/backend/python/bark/requirements-hipblas.txt
+++ b/backend/python/bark/requirements-hipblas.txt
@@ -1,5 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.4.1+rocm6.0
-torchaudio==2.4.1+rocm6.0
-transformers
-accelerate
+torch
+torchaudio
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -1,9 +1,6 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
-torch==2.3.1+cxx11.abi
-torchaudio==2.3.1+cxx11.abi
-oneccl_bind_pt==2.3.100+xpu
+intel-extension-for-pytorch
+torch
+torchaudio
 optimum[openvino]
-setuptools
-transformers
-accelerate
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,6 @@
+accelerate
 bark==0.1.5
-grpcio==1.70.0
+grpcio==1.65.1
 protobuf
-certifi
+certifi
+transformers
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -17,27 +17,11 @@
 # LIMIT_TARGETS="cublas12"
 # source $(dirname $0)/../common/libbackend.sh
 #
-
-PYTHON_VERSION="3.10"
-
 function init() {
-    # Name of the backend (directory name)
    BACKEND_NAME=${PWD##*/}
-
-    # Path where all backends files are
    MY_DIR=$(realpath `dirname $0`)
-
-    # Build type
    BUILD_PROFILE=$(getBuildProfile)

-    # Environment directory
-    EDIR=${MY_DIR}
-
-    # Allow to specify a custom env dir for shared environments
-    if [ "x${ENV_DIR}" != "x" ]; then
-        EDIR=${ENV_DIR}
-    fi
-
    # If a backend has defined a list of valid build profiles...
    if [ ! -z "${LIMIT_TARGETS}" ]; then
        isValidTarget=$(checkTargets ${LIMIT_TARGETS})
@@ -90,14 +74,13 @@ function getBuildProfile() {
 # This function is idempotent, so you can call it as many times as you want and it will
 # always result in an activated virtual environment
 function ensureVenv() {
-    if [ ! -d "${EDIR}/venv" ]; then
-        uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+    if [ ! -d "${MY_DIR}/venv" ]; then
+        uv venv "${MY_DIR}/venv"
         echo "virtualenv created"
     fi
-
-    # Source if we are not already in a Virtual env
-    if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
-        source ${EDIR}/venv/bin/activate
+    # Activate unless this shell is already inside that venv (paths quoted so spaces in MY_DIR do not split)
+    if [ "x${VIRTUAL_ENV}" != "x${MY_DIR}/venv" ]; then
+        source "${MY_DIR}/venv/bin/activate"
         echo "virtualenv activated"
     fi

@@ -130,24 +113,13 @@ function installRequirements() {

    # These are the requirements files we will attempt to install, in order
    declare -a requirementFiles=(
-        "${EDIR}/requirements-install.txt"
-        "${EDIR}/requirements.txt"
-        "${EDIR}/requirements-${BUILD_TYPE}.txt"
+        "${MY_DIR}/requirements-install.txt"
+        "${MY_DIR}/requirements.txt"
+        "${MY_DIR}/requirements-${BUILD_TYPE}.txt"
    )

    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
-        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
-    fi
-
-    # if BUILD_TYPE is empty, we are a CPU build, so we should try to install the CPU requirements
-    if [ "x${BUILD_TYPE}" == "x" ]; then
-        requirementFiles+=("${EDIR}/requirements-cpu.txt")
-    fi
-
-    requirementFiles+=("${EDIR}/requirements-after.txt")
-
-    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
-        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
+        requirementFiles+=("${MY_DIR}/requirements-${BUILD_PROFILE}.txt")
    fi

    for reqFile in ${requirementFiles[@]}; do
--- a/backend/python/common/template/Makefile
+++ b/backend/python/common/template/Makefile
@@ -1,9 +1,8 @@
 .DEFAULT_GOAL := install

 .PHONY: install
-install:
+install: protogen
 	bash install.sh
-	$(MAKE) protogen

 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
@@ -13,7 +12,7 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py

 backend_pb2_grpc.py backend_pb2.py:
-	bash protogen.sh
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto

 .PHONY: clean
 clean: protogen-clean
--- a/backend/python/common/template/protogen.sh
+++ b/backend/python/common/template/protogen.sh
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-
-source $(dirname $0)/../common/libbackend.sh
-
-python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
--- a/backend/python/common/template/requirements-intel.txt
+++ b/backend/python/common/template/requirements-intel.txt
@@ -1,5 +1,4 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
-torch==2.3.1+cxx11.abi
-oneccl_bind_pt==2.3.100+xpu
+intel-extension-for-pytorch
+torch
 optimum[openvino]
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,2 @@
-grpcio==1.70.0
-protobuf
-grpcio-tools
+grpcio==1.65.1
+protobuf
--- a/backend/python/coqui/requirements-cpu.txt
+++ b/backend/python/coqui/requirements-cpu.txt
@@ -1,4 +0,0 @@
-transformers
-accelerate
-torch==2.4.1
-coqui-tts
--- a/backend/python/coqui/requirements-cublas11.txt
+++ b/backend/python/coqui/requirements-cublas11.txt
@@ -1,6 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch==2.4.1+cu118
-torchaudio==2.4.1+cu118
-transformers
-accelerate
-coqui-tts
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Ettore Di Giacinto	59bfc67ead	workaround upstream issue Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2024-07-24 11:02:58 +02:00
Ettore Di Giacinto	f80b6dfc2d	test Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2024-07-24 10:23:41 +02:00
Ettore Di Giacinto	4c16957448	deps(llama.cpp): bump, try to fix sycl linking issues Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2024-07-24 08:59:18 +02:00