Compare commits

..

1 Commit

Author SHA1 Message Date
Ettore Di Giacinto
95f773ee4b experiment: build with a single image with all the deps
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-07-01 19:43:18 +02:00
650 changed files with 17337 additions and 29101 deletions

View File

@@ -1,23 +0,0 @@
meta {
  name: musicgen
  type: http
  seq: 1
}

post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/sound-generation
  body: json
  auth: none
}

headers {
  Content-Type: application/json
}

body:json {
  {
    "model_id": "facebook/musicgen-small",
    "text": "Exciting 80s Newscast Interstitial",
    "duration_seconds": 8
  }
}
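
For reference, a minimal curl sketch of the same request (the host and port are assumptions — adjust to wherever your LocalAI instance listens; the response body is assumed to be the generated audio):

```bash
# POST a sound-generation request mirroring the Bruno entry above
curl -s http://localhost:8080/v1/sound-generation \
  -H "Content-Type: application/json" \
  -d '{
        "model_id": "facebook/musicgen-small",
        "text": "Exciting 80s Newscast Interstitial",
        "duration_seconds": 8
      }' \
  --output generated.wav
```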

View File

@@ -1,11 +0,0 @@
meta {
  name: model delete
  type: http
  seq: 7
}

post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: none
  auth: none
}

View File

Binary file not shown.

View File

@@ -1,16 +0,0 @@
meta {
  name: transcribe
  type: http
  seq: 1
}

post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/audio/transcriptions
  body: multipartForm
  auth: none
}

body:multipart-form {
  file: @file(transcription/gb1.ogg)
  model: whisper-1
}
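
The equivalent multipart request as a curl sketch (host, port, and the sample audio path are assumptions; curl's -F sets the multipart content type and boundary automatically):

```bash
# Upload an audio file for transcription, mirroring the Bruno entry above
curl -s http://localhost:8080/v1/audio/transcriptions \
  -F file="@transcription/gb1.ogg" \
  -F model="whisper-1"
```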

View File

@@ -1,17 +0,0 @@
#!/bin/bash

cd /workspace

# Get the files into the volume without a bind mount
if [ ! -d ".git" ]; then
    git clone https://github.com/mudler/LocalAI.git .
else
    git fetch
fi

echo "Standard Post-Create script completed."

if [ -f "/devcontainer-customization/postcreate.sh" ]; then
    echo "Launching customization postcreate.sh"
    bash "/devcontainer-customization/postcreate.sh"
fi

View File

@@ -1,16 +0,0 @@
#!/bin/bash

cd /workspace

# Grab the pre-stashed backend assets to avoid build issues
cp -r /build/backend-assets /workspace/backend-assets

# Ensures generated source files are present upon load
make prepare

echo "Standard Post-Start script completed."

if [ -f "/devcontainer-customization/poststart.sh" ]; then
    echo "Launching customization poststart.sh"
    bash "/devcontainer-customization/poststart.sh"
fi

View File

@@ -1,55 +0,0 @@
#!/bin/bash
# This file contains some really simple functions that are useful when building up customization scripts.
# Checks if the git config has a user registered - and sets it up if not.
#
# Param 1: name
# Param 2: email
#
config_user() {
    echo "Configuring git for $1 <$2>"
    local gcn=$(git config --global user.name)
    if [ -z "${gcn}" ]; then
        echo "Setting up git user / remote"
        git config --global user.name "$1"
        git config --global user.email "$2"
    fi
}

# Checks if the git remote is configured - and sets it up if not. Fetches either way.
#
# Param 1: remote name
# Param 2: remote url
#
config_remote() {
    echo "Adding git remote and fetching $2 as $1"
    local gr=$(git remote -v | grep $1)
    if [ -z "${gr}" ]; then
        git remote add $1 $2
    fi
    git fetch $1
}

# Setup special .ssh files
# Prints out lines of text to make things pretty
# Param 1: bash array, filenames relative to the customization directory that should be copied to ~/.ssh
setup_ssh() {
    echo "starting ~/.ssh directory setup..."
    mkdir -p "${HOME}/.ssh"
    chmod 0700 "${HOME}/.ssh"
    echo "-----"
    local files=("$@")
    for file in "${files[@]}" ; do
        local cfile="/devcontainer-customization/${file}"
        local hfile="${HOME}/.ssh/${file}"
        if [ ! -f "${hfile}" ]; then
            echo "copying \"${file}\""
            cp "${cfile}" "${hfile}"
            chmod 600 "${hfile}"
        fi
    done
    echo "~/.ssh directory setup complete!"
}

View File

@@ -1,25 +0,0 @@
Place any additional resources your environment requires in this directory.

Script hooks are currently called for:
`postcreate.sh` and `poststart.sh`

If files with those names exist here, they will be called at the end of the normal script.

This is a good place to set things like `git config --global user.name` - and to handle any other files that are mounted via this directory.

To assist in doing so, `source /.devcontainer-scripts/utils.sh` will provide utility functions that may be useful - for example:
```
#!/bin/bash
source "/.devcontainer-scripts/utils.sh"
sshfiles=("config" "key.pub")
setup_ssh "${sshfiles[@]}"
config_user "YOUR NAME" "YOUR EMAIL"
config_remote "REMOTE NAME" "REMOTE URL"
```

View File

@@ -1,24 +0,0 @@
{
    "$schema": "https://raw.githubusercontent.com/devcontainers/spec/main/schemas/devContainer.schema.json",
    "name": "LocalAI",
    "workspaceFolder": "/workspace",
    "dockerComposeFile": [ "./docker-compose-devcontainer.yml" ],
    "service": "api",
    "shutdownAction": "stopCompose",
    "customizations": {
        "vscode": {
            "extensions": [
                "golang.go",
                "ms-vscode.makefile-tools",
                "ms-azuretools.vscode-docker",
                "ms-python.python",
                "ms-python.debugpy",
                "wayou.vscode-todo-highlight",
                "waderyan.gitblame"
            ]
        }
    },
    "forwardPorts": [8080, 3000],
    "postCreateCommand": "bash /.devcontainer-scripts/postcreate.sh",
    "postStartCommand": "bash /.devcontainer-scripts/poststart.sh"
}

View File

@@ -1,48 +0,0 @@
services:
  api:
    build:
      context: ..
      dockerfile: Dockerfile
      target: devcontainer
      args:
        - FFMPEG=true
        - IMAGE_TYPE=extras
        - GO_TAGS=p2p tts
    env_file:
      - ../.env
    ports:
      - 8080:8080
    volumes:
      - localai_workspace:/workspace
      - ../models:/host-models
      - ./customization:/devcontainer-customization
    command: /bin/sh -c "while sleep 1000; do :; done"
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp:unconfined
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - 9090:9090
    restart: unless-stopped
    volumes:
      - ./prometheus:/etc/prometheus
      - prom_data:/prometheus
  grafana:
    image: grafana/grafana
    container_name: grafana
    ports:
      - 3000:3000
    restart: unless-stopped
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=grafana
    volumes:
      - ./grafana:/etc/grafana/provisioning/datasources
volumes:
  prom_data:
  localai_workspace:
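
Outside of the devcontainer workflow, a sketch of bringing this stack up directly with Docker Compose (assuming the compose file lives under .devcontainer/, as the devcontainer.json reference above suggests; run from the repository root):

```bash
# Start the api, prometheus, and grafana services in the background
docker compose -f .devcontainer/docker-compose-devcontainer.yml up -d

# Tail the API container logs
docker compose -f .devcontainer/docker-compose-devcontainer.yml logs -f api
```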

View File

@@ -1,10 +0,0 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    isDefault: true
    access: proxy
    editable: true

View File

@@ -1,21 +0,0 @@
global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
        - targets: []
      scheme: http
      timeout: 10s
      api_version: v1
scrape_configs:
  - job_name: prometheus
    honor_timestamps: true
    scrape_interval: 15s
    scrape_timeout: 10s
    metrics_path: /metrics
    scheme: http
    static_configs:
      - targets:
          - localhost:9090
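
A quick way to sanity-check this scrape configuration before mounting it into the container (assuming the promtool binary that ships with Prometheus is available, and the path matches the compose volume above):

```bash
# Validate the Prometheus configuration file
promtool check config .devcontainer/prometheus/prometheus.yml
```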

View File

@@ -1,7 +1,6 @@
 .idea
 .github
 .vscode
-.devcontainer
 models
 examples/chatbot-ui/models
 examples/rwkv/models

.env
View File

@@ -38,12 +38,12 @@
 ## Uncomment and set to true to enable rebuilding from source
 # REBUILD=true

-## Enable go tags, available: p2p, tts
-## p2p: enable distributed inferencing
+## Enable go tags, available: stablediffusion, tts
+## stablediffusion: image generation with stablediffusion
 ## tts: enables text-to-speech with go-piper
 ## (requires REBUILD=true)
 #
-# GO_TAGS=p2p
+# GO_TAGS=stablediffusion

 ## Path where to store generated images
 # LOCALAI_IMAGE_PATH=/tmp/generated/images
@@ -79,18 +79,6 @@
 ### Enable to run parallel requests
 # LOCALAI_PARALLEL_REQUESTS=true

-# Enable to allow p2p mode
-# LOCALAI_P2P=true
-
-# Enable to use federated mode
-# LOCALAI_FEDERATED=true
-
-# Enable to start federation server
-# FEDERATED_SERVER=true
-
-# Define to use federation token
-# TOKEN=""
-
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time

.gitattributes vendored
View File

@@ -1,2 +1 @@
 *.sh text eol=lf
-backend/cpp/llama/*.hpp linguist-vendored

.github/bump_deps.sh vendored
View File

@@ -6,17 +6,4 @@ VAR=$3
 LAST_COMMIT=$(curl -s -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")

-# Read $VAR from Makefile (only first match)
-set +e
-CURRENT_COMMIT="$(grep -m1 "^$VAR?=" Makefile | cut -d'=' -f2)"
-set -e
-
 sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
-
-if [ -z "$CURRENT_COMMIT" ]; then
-    echo "Could not find $VAR in Makefile."
-    exit 0
-fi
-
-echo "Changes: https://github.com/$REPO/compare/${CURRENT_COMMIT}..${LAST_COMMIT}" >> "${VAR}_message.txt"
-echo "${LAST_COMMIT}" >> "${VAR}_commit.txt"

View File

@@ -1,85 +0,0 @@
import hashlib
from huggingface_hub import hf_hub_download, get_paths_info
import requests
import sys
import os

uri = sys.argv[1]
file_name = uri.split('/')[-1]

# Function to parse the URI and determine download method
def parse_uri(uri):
    if uri.startswith('huggingface://'):
        repo_id = uri.split('://')[1]
        return 'huggingface', repo_id.rsplit('/', 1)[0]
    elif 'huggingface.co' in uri:
        parts = uri.split('/resolve/')
        if len(parts) > 1:
            repo_path = parts[0].split('https://huggingface.co/')[-1]
            return 'huggingface', repo_path
    return 'direct', uri

def calculate_sha256(file_path):
    sha256_hash = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for byte_block in iter(lambda: f.read(4096), b''):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

def manual_safety_check_hf(repo_id):
    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
    scan = scanResponse.json()
    # Check if 'hasUnsafeFile' exists in the response
    if 'hasUnsafeFile' in scan:
        if scan['hasUnsafeFile']:
            return scan
        else:
            return None
    else:
        return None

download_type, repo_id_or_url = parse_uri(uri)

new_checksum = None
file_path = None

# Decide download method based on URI type
if download_type == 'huggingface':
    # Check if the repo is flagged as dangerous by HF
    hazard = manual_safety_check_hf(repo_id_or_url)
    if hazard != None:
        print(f'Error: HuggingFace has detected security problems for {repo_id_or_url}: {str(hazard)}', file=sys.stderr)
        sys.exit(5)
    # Use HF API to pull sha
    for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
        try:
            new_checksum = file.lfs.sha256
            break
        except Exception as e:
            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
            sys.exit(2)
    if new_checksum is None:
        try:
            file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
        except Exception as e:
            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
            sys.exit(2)
else:
    response = requests.get(repo_id_or_url)
    if response.status_code == 200:
        with open(file_name, 'wb') as f:
            f.write(response.content)
        file_path = file_name
    elif response.status_code == 404:
        print(f'File not found: {response.status_code}', file=sys.stderr)
        sys.exit(2)
    else:
        print(f'Error downloading file: {response.status_code}', file=sys.stderr)
        sys.exit(1)

if new_checksum is None:
    new_checksum = calculate_sha256(file_path)
    print(new_checksum)
    os.remove(file_path)
else:
    print(new_checksum)
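
A sketch of invoking the script by hand (the model URI below is a placeholder; the script prints the sha256 on success, and exits 2 for a missing file or Hub error and 5 for a repository flagged as unsafe):

```bash
# Print the sha256 checksum for a gallery file URI, or exit non-zero on failure
python3 .github/check_and_update.py "huggingface://some-org/some-repo/some-model.gguf"
```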

View File

@@ -14,14 +14,77 @@ function check_and_update_checksum() {
     idx="$5"

     # Download the file and calculate new checksum using Python
-    new_checksum=$(python3 ./.github/check_and_update.py $uri)
-    result=$?
-
-    if [[ $result -eq 5 ]]; then
-        echo "Contaminated entry detected, deleting entry for $model_name..."
-        yq eval -i "del([$idx])" "$input_yaml"
-        return
-    fi
+    new_checksum=$(python3 -c "
+import hashlib
+from huggingface_hub import hf_hub_download, get_paths_info
+import requests
+import sys
+import os
+
+uri = '$uri'
+file_name = uri.split('/')[-1]
+
+# Function to parse the URI and determine download method
+def parse_uri(uri):
+    if uri.startswith('huggingface://'):
+        repo_id = uri.split('://')[1]
+        return 'huggingface', repo_id.rsplit('/', 1)[0]
+    elif 'huggingface.co' in uri:
+        parts = uri.split('/resolve/')
+        if len(parts) > 1:
+            repo_path = parts[0].split('https://huggingface.co/')[-1]
+            return 'huggingface', repo_path
+    return 'direct', uri
+
+def calculate_sha256(file_path):
+    sha256_hash = hashlib.sha256()
+    with open(file_path, 'rb') as f:
+        for byte_block in iter(lambda: f.read(4096), b''):
+            sha256_hash.update(byte_block)
+    return sha256_hash.hexdigest()
+
+download_type, repo_id_or_url = parse_uri(uri)
+
+new_checksum = None
+
+# Decide download method based on URI type
+if download_type == 'huggingface':
+    # Use HF API to pull sha
+    for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
+        try:
+            new_checksum = file.lfs.sha256
+            break
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+    if new_checksum is None:
+        try:
+            file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+else:
+    response = requests.get(repo_id_or_url)
+    if response.status_code == 200:
+        with open(file_name, 'wb') as f:
+            f.write(response.content)
+        file_path = file_name
+    elif response.status_code == 404:
+        print(f'File not found: {response.status_code}', file=sys.stderr)
+        sys.exit(2)
+    else:
+        print(f'Error downloading file: {response.status_code}', file=sys.stderr)
+        sys.exit(1)
+
+if new_checksum is None:
+    new_checksum = calculate_sha256(file_path)
+    print(new_checksum)
+    os.remove(file_path)
+else:
+    print(new_checksum)
+")

     if [[ "$new_checksum" == "" ]]; then
         echo "Error calculating checksum for $file_name. Skipping..."
@@ -31,7 +94,7 @@ function check_and_update_checksum() {
     echo "Checksum for $file_name: $new_checksum"

     # Compare and update the YAML file if checksums do not match
-
+    result=$?
     if [[ $result -eq 2 ]]; then
         echo "File not found, deleting entry for $file_name..."
         # yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\"))" "$input_yaml"

View File

@@ -6,7 +6,6 @@ import (
"io/ioutil" "io/ioutil"
"os" "os"
"github.com/microcosm-cc/bluemonday"
"gopkg.in/yaml.v3" "gopkg.in/yaml.v3"
) )
@@ -280,12 +279,6 @@ func main() {
return return
} }
// Ensure that all arbitrary text content is sanitized before display
for i, m := range models {
models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
}
// render the template // render the template
data := struct { data := struct {
Models []*GalleryModel Models []*GalleryModel

.github/dependabot.yml vendored
View File

@@ -1,16 +1,10 @@
 # https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
 version: 2
 updates:
-  - package-ecosystem: "gitsubmodule"
-    directory: "/"
-    schedule:
-      interval: "weekly"
   - package-ecosystem: "gomod"
     directory: "/"
     schedule:
       interval: "weekly"
-    ignore:
-      - dependency-name: "github.com/mudler/LocalAI/pkg/grpc/proto"
   - package-ecosystem: "github-actions"
     # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
     directory: "/"
@@ -29,99 +23,3 @@ updates:
     schedule:
       # Check for updates to GitHub Actions every weekday
       interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/autogptq"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/bark"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/common/template"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/coqui"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/diffusers"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/exllama"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/exllama2"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/mamba"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/openvoice"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/parler-tts"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/rerankers"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/sentencetransformers"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/transformers"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/vllm"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/chainlit"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/functions"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/langchain/langchainpy-localai-example"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/langchain-chroma"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/streamlit-bot"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "docker"
-    directory: "/examples/k8sgpt"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "docker"
-    directory: "/examples/kubernetes"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "docker"
-    directory: "/examples/langchain"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "gomod"
-    directory: "/examples/semantic-todo"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "docker"
-    directory: "/examples/telegram-bot"
-    schedule:
-      interval: "weekly"

.github/labeler.yml vendored
View File

@@ -1,15 +1,6 @@
 enhancements:
   - head-branch: ['^feature', 'feature']
-dependencies:
-  - any:
-    - changed-files:
-      - any-glob-to-any-file: 'Makefile'
-    - changed-files:
-      - any-glob-to-any-file: '*.mod'
-    - changed-files:
-      - any-glob-to-any-file: '*.sum'
 kind/documentation:
   - any:
     - changed-files:

.github/release.yml vendored
View File

@@ -13,9 +13,6 @@ changelog:
       labels:
         - bug
         - regression
-  - title: "🖧 P2P area"
-    labels:
-      - area/p2p
   - title: Exciting New Features 🎉
     labels:
       - Semver-Minor

View File

@@ -9,17 +9,32 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - repository: "go-skynet/go-llama.cpp"
+            variable: "GOLLAMA_VERSION"
+            branch: "master"
           - repository: "ggerganov/llama.cpp"
             variable: "CPPLLAMA_VERSION"
             branch: "master"
+          - repository: "go-skynet/go-ggml-transformers.cpp"
+            variable: "GOGGMLTRANSFORMERS_VERSION"
+            branch: "master"
+          - repository: "donomii/go-rwkv.cpp"
+            variable: "RWKV_VERSION"
+            branch: "main"
           - repository: "ggerganov/whisper.cpp"
             variable: "WHISPER_CPP_VERSION"
             branch: "master"
-          - repository: "PABannier/bark.cpp"
-            variable: "BARKCPP_VERSION"
-            branch: "main"
-          - repository: "leejet/stable-diffusion.cpp"
-            variable: "STABLEDIFFUSION_GGML_VERSION"
-            branch: "master"
+          - repository: "go-skynet/go-bert.cpp"
+            variable: "BERT_VERSION"
+            branch: "master"
+          - repository: "go-skynet/bloomz.cpp"
+            variable: "BLOOMZ_VERSION"
+            branch: "main"
+          - repository: "nomic-ai/gpt4all"
+            variable: "GPT4ALL_VERSION"
+            branch: "main"
+          - repository: "mudler/go-ggllm.cpp"
+            variable: "GOGGLLM_VERSION"
+            branch: "master"
           - repository: "mudler/go-stable-diffusion"
             variable: "STABLEDIFFUSION_VERSION"
@@ -31,30 +46,17 @@ jobs:
     steps:
     - uses: actions/checkout@v4
     - name: Bump dependencies 🔧
-      id: bump
       run: |
         bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
-        {
-          echo 'message<<EOF'
-          cat "${{ matrix.variable }}_message.txt"
-          echo EOF
-        } >> "$GITHUB_OUTPUT"
-        {
-          echo 'commit<<EOF'
-          cat "${{ matrix.variable }}_commit.txt"
-          echo EOF
-        } >> "$GITHUB_OUTPUT"
-        rm -rfv ${{ matrix.variable }}_message.txt
-        rm -rfv ${{ matrix.variable }}_commit.txt
     - name: Create Pull Request
-      uses: peter-evans/create-pull-request@v7
+      uses: peter-evans/create-pull-request@v6
       with:
         token: ${{ secrets.UPDATE_BOT_TOKEN }}
         push-to-fork: ci-forks/LocalAI
         commit-message: ':arrow_up: Update ${{ matrix.repository }}'
-        title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
+        title: ':arrow_up: Update ${{ matrix.repository }}'
         branch: "update/${{ matrix.variable }}"
-        body: ${{ steps.bump.outputs.message }}
+        body: Bump of ${{ matrix.repository }} version
         signoff: true

View File

@@ -17,12 +17,12 @@ jobs:
       run: |
         bash .github/bump_docs.sh ${{ matrix.repository }}
     - name: Create Pull Request
-      uses: peter-evans/create-pull-request@v7
+      uses: peter-evans/create-pull-request@v6
       with:
         token: ${{ secrets.UPDATE_BOT_TOKEN }}
         push-to-fork: ci-forks/LocalAI
         commit-message: ':arrow_up: Update docs version ${{ matrix.repository }}'
-        title: 'docs: :arrow_up: update docs version ${{ matrix.repository }}'
+        title: ':arrow_up: Update docs version ${{ matrix.repository }}'
         branch: "update/docs"
         body: Bump of ${{ matrix.repository }} version inside docs
         signoff: true

View File

@@ -23,9 +23,9 @@ jobs:
         sudo pip install --upgrade pip
         pip install huggingface_hub
     - name: 'Setup yq'
-      uses: dcarbone/install-yq-action@v1.3.1
+      uses: dcarbone/install-yq-action@v1.1.1
       with:
-        version: 'v4.44.2'
+        version: 'v4.43.1'
         download-compressed: true
         force: true
@@ -36,12 +36,12 @@ jobs:
         sudo chmod 777 /hf_cache
         bash .github/checksum_checker.sh gallery/index.yaml
     - name: Create Pull Request
-      uses: peter-evans/create-pull-request@v7
+      uses: peter-evans/create-pull-request@v6
       with:
         token: ${{ secrets.UPDATE_BOT_TOKEN }}
         push-to-fork: ci-forks/LocalAI
         commit-message: ':arrow_up: Checksum updates in gallery/index.yaml'
-        title: 'chore(model-gallery): :arrow_up: update checksum'
+        title: 'models(gallery): :arrow_up: update checksum'
         branch: "update/checksum"
         body: Updating checksums in gallery/index.yaml
         signoff: true

View File

@@ -14,7 +14,7 @@ jobs:
     steps:
     - name: Dependabot metadata
       id: metadata
-      uses: dependabot/fetch-metadata@v2.2.0
+      uses: dependabot/fetch-metadata@v2.1.0
       with:
         github-token: "${{ secrets.GITHUB_TOKEN }}"
         skip-commit-verification: true

View File

@@ -1,64 +0,0 @@
name: Explorer deployment

on:
  push:
    branches:
      - master
    tags:
      - 'v*'

concurrency:
  group: ci-deploy-${{ github.head_ref || github.ref }}-${{ github.repository }}

jobs:
  build-linux:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
        uses: actions/checkout@v4
        with:
          submodules: true
      - uses: actions/setup-go@v5
        with:
          go-version: '1.21.x'
          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
          make protogen-go
      - name: Build api
        run: |
          CGO_ENABLED=0 make build-api
      - name: rm
        uses: appleboy/ssh-action@v1.2.0
        with:
          host: ${{ secrets.EXPLORER_SSH_HOST }}
          username: ${{ secrets.EXPLORER_SSH_USERNAME }}
          key: ${{ secrets.EXPLORER_SSH_KEY }}
          port: ${{ secrets.EXPLORER_SSH_PORT }}
          script: |
            sudo rm -rf local-ai/ || true
      - name: copy file via ssh
        uses: appleboy/scp-action@v0.1.7
        with:
          host: ${{ secrets.EXPLORER_SSH_HOST }}
          username: ${{ secrets.EXPLORER_SSH_USERNAME }}
          key: ${{ secrets.EXPLORER_SSH_KEY }}
          port: ${{ secrets.EXPLORER_SSH_PORT }}
          source: "local-ai"
          overwrite: true
          rm: true
          target: ./local-ai
      - name: restarting
        uses: appleboy/ssh-action@v1.2.0
        with:
          host: ${{ secrets.EXPLORER_SSH_HOST }}
          username: ${{ secrets.EXPLORER_SSH_USERNAME }}
          key: ${{ secrets.EXPLORER_SSH_KEY }}
          port: ${{ secrets.EXPLORER_SSH_PORT }}
          script: |
            sudo cp -rfv local-ai/local-ai /usr/bin/local-ai
            sudo systemctl restart local-ai

View File

@@ -1,83 +0,0 @@
name: Comment PRs

on:
  pull_request_target:

jobs:
  comment-pr:
    env:
      MODEL_NAME: hermes-2-theta-llama-3-8b
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
        with:
          ref: "${{ github.event.pull_request.merge_commit_sha }}"
          fetch-depth: 0 # needed to checkout all branches for this Action to work
      - uses: mudler/localai-github-action@v1
        with:
          model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
      # Check the PR diff using the current branch and the base branch of the PR
      - uses: GrantBirki/git-diff-action@v2.7.0
        id: git-diff-action
        with:
          json_diff_file_output: diff.json
          raw_diff_file_output: diff.txt
          file_output_only: "true"
          base_branch: ${{ github.event.pull_request.base.sha }}
      - name: Show diff
        env:
          DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
        run: |
          cat $DIFF
      - name: Summarize
        env:
          DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
        id: summarize
        run: |
          input="$(cat $DIFF)"

          # Define the LocalAI API endpoint
          API_URL="http://localhost:8080/chat/completions"

          # Create a JSON payload using jq to handle special characters
          json_payload=$(jq -n --arg input "$input" '{
            model: "'$MODEL_NAME'",
            messages: [
              {
                role: "system",
                content: "You are LocalAI-bot in Github that helps understanding PRs and assess complexity. Explain what has changed in this PR diff and why"
              },
              {
                role: "user",
                content: $input
              }
            ]
          }')

          # Send the request to LocalAI
          response=$(curl -s -X POST $API_URL \
            -H "Content-Type: application/json" \
            -d "$json_payload")

          # Extract the summary from the response
          summary="$(echo $response | jq -r '.choices[0].message.content')"

          # Print the summary
          # -H "Authorization: Bearer $API_KEY" \
          echo "Summary:"
          echo "$summary"
          echo "payload sent"
          echo "$json_payload"
          {
            echo 'message<<EOF'
            echo "$summary"
            echo EOF
          } >> "$GITHUB_OUTPUT"
          docker logs --tail 10 local-ai
      - uses: mshick/add-pr-comment@v2
        if: always()
        with:
          repo-token: ${{ secrets.UPDATE_BOT_TOKEN }}
          message: ${{ steps.summarize.outputs.message }}
          message-failure: |
            Uh oh! Could not analyze this PR, maybe it's too big?

View File

@@ -75,7 +75,7 @@ jobs:
       uses: actions/checkout@v4
     - name: Cache GRPC
-      uses: docker/build-push-action@v6
+      uses: docker/build-push-action@v5
       with:
         builder: ${{ steps.buildx.outputs.name }}
         # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
@@ -84,7 +84,7 @@ jobs:
         build-args: |
           GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
           GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
-          GRPC_VERSION=v1.65.0
+          GRPC_VERSION=v1.64.0
         context: .
         file: ./Dockerfile
         cache-to: type=gha,ignore-error=true

View File

@@ -15,7 +15,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - base-image: intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
             runs-on: 'ubuntu-latest'
             platforms: 'linux/amd64'
     runs-on: ${{matrix.runs-on}}
@@ -46,7 +46,7 @@ jobs:
       uses: actions/checkout@v4
     - name: Cache Intel images
-      uses: docker/build-push-action@v6
+      uses: docker/build-push-action@v5
       with:
         builder: ${{ steps.buildx.outputs.name }}
         build-args: |

View File

@@ -35,19 +35,18 @@ jobs:
       max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
       matrix:
         include:
-          # This is basically covered by the AIO test
-          # - build-type: ''
-          #   platforms: 'linux/amd64'
-          #   tag-latest: 'false'
-          #   tag-suffix: '-ffmpeg'
-          #   ffmpeg: 'true'
-          #   image-type: 'extras'
-          #   runs-on: 'arc-runner-set'
-          #   base-image: "ubuntu:22.04"
-          #   makeflags: "--jobs=3 --output-sync=target"
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'cublas'
             cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "5"
             platforms: 'linux/amd64'
             tag-latest: 'false'
             tag-suffix: '-cublas-cuda12-ffmpeg'
@@ -56,85 +55,85 @@ jobs:
             runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
             makeflags: "--jobs=3 --output-sync=target"
-          # - build-type: 'hipblas'
-          #   platforms: 'linux/amd64'
-          #   tag-latest: 'false'
-          #   tag-suffix: '-hipblas'
-          #   ffmpeg: 'false'
-          #   image-type: 'extras'
-          #   base-image: "rocm/dev-ubuntu-22.04:6.1"
-          #   grpc-base-image: "ubuntu:22.04"
-          #   runs-on: 'arc-runner-set'
-          #   makeflags: "--jobs=3 --output-sync=target"
-          # - build-type: 'sycl_f16'
-          #   platforms: 'linux/amd64'
-          #   tag-latest: 'false'
-          #   base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
-          #   grpc-base-image: "ubuntu:22.04"
-          #   tag-suffix: 'sycl-f16-ffmpeg'
-          #   ffmpeg: 'true'
-          #   image-type: 'extras'
-          #   runs-on: 'arc-runner-set'
-          #   makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: 'sycl-f16-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
-  # core-image-build:
-  #   uses: ./.github/workflows/image_build.yml
-  #   with:
-  #     tag-latest: ${{ matrix.tag-latest }}
-  #     tag-suffix: ${{ matrix.tag-suffix }}
-  #     ffmpeg: ${{ matrix.ffmpeg }}
-  #     image-type: ${{ matrix.image-type }}
-  #     build-type: ${{ matrix.build-type }}
-  #     cuda-major-version: ${{ matrix.cuda-major-version }}
-  #     cuda-minor-version: ${{ matrix.cuda-minor-version }}
-  #     platforms: ${{ matrix.platforms }}
-  #     runs-on: ${{ matrix.runs-on }}
-  #     base-image: ${{ matrix.base-image }}
-  #     grpc-base-image: ${{ matrix.grpc-base-image }}
-  #     makeflags: ${{ matrix.makeflags }}
-  #   secrets:
-  #     dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-  #     dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-  #     quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-  #     quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-  #   strategy:
-  #     matrix:
-  #       include:
-  #         - build-type: ''
-  #           platforms: 'linux/amd64'
-  #           tag-latest: 'false'
-  #           tag-suffix: '-ffmpeg-core'
-  #           ffmpeg: 'true'
-  #           image-type: 'core'
-  #           runs-on: 'ubuntu-latest'
-  #           base-image: "ubuntu:22.04"
-  #           makeflags: "--jobs=4 --output-sync=target"
-  #         - build-type: 'sycl_f16'
-  #           platforms: 'linux/amd64'
-  #           tag-latest: 'false'
-  #           base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
-  #           grpc-base-image: "ubuntu:22.04"
-  #           tag-suffix: 'sycl-f16-ffmpeg-core'
-  #           ffmpeg: 'true'
-  #           image-type: 'core'
-  #           runs-on: 'arc-runner-set'
-  #           makeflags: "--jobs=3 --output-sync=target"
-  #         - build-type: 'cublas'
-  #           cuda-major-version: "12"
-  #           cuda-minor-version: "0"
-  #           platforms: 'linux/amd64'
-  #           tag-latest: 'false'
-  #           tag-suffix: '-cublas-cuda12-ffmpeg-core'
-  #           ffmpeg: 'true'
-  #           image-type: 'core'
-  #           runs-on: 'ubuntu-latest'
-  #           base-image: "ubuntu:22.04"
-  #           makeflags: "--jobs=4 --output-sync=target"
-  #         - build-type: 'vulkan'
-  #           platforms: 'linux/amd64'
-  #           tag-latest: 'false'
-  #           tag-suffix: '-vulkan-ffmpeg-core'
-  #           ffmpeg: 'true'
-  #           image-type: 'core'
-  #           runs-on: 'ubuntu-latest'
-  #           base-image: "ubuntu:22.04"
-  #           makeflags: "--jobs=4 --output-sync=target"
+  core-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      makeflags: ${{ matrix.makeflags }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: 'sycl-f16-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "5"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
+          - build-type: 'vulkan'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-vulkan-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"

View File

@@ -13,78 +13,6 @@ concurrency:
 cancel-in-progress: true

 jobs:
-  hipblas-jobs:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-      base-image: ${{ matrix.base-image }}
-      grpc-base-image: ${{ matrix.grpc-base-image }}
-      aio: ${{ matrix.aio }}
-      makeflags: ${{ matrix.makeflags }}
-      latest-image: ${{ matrix.latest-image }}
-      latest-image-aio: ${{ matrix.latest-image-aio }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      # Pushing with all jobs in parallel
-      # eats the bandwidth of all the nodes
-      max-parallel: 2
-      matrix:
-        include:
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: '-hipblas-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            aio: "-aio-gpu-hipblas"
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            latest-image: 'latest-gpu-hipblas'
-            latest-image-aio: 'latest-aio-gpu-hipblas'
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.1"
-            grpc-base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-            makeflags: "--jobs=3 --output-sync=target"
   self-hosted-jobs:
     uses: ./.github/workflows/image_build.yml
     with:
@@ -111,7 +39,7 @@
     strategy:
       # Pushing with all jobs in parallel
       # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 5 || 8 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
       matrix:
         include:
           # Extra images
@@ -136,7 +64,7 @@
             makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'cublas'
             cuda-major-version: "11"
-            cuda-minor-version: "7"
+            cuda-minor-version: "8"
             platforms: 'linux/amd64'
             tag-latest: 'false'
             tag-suffix: '-cublas-cuda11'
@@ -147,7 +75,7 @@
             makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'cublas'
             cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "5"
            platforms: 'linux/amd64'
             tag-latest: 'false'
             tag-suffix: '-cublas-cuda12'
@@ -158,7 +86,7 @@
             makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'cublas'
             cuda-major-version: "11"
-            cuda-minor-version: "7"
+            cuda-minor-version: "8"
             platforms: 'linux/amd64'
             tag-latest: 'auto'
             tag-suffix: '-cublas-cuda11-ffmpeg'
@@ -172,7 +100,7 @@
             makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'cublas'
             cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "5"
             platforms: 'linux/amd64'
             tag-latest: 'auto'
             tag-suffix: '-cublas-cuda12-ffmpeg'
@@ -194,6 +122,29 @@
             base-image: "ubuntu:22.04"
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-hipblas-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            aio: "-aio-gpu-hipblas"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            latest-image: 'latest-gpu-hipblas'
+            latest-image-aio: 'latest-aio-gpu-hipblas'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
           - build-type: 'sycl_f16'
             platforms: 'linux/amd64'
             tag-latest: 'auto'
@@ -261,6 +212,26 @@
             image-type: 'core'
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"

   core-image-build:
     uses: ./.github/workflows/image_build.yml
@@ -280,7 +251,6 @@
       makeflags: ${{ matrix.makeflags }}
       latest-image: ${{ matrix.latest-image }}
       latest-image-aio: ${{ matrix.latest-image-aio }}
-      skip-drivers: ${{ matrix.skip-drivers }}
     secrets:
       dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
       dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -302,10 +272,9 @@
             latest-image: 'latest-cpu'
             latest-image-aio: 'latest-aio-cpu'
             makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'false'
           - build-type: 'cublas'
             cuda-major-version: "11"
-            cuda-minor-version: "7"
+            cuda-minor-version: "8"
             platforms: 'linux/amd64'
             tag-latest: 'false'
             tag-suffix: '-cublas-cuda11-core'
@@ -314,10 +283,9 @@
             base-image: "ubuntu:22.04"
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'false'
           - build-type: 'cublas'
             cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "5"
             platforms: 'linux/amd64'
             tag-latest: 'false'
             tag-suffix: '-cublas-cuda12-core'
@@ -326,10 +294,9 @@
             base-image: "ubuntu:22.04"
             runs-on: 'arc-runner-set'
             makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'false'
           - build-type: 'cublas'
             cuda-major-version: "11"
-            cuda-minor-version: "7"
+            cuda-minor-version: "8"
             platforms: 'linux/amd64'
             tag-latest: 'false'
             tag-suffix: '-cublas-cuda11-ffmpeg-core'
@@ -338,10 +305,9 @@
             runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
             makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'false'
           - build-type: 'cublas'
             cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "5"
             platforms: 'linux/amd64'
             tag-latest: 'false'
             tag-suffix: '-cublas-cuda12-ffmpeg-core'
@@ -349,7 +315,6 @@
             image-type: 'core'
             runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
-            skip-drivers: 'false'
             makeflags: "--jobs=4 --output-sync=target"
           - build-type: 'vulkan'
             platforms: 'linux/amd64'
@@ -360,45 +325,4 @@
             image-type: 'core'
             runs-on: 'arc-runner-set'
             base-image: "ubuntu:22.04"
-            skip-drivers: 'false'
             makeflags: "--jobs=4 --output-sync=target"
-  gh-runner:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-      aio: ${{ matrix.aio }}
-      base-image: ${{ matrix.base-image }}
-      grpc-base-image: ${{ matrix.grpc-base-image }}
-      makeflags: ${{ matrix.makeflags }}
-      latest-image: ${{ matrix.latest-image }}
-      latest-image-aio: ${{ matrix.latest-image-aio }}
-      skip-drivers: ${{ matrix.skip-drivers }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      matrix:
-        include:
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "0"
-            platforms: 'linux/arm64'
-            tag-latest: 'false'
-            tag-suffix: '-nvidia-l4t-arm64-core'
-            latest-image: 'latest-nvidia-l4t-arm64-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
-            runs-on: 'ubuntu-24.04-arm'
-            makeflags: "--jobs=4 --output-sync=target"
-            skip-drivers: 'true'

View File

@@ -23,7 +23,7 @@ on:
         type: string
       cuda-minor-version:
         description: 'CUDA minor version'
-        default: "4"
+        default: "5"
         type: string
       platforms:
         description: 'Platforms'
@@ -49,10 +49,6 @@ on:
         description: 'FFMPEG'
         default: ''
         type: string
-      skip-drivers:
-        description: 'Skip drivers by default'
-        default: 'false'
-        type: string
       image-type:
         description: 'Image type'
         default: ''
@@ -219,7 +215,7 @@ jobs:
           password: ${{ secrets.quayPassword }}
       - name: Build and push
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
        if: github.event_name != 'pull_request'
        with:
          builder: ${{ steps.buildx.outputs.name }}
@@ -236,9 +232,8 @@ jobs:
            BASE_IMAGE=${{ inputs.base-image }}
            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
-           GRPC_VERSION=v1.65.0
+           GRPC_VERSION=v1.64.0
            MAKEFLAGS=${{ inputs.makeflags }}
-           SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
@@ -248,7 +243,7 @@ jobs:
          labels: ${{ steps.meta.outputs.labels }}
       ### Start testing image
       - name: Build and push
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
        if: github.event_name == 'pull_request'
        with:
          builder: ${{ steps.buildx.outputs.name }}
@@ -265,9 +260,8 @@ jobs:
            BASE_IMAGE=${{ inputs.base-image }}
            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
-           GRPC_VERSION=v1.65.0
+           GRPC_VERSION=v1.64.0
            MAKEFLAGS=${{ inputs.makeflags }}
-           SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
          cache-from: type=gha
@@ -282,7 +276,7 @@ jobs:
       ## End testing image
       - name: Build and push AIO image
         if: inputs.aio != ''
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          build-args: |
@@ -297,7 +291,7 @@ jobs:
       - name: Build and push AIO image (dockerhub)
         if: inputs.aio != ''
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          build-args: |

View File

@@ -1,168 +0,0 @@
name: Notifications for new models

on:
  pull_request:
    types:
      - closed

jobs:
  notify-discord:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
      MODEL_NAME: hermes-2-theta-llama-3-8b
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # needed to checkout all branches for this Action to work
      - uses: mudler/localai-github-action@v1
        with:
          model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
      # Check the PR diff using the current branch and the base branch of the PR
      - uses: GrantBirki/git-diff-action@v2.7.0
        id: git-diff-action
        with:
          json_diff_file_output: diff.json
          raw_diff_file_output: diff.txt
          file_output_only: "true"
      - name: Summarize
        env:
          DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
        id: summarize
        run: |
          input="$(cat $DIFF)"

          # Define the LocalAI API endpoint
          API_URL="http://localhost:8080/chat/completions"

          # Create a JSON payload using jq to handle special characters
          json_payload=$(jq -n --arg input "$input" '{
            model: "'$MODEL_NAME'",
            messages: [
              {
                role: "system",
                content: "You are LocalAI-bot. Write a discord message to notify everyone about the new model from the git diff. Make it informal. An example can include: the URL of the model, the name, and a brief description of the model if exists. Also add an hint on how to install it in LocalAI and that can be browsed over https://models.localai.io. For example: local-ai run model_name_here"
              },
              {
                role: "user",
                content: $input
              }
            ]
          }')

          # Send the request to LocalAI
          response=$(curl -s -X POST $API_URL \
            -H "Content-Type: application/json" \
            -d "$json_payload")

          # Extract the summary from the response
          summary="$(echo $response | jq -r '.choices[0].message.content')"

          # Print the summary
          # -H "Authorization: Bearer $API_KEY" \
          echo "Summary:"
          echo "$summary"
          echo "payload sent"
          echo "$json_payload"
          {
            echo 'message<<EOF'
            echo "$summary"
            echo EOF
          } >> "$GITHUB_OUTPUT"
          docker logs --tail 10 local-ai
      - name: Discord notification
        env:
          DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK_URL }}
          DISCORD_USERNAME: "LocalAI-Bot"
          DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
        uses: Ilshidur/action-discord@master
        with:
          args: ${{ steps.summarize.outputs.message }}
      - name: Setup tmate session if fails
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true
  notify-twitter:
    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
    env:
      MODEL_NAME: hermes-2-theta-llama-3-8b
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # needed to checkout all branches for this Action to work
      - name: Start LocalAI
        run: |
          echo "Starting LocalAI..."
          docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
          until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
      # Check the PR diff using the current branch and the base branch of the PR
      - uses: GrantBirki/git-diff-action@v2.7.0
        id: git-diff-action
        with:
          json_diff_file_output: diff.json
          raw_diff_file_output: diff.txt
          file_output_only: "true"
      - name: Summarize
        env:
          DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
        id: summarize
        run: |
          input="$(cat $DIFF)"

          # Define the LocalAI API endpoint
          API_URL="http://localhost:8080/chat/completions"

          # Create a JSON payload using jq to handle special characters
          json_payload=$(jq -n --arg input "$input" '{
            model: "'$MODEL_NAME'",
            messages: [
              {
                role: "system",
                content: "You are LocalAI-bot. Write a twitter message to notify everyone about the new model from the git diff. Make it informal and really short. An example can include: the name, and a brief description of the model if exists. Also add an hint on how to install it in LocalAI. For example: local-ai run model_name_here"
              },
              {
                role: "user",
                content: $input
              }
            ]
          }')

          # Send the request to LocalAI
          response=$(curl -s -X POST $API_URL \
            -H "Content-Type: application/json" \
            -d "$json_payload")

          # Extract the summary from the response
          summary="$(echo $response | jq -r '.choices[0].message.content')"

          # Print the summary
          # -H "Authorization: Bearer $API_KEY" \
          echo "Summary:"
          echo "$summary"
          echo "payload sent"
          echo "$json_payload"
          {
            echo 'message<<EOF'
            echo "$summary"
            echo EOF
          } >> "$GITHUB_OUTPUT"
          docker logs --tail 10 local-ai
      - uses: Eomm/why-don-t-you-tweet@v2
        with:
          tweet-message: ${{ steps.summarize.outputs.message }}
        env:
          # Get your tokens from https://developer.twitter.com/apps
          TWITTER_CONSUMER_API_KEY: ${{ secrets.TWITTER_APP_KEY }}
          TWITTER_CONSUMER_API_SECRET: ${{ secrets.TWITTER_APP_SECRET }}
          TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }}
          TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
      - name: Setup tmate session if fails
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.19
        with:
          detached: true
          connect-timeout-seconds: 180
          limit-access-to-actor: true

View File

@@ -1,63 +0,0 @@
name: Release notifications
on:
release:
types:
- published
jobs:
notify-discord:
runs-on: ubuntu-latest
env:
RELEASE_BODY: ${{ github.event.release.body }}
RELEASE_TITLE: ${{ github.event.release.name }}
RELEASE_TAG_NAME: ${{ github.event.release.tag_name }}
steps:
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
- name: Summarize
id: summarize
run: |
input="$RELEASE_TITLE\b$RELEASE_BODY"
# Define the LocalAI API endpoint
API_URL="http://localhost:8080/chat/completions"
# Create a JSON payload using jq to handle special characters
json_payload=$(jq -n --arg input "$input" '{
model: "'$MODEL_NAME'",
messages: [
{
role: "system",
content: "Write a discord message with a bullet point summary of the release notes."
},
{
role: "user",
content: $input
}
]
}')
# Send the request to LocalAI API
response=$(curl -s -X POST $API_URL \
-H "Content-Type: application/json" \
-d "$json_payload")
# Extract the summary from the response
summary=$(echo $response | jq -r '.choices[0].message.content')
# Print the summary
# -H "Authorization: Bearer $API_KEY" \
{
echo 'message<<EOF'
echo "$summary"
echo EOF
} >> "$GITHUB_OUTPUT"
- name: Discord notification
env:
DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK_URL_RELEASE }}
DISCORD_USERNAME: "LocalAI-Bot"
DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
uses: Ilshidur/action-discord@master
with:
args: ${{ steps.summarize.outputs.message }}
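
Both notification workflows above return the model's multi-line reply to later steps through `$GITHUB_OUTPUT` with a heredoc-style delimiter. The pattern in isolation (a sketch; `message` and `EOF` match the names used above):

# Sketch: write a multi-line step output.
# A plain "message=$summary" line would be cut at the first newline,
# so the value is fenced between "message<<EOF" and a closing "EOF".
summary=$'first line\nsecond line'
{
  echo 'message<<EOF'
  echo "$summary"
  echo EOF
} >> "$GITHUB_OUTPUT"
# Later steps read it as ${{ steps.<step-id>.outputs.message }}.
# If the text could itself contain a bare "EOF" line, use a random
# delimiter (e.g. from "openssl rand -hex 8") instead.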

View File

@@ -1,28 +0,0 @@
name: Check PR style
on:
pull_request_target:
types:
- opened
- reopened
- edited
- synchronize
jobs:
title-lint:
runs-on: ubuntu-latest
permissions:
statuses: write
steps:
- uses: aslafy-z/conventional-pr-title-action@v3
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# check-pr-description:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v2
# - uses: jadrol/pr-description-checker-action@v1.0.0
# id: description-checker
# with:
# repo-token: ${{ secrets.GITHUB_TOKEN }}
# exempt-labels: no qa

View File

@@ -1,15 +1,11 @@
name: Build and Release
on:
-   push:
-     branches:
-       - master
-     tags:
-       - 'v*'
-   pull_request:
+ - push
+ - pull_request
env:
-   GRPC_VERSION: v1.65.0
+   GRPC_VERSION: v1.64.0
permissions:
contents: write
@@ -31,11 +27,12 @@ jobs:
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
-   sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
+   sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
-   sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
+   sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
@@ -43,7 +40,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y cuda-cross-aarch64 cuda-nvcc-cross-aarch64-${CUDA_VERSION} libcublas-cross-aarch64-${CUDA_VERSION}
env:
-   CUDA_VERSION: 12-4
+   CUDA_VERSION: 12-5
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v4
@@ -55,8 +52,7 @@ jobs:
run: |
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-   cd grpc && sed -i "216i\ TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && \
-   cd cmake/build && cmake -DgRPC_INSTALL=ON \
+   cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make --jobs 5 --output-sync=target
- name: Install gRPC
@@ -100,13 +96,14 @@ jobs:
CROSS_TOOLCHAIN=/usr/$GNU_HOST
CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
-   go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-   go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+   go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+   go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
export PATH=$PATH:$GOPATH/bin
export PATH=/usr/local/cuda/bin:$PATH
sudo rm -rf /usr/aarch64-linux-gnu/lib/libstdc++.so.6
sudo cp -rf /usr/aarch64-linux-gnu/lib/libstdc++.so* /usr/aarch64-linux-gnu/lib/libstdc++.so.6
sudo cp /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 ld.so
+   GO_TAGS=p2p \
BACKEND_LIBS="./grpc/cmake/cross_build/third_party/re2/libre2.a ./grpc/cmake/cross_build/libgrpc.a ./grpc/cmake/cross_build/libgrpc++.a ./grpc/cmake/cross_build/third_party/protobuf/libprotobuf.a /usr/aarch64-linux-gnu/lib/libc.so.6 /usr/aarch64-linux-gnu/lib/libstdc++.so.6 /usr/aarch64-linux-gnu/lib/libgomp.so.1 /usr/aarch64-linux-gnu/lib/libm.so.6 /usr/aarch64-linux-gnu/lib/libgcc_s.so.1 /usr/aarch64-linux-gnu/lib/libdl.so.2 /usr/aarch64-linux-gnu/lib/libpthread.so.0 ./ld.so" \
GOOS=linux \
GOARCH=arm64 \
@@ -123,7 +120,7 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
-   uses: mxschmitt/action-tmate@v3.19
+   uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
@@ -150,7 +147,7 @@ jobs:
- name: Dependencies
run: |
sudo apt-get update
-   sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
+   sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache cmake
- name: Intel Dependencies
run: |
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
@@ -164,7 +161,7 @@ jobs:
sudo apt-get update
sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
env:
-   CUDA_VERSION: 12-5
+   CUDA_VERSION: 12-3
- name: "Install Hipblas"
env:
ROCM_VERSION: "6.1"
@@ -200,8 +197,7 @@ jobs:
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-   cd grpc && sed -i "216i\ TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && \
-   cd cmake/build && cmake -DgRPC_INSTALL=ON \
+   cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make --jobs 5 --output-sync=target
- name: Install gRPC
@@ -211,14 +207,15 @@ jobs:
- name: Build
id: build
run: |
-   go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-   go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+   go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+   go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
export PATH=$PATH:$GOPATH/bin
export PATH=/usr/local/cuda/bin:$PATH
export PATH=/opt/rocm/bin:$PATH
source /opt/intel/oneapi/setvars.sh
sudo cp /lib64/ld-linux-x86-64.so.2 ld.so
-   BACKEND_LIBS="./ld.so ./sources/go-piper/piper/build/fi/lib/libfmt.a ./sources/go-piper/piper-phonemize/pi/lib/libonnxruntime.so.1.14.1 ./sources/go-piper/piper-phonemize/pi/src/libespeak-ng/libespeak-ng.so /usr/lib/x86_64-linux-gnu/libdl.so.2 /usr/lib/x86_64-linux-gnu/librt.so.1 /usr/lib/x86_64-linux-gnu/libpthread.so.0 ./sources/go-piper/piper-phonemize/pi/lib/libpiper_phonemize.so.1 ./sources/go-piper/piper/build/si/lib/libspdlog.a ./sources/go-piper/espeak/ei/lib/libucd.so" \
+   GO_TAGS=p2p \
+   BACKEND_LIBS="./ld.so /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 /usr/lib/x86_64-linux-gnu/libc.so.6 /usr/lib/x86_64-linux-gnu/libgomp.so.1" \
make -j4 dist
- uses: actions/upload-artifact@v4
with:
@@ -232,15 +229,48 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
-   uses: mxschmitt/action-tmate@v3.19
+   uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
+ build-stablediffusion:
+   runs-on: ubuntu-latest
+   steps:
+   - name: Clone
+     uses: actions/checkout@v4
+     with:
+       submodules: true
+   - uses: actions/setup-go@v5
+     with:
+       go-version: '1.21.x'
+       cache: false
+   - name: Dependencies
+     run: |
+       sudo apt-get update
+       sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
+       go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+       go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+   - name: Build stablediffusion
+     run: |
+       export PATH=$PATH:$GOPATH/bin
+       make backend-assets/grpc/stablediffusion
+       mkdir -p release && cp backend-assets/grpc/stablediffusion release
+     env:
+       GO_TAGS: stablediffusion
+   - uses: actions/upload-artifact@v4
+     with:
+       name: stablediffusion
+       path: release/
+   - name: Release
+     uses: softprops/action-gh-release@v2
+     if: startsWith(github.ref, 'refs/tags/')
+     with:
+       files: |
+         release/*
- build-macOS-x86_64:
-   runs-on: macos-13
+ build-macOS-arm64:
+   runs-on: macos-14
steps:
- name: Clone
uses: actions/checkout@v4
@@ -261,50 +291,8 @@ jobs:
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
export PATH=$PATH:$GOPATH/bin
-       export SKIP_GRPC_BACKEND=backend-assets/grpc/whisper
-       make dist
-   - uses: actions/upload-artifact@v4
-     with:
-       name: LocalAI-MacOS-x86_64
-       path: release/
-   - name: Release
-     uses: softprops/action-gh-release@v2
-     if: startsWith(github.ref, 'refs/tags/')
-     with:
-       files: |
-         release/*
-   - name: Setup tmate session if tests fail
-     if: ${{ failure() }}
-     uses: mxschmitt/action-tmate@v3.19
-     with:
-       detached: true
-       connect-timeout-seconds: 180
-       limit-access-to-actor: true
- build-macOS-arm64:
-   runs-on: macos-14
-   steps:
-   - name: Clone
-     uses: actions/checkout@v4
-     with:
-       submodules: true
-   - uses: actions/setup-go@v5
-     with:
-       go-version: '1.21.x'
-       cache: false
-   - name: Dependencies
-     run: |
-       brew install protobuf grpc libomp llvm
-       go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-       go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-   - name: Build
-     id: build
-     run: |
-       export C_INCLUDE_PATH=/usr/local/include
-       export CPLUS_INCLUDE_PATH=/usr/local/include
-       export PATH=$PATH:$GOPATH/bin
-       export CC=/opt/homebrew/opt/llvm/bin/clang
-       make dist
+       BACKEND_LIBS="$(ls /opt/homebrew/opt/grpc/lib/*.dylib /opt/homebrew/opt/re2/lib/*.dylib /opt/homebrew/opt/openssl@3/lib/*.dylib /opt/homebrew/opt/protobuf/lib/*.dylib /opt/homebrew/opt/abseil/lib/*.dylib | xargs)" GO_TAGS=p2p make dist
- uses: actions/upload-artifact@v4
with:
name: LocalAI-MacOS-arm64
@@ -317,7 +305,7 @@ jobs:
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
-   uses: mxschmitt/action-tmate@v3.19
+   uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
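
A recurring theme in the hunks above is `BACKEND_LIBS`: a list of shared objects (libstdc++, libgomp, even the dynamic loader) copied next to the built binary so the release artifact does not depend on host library versions. One way to check, for a native build, which dependencies would still resolve from the host rather than the bundle (a sketch; the release/local-ai path is an assumption based on the artifact layout above):

#!/bin/bash
# Sketch: flag dynamic dependencies of the binary that are not present
# in the bundle directory next to it.
BIN=release/local-ai   # assumed artifact path
BUNDLE_DIR=release
ldd "$BIN" | awk '$2 == "=>" && $3 ~ /^\// {print $1, $3}' |
while read -r lib path; do
  if [ ! -e "$BUNDLE_DIR/$(basename "$path")" ]; then
    echo "resolved from host: $lib -> $path"
  fi
done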

View File

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
-   uses: securego/gosec@v2.22.0
+   uses: securego/gosec@master
with:
# we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'

View File

@@ -29,12 +29,36 @@ jobs:
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
-   pip install --user --no-cache-dir grpcio-tools==1.64.1
+   pip install --user grpcio-tools==1.64.0
- name: Test transformers
run: |
make --jobs=5 --output-sync=target -C backend/python/transformers
make --jobs=5 --output-sync=target -C backend/python/transformers test
tests-sentencetransformers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.64.0
- name: Test sentencetransformers
run: |
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
tests-rerankers:
runs-on: ubuntu-latest
steps:
@@ -50,7 +74,7 @@ jobs:
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
-   pip install --user --no-cache-dir grpcio-tools==1.64.1
+   pip install --user grpcio-tools==1.64.0
- name: Test rerankers
run: |
@@ -72,13 +96,81 @@ jobs:
sudo apt-get install -y libopencv-dev
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
-   pip install --user --no-cache-dir grpcio-tools==1.64.1
+   pip install --user grpcio-tools==1.64.0
- name: Test diffusers
run: |
make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test
- # tests-transformers-musicgen:
+ tests-parler-tts:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.64.0
- name: Test parler-tts
run: |
make --jobs=5 --output-sync=target -C backend/python/parler-tts
make --jobs=5 --output-sync=target -C backend/python/parler-tts test
tests-openvoice:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.64.0
- name: Test openvoice
run: |
make --jobs=5 --output-sync=target -C backend/python/openvoice
make --jobs=5 --output-sync=target -C backend/python/openvoice test
tests-transformers-musicgen:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.64.0
- name: Test transformers-musicgen
run: |
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
# tests-petals:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
@@ -93,12 +185,14 @@ jobs:
# curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
-   # pip install --user --no-cache-dir grpcio-tools==1.64.1
+   # pip install --user grpcio-tools==1.64.0
-   # - name: Test transformers-musicgen
+   # - name: Test petals
# run: |
-   # make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
-   # make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
+   # make --jobs=5 --output-sync=target -C backend/python/petals
+   # make --jobs=5 --output-sync=target -C backend/python/petals test
# tests-bark:
# runs-on: ubuntu-latest
@@ -155,7 +249,7 @@ jobs:
# curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
-   # pip install --user --no-cache-dir grpcio-tools==1.64.1
+   # pip install --user grpcio-tools==1.64.0
# - name: Test bark
# run: |
@@ -180,11 +274,31 @@ jobs:
# curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
-   # pip install --user --no-cache-dir grpcio-tools==1.64.1
+   # pip install --user grpcio-tools==1.64.0
# - name: Test vllm
# run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test
tests-vallex:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.64.0
- name: Test vall-e-x
run: |
make --jobs=5 --output-sync=target -C backend/python/vall-e-x
make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
tests-coqui:
runs-on: ubuntu-latest
@@ -200,7 +314,7 @@ jobs:
sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
-   pip install --user --no-cache-dir grpcio-tools==1.64.1
+   pip install --user grpcio-tools==1.64.0
- name: Test coqui
run: |
make --jobs=5 --output-sync=target -C backend/python/coqui
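
Every job in this matrix follows the same recipe: install the shared system dependencies, build the backend's environment with `make -C backend/python/<name>`, then run its `test` target. Condensed into a local script (a sketch assuming an Ubuntu host; the backend name is whichever directory you want to exercise):

#!/bin/bash
set -e
# Sketch: reproduce one extras-backend test job locally.
BACKEND=${1:-coqui}   # e.g. coqui, diffusers, rerankers, ...
sudo apt-get update
sudo apt-get install -y build-essential ffmpeg ca-certificates cmake curl patch python3-pip libopencv-dev
curl -LsSf https://astral.sh/uv/install.sh | sh   # uv, as in the workflow
pip install --user grpcio-tools==1.64.0           # same pin as the workflow
make --jobs=5 --output-sync=target -C "backend/python/$BACKEND"        # build the backend env
make --jobs=5 --output-sync=target -C "backend/python/$BACKEND" test   # run its tests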

View File

@@ -10,7 +10,7 @@ on:
- '*'
env:
-   GRPC_VERSION: v1.65.0
+   GRPC_VERSION: v1.64.0
concurrency:
group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -70,8 +70,7 @@ jobs:
- name: Dependencies
run: |
sudo apt-get update
-   sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
-   sudo apt-get install -y libgmock-dev
+   sudo apt-get install build-essential curl ffmpeg
curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
@@ -94,20 +93,23 @@ jobs:
sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
export CUDACXX=/usr/local/cuda/bin/nvcc
-   go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-   go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+   go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+   go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
# The python3-grpc-tools package in 22.04 is too old
pip install --user grpcio-tools
-   make -C backend/python/transformers
+   sudo rm -rfv /usr/bin/conda || true
+   PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
# Pre-build piper before we start tests in order to have shared libraries in place
make sources/go-piper && \
GO_TAGS="tts" make -C sources/go-piper piper.o && \
-   sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/
+   sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
+   # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
+   PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
env:
-   CUDA_VERSION: 12-4
+   CUDA_VERSION: 12-3
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v4
@@ -118,8 +120,7 @@ jobs:
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --jobs 5 --shallow-submodules https://github.com/grpc/grpc && \
-   cd grpc && sed -i "216i\ TESTONLY" "third_party/abseil-cpp/absl/container/CMakeLists.txt" && mkdir -p cmake/build && cd cmake/build && \
-   cmake -DgRPC_INSTALL=ON \
+   cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make --jobs 5
- name: Install gRPC
@@ -127,10 +128,10 @@ jobs:
cd grpc && cd cmake/build && sudo make --jobs 5 install
- name: Test
run: |
-   PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
+   PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
-   uses: mxschmitt/action-tmate@v3.19
+   uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
@@ -175,26 +176,17 @@ jobs:
uses: actions/checkout@v4
with:
submodules: true
-   - name: Dependencies
-     run: |
-       # Install protoc
-       curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
-       unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-       rm protoc.zip
-       go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
-       go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-       PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Build images
run: |
docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
- name: Test
run: |
-   PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
+   LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
make run-e2e-aio
- name: Setup tmate session if tests fail
if: ${{ failure() }}
-   uses: mxschmitt/action-tmate@v3.19
+   uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
@@ -220,19 +212,18 @@ jobs:
run: go version
- name: Dependencies
run: |
-   brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
-   pip install --user --no-cache-dir grpcio-tools
+   brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
+   pip install --user grpcio-tools==1.64.0
- name: Test
run: |
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
-   export CC=/opt/homebrew/opt/llvm/bin/clang
# Used to run the newer GNUMake version from brew that supports --output-sync
export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail
if: ${{ failure() }}
-   uses: mxschmitt/action-tmate@v3.19
+   uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
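
Both CI files fall back to building gRPC from source at the pinned `GRPC_VERSION` when the cache misses. Pulled out of the YAML, the recipe is roughly (a sketch; the tag follows the v1.64.x side of the diff):

#!/bin/bash
set -e
# Sketch: the gRPC source build the workflows run on a cache miss.
GRPC_VERSION=v1.64.0
git clone --recurse-submodules -b "$GRPC_VERSION" --depth 1 --jobs 5 \
  --shallow-submodules https://github.com/grpc/grpc
mkdir -p grpc/cmake/build
cd grpc/cmake/build
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../..
make --jobs 5 --output-sync=target
sudo make install   # installs under /usr/local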

View File

@@ -13,19 +13,13 @@ jobs:
- uses: actions/setup-go@v5
with:
go-version: 'stable'
-   - name: Dependencies
-     run: |
-       sudo apt-get update
-       sudo apt-get install protobuf-compiler
- run: |
go install github.com/swaggo/swag/cmd/swag@latest
-     go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
-     go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
- name: Bump swagger 🔧
run: |
-   make protogen-go swagger
+   make swagger
- name: Create Pull Request
-   uses: peter-evans/create-pull-request@v7
+   uses: peter-evans/create-pull-request@v6
with:
token: ${{ secrets.UPDATE_BOT_TOKEN }}
push-to-fork: ci-forks/LocalAI

.gitignore
View File

@@ -2,7 +2,6 @@
/sources/
__pycache__/
*.a
- *.o
get-sources
prepare-sources
/backend/cpp/llama/grpc-server
@@ -13,6 +12,7 @@ prepare-sources
go-ggml-transformers
go-gpt2
+ go-rwkv
whisper.cpp
/bloomz
go-bert
@@ -54,6 +54,3 @@ docs/static/gallery.html
# backend virtual environments
**/venv
- # per-developer customization files for the development container
- .devcontainer/customization/*

.vscode/launch.json
View File

@@ -3,12 +3,12 @@
"configurations": [ "configurations": [
{ {
"name": "Python: Current File", "name": "Python: Current File",
"type": "debugpy", "type": "python",
"request": "launch", "request": "launch",
"program": "${file}", "program": "${file}",
"console": "integratedTerminal", "console": "integratedTerminal",
"justMyCode": false, "justMyCode": false,
"cwd": "${fileDirname}", "cwd": "${workspaceFolder}/examples/langchain-chroma",
"env": { "env": {
"OPENAI_API_BASE": "http://localhost:8080/v1", "OPENAI_API_BASE": "http://localhost:8080/v1",
"OPENAI_API_KEY": "abc" "OPENAI_API_KEY": "abc"
@@ -19,16 +19,15 @@
"type": "go", "type": "go",
"request": "launch", "request": "launch",
"mode": "debug", "mode": "debug",
"program": "${workspaceRoot}", "program": "${workspaceFolder}/main.go",
"args": [], "args": [
"api"
],
"env": { "env": {
"LOCALAI_LOG_LEVEL": "debug", "C_INCLUDE_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
"LOCALAI_P2P": "true", "LIBRARY_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
"LOCALAI_FEDERATED": "true" "DEBUG": "true"
}, }
"buildFlags": ["-tags", "p2p tts", "-v"],
"envFile": "${workspaceFolder}/.env",
"cwd": "${workspaceRoot}"
} }
] ]
} }

View File

@@ -15,6 +15,8 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
- [Documentation](#documentation)
- [Community and Communication](#community-and-communication)
## Getting Started
### Prerequisites
@@ -52,7 +54,7 @@ If you find a bug, have a feature request, or encounter any issues, please check
## Coding Guidelines
-   - No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.
+   - No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
## Testing
@@ -82,3 +84,5 @@ We are welcome the contribution of the documents, please open new PR or create a
- You can reach out via the Github issue tracker.
- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
+ ---

View File

@@ -5,45 +5,38 @@ ARG INTEL_BASE_IMAGE=${BASE_IMAGE}
# The requirements-core target is common to all images. It should not be placed in requirements-core unless every single build will use it.
FROM ${BASE_IMAGE} AS requirements-core
- # TODO(mudler): install all accellerators here
- # and use make dist instead of build.
- # TODO(mudler): modify make dist to build also go-piper and stablediffusion
- # This way the same binary can work for everything(!)
- # TODO(mudler): also make sure that we bundle all the required libs in the backend-assets/lib
- # For the GPU-accell we are going to generate a tar file instead that will be extracted by the bash installer, and the libs will also be installed in the final docker image, so no need to pull ALL the dependencies
USER root
- ARG GO_VERSION=1.22.6
- ARG CMAKE_VERSION=3.26.4
- ARG CMAKE_FROM_SOURCE=false
+ ARG GO_VERSION=1.22.4
ARG TARGETARCH
ARG TARGETVARIANT
ENV DEBIAN_FRONTEND=noninteractive
- ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
+ ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    build-essential \
    ccache \
    ca-certificates \
-   curl libssl-dev \
+   cmake \
+   curl \
    git \
-   unzip upx-ucl && \
+   unzip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
- # Install CMake (the version in 22.04 is too old)
- RUN <<EOT bash
-   if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
-     curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-   else
-     apt-get update && \
-     apt-get install -y \
-     cmake && \
-     apt-get clean && \
-     rm -rf /var/lib/apt/lists/*
-   fi
- EOT
# Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
- ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
+ ENV PATH $PATH:/root/go/bin:/usr/local/go/bin
# Install grpc compilers
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
@@ -52,46 +45,49 @@ RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates
- RUN test -n "$TARGETARCH" \
-     || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
# Use the variables in subsequent instructions
RUN echo "Target Architecture: $TARGETARCH"
RUN echo "Target Variant: $TARGETVARIANT"
# Cuda
- ENV PATH=/usr/local/cuda/bin:${PATH}
+ ENV PATH /usr/local/cuda/bin:${PATH}
# HipBLAS requirements
- ENV PATH=/opt/rocm/bin:${PATH}
+ ENV PATH /opt/rocm/bin:${PATH}
- # OpenBLAS requirements and stable diffusion
+ # OpenBLAS requirements and stable diffusion, tts (espeak)
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-   libopenblas-dev && \
+   libopenblas-dev \
+   espeak-ng \
+   espeak \
+   libopencv-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
+ # Set up OpenCV
+ RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
WORKDIR /build
+ RUN test -n "$TARGETARCH" \
+     || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
###################################
###################################
# The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
FROM requirements-core AS requirements-extras
- # Install uv as a system package
- RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.cargo/bin:${PATH}"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
+   espeak-ng \
+   espeak \
    python3-pip \
    python-is-python3 \
-   python3-dev llvm \
+   python3-dev \
    python3-venv && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
@@ -103,27 +99,25 @@ RUN pip install --user grpcio-tools
###################################
###################################
- # The requirements-drivers target is for BUILD_TYPE specific items. If you need to install something specific to CUDA, or specific to ROCM, it goes here.
- # This target will be built on top of requirements-core or requirements-extras as determined by the IMAGE_TYPE build-arg
- FROM requirements-${IMAGE_TYPE} AS requirements-drivers
+ # Base image for the build-type.
+ FROM requirements-${IMAGE_TYPE} AS run-requirements-drivers
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=12
- ARG CUDA_MINOR_VERSION=0
- ARG SKIP_DRIVERS=false
+ ARG CUDA_MINOR_VERSION=5
ENV BUILD_TYPE=${BUILD_TYPE}
# Vulkan requirements
RUN <<EOT bash
-   if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+   if [ "${BUILD_TYPE}" = "vulkan" ]; then
    apt-get update && \
    apt-get install -y --no-install-recommends \
    software-properties-common pciutils wget gpg-agent && \
    wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt-get update && \
    apt-get install -y \
    vulkan-sdk && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
@@ -132,16 +126,36 @@ EOT
# CuBLAS requirements
RUN <<EOT bash
-   if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+   if [ "${BUILD_TYPE}" = "cublas" ]; then
    apt-get update && \
    apt-get install -y --no-install-recommends \
    software-properties-common pciutils
    if [ "amd64" = "$TARGETARCH" ]; then
    curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
    fi
    if [ "arm64" = "$TARGETARCH" ]; then
    curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
    fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils && \
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
@@ -153,12 +167,11 @@ RUN <<EOT bash
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
-   rm -rf /var/lib/apt/lists/*
-   fi
- EOT
+   rm -rf /var/lib/apt/lists/* \
+   ; fi
# If we are building with clblas support, we need the libraries for the builds
- RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
@@ -166,7 +179,7 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
rm -rf /var/lib/apt/lists/* \
; fi
- RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
@@ -178,6 +191,82 @@ RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
ldconfig \
; fi
# The build-requirements-drivers target is for BUILD_TYPE specific items. If you need to install something specific to CUDA, or specific to ROCM, it goes here.
# This target will be built on top of requirements-core or requirements-extras as retermined by the IMAGE_TYPE build-arg
FROM requirements-${IMAGE_TYPE} AS build-requirements-drivers
ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=5
ENV BUILD_TYPE=${BUILD_TYPE}
# Vulkan requirements
RUN <<EOT bash
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt-get update && \
apt-get install -y \
vulkan-sdk && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
EOT
# CuBLAS requirements
RUN <<EOT bash
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
EOT
# clblas
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# intel
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && apt update && apt install -y intel-basekit && apt-get clean && \
rm -rf /var/lib/apt/lists/*
# hipblas
RUN wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \
gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && apt-get update && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.1.2/ubuntu jammy main" \
| tee /etc/apt/sources.list.d/amdgpu.list && \
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.1.2 jammy main" | tee --append /etc/apt/sources.list.d/rocm.list && printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | tee /etc/apt/preferences.d/rocm-pin-600 && \
apt update && \
apt-get install -y --no-install-recommends \
hipblas-dev rocm-dev \
rocblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig
###################################
###################################
@@ -198,9 +287,7 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
- ARG GRPC_VERSION=v1.65.0
- ARG CMAKE_FROM_SOURCE=false
- ARG CMAKE_VERSION=3.26.4
+ ARG GRPC_VERSION=v1.64.2
ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
@@ -209,31 +296,18 @@ WORKDIR /build
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
    ca-certificates \
-   build-essential curl libssl-dev \
+   build-essential \
+   cmake \
    git && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
- # Install CMake (the version in 22.04 is too old)
- RUN <<EOT bash
-   if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
-     curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
-   else
-     apt-get update && \
-     apt-get install -y \
-     cmake && \
-     apt-get clean && \
-     rm -rf /var/lib/apt/lists/*
-   fi
- EOT
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
# and running make install in the target container
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
    mkdir -p /build/grpc/cmake/build && \
    cd /build/grpc/cmake/build && \
-   sed -i "216i\ TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
    make && \
    make install && \
@@ -242,14 +316,13 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
###################################
###################################
- # The builder-base target has the arguments, variables, and copies shared between full builder images and the uncompiled devcontainer
- FROM requirements-drivers AS builder-base
- ARG GO_TAGS="tts p2p"
+ # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
+ # Adjustments to the build process should likely be made here.
+ FROM build-requirements-drivers AS builder
+ ARG GO_TAGS="stablediffusion tts p2p"
ARG GRPC_BACKENDS
ARG MAKEFLAGS
- ARG LD_FLAGS="-s -w"
ENV GRPC_BACKENDS=${GRPC_BACKENDS}
ENV GO_TAGS=${GO_TAGS}
@@ -257,12 +330,14 @@ ENV MAKEFLAGS=${MAKEFLAGS}
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
ENV NVIDIA_VISIBLE_DEVICES=all
- ENV LD_FLAGS=${LD_FLAGS}
- RUN echo "GO_TAGS: $GO_TAGS" && echo "TARGETARCH: $TARGETARCH"
WORKDIR /build
+ COPY . .
+ COPY .git .
+ RUN echo "GO_TAGS: $GO_TAGS"
+ RUN make prepare
# We need protoc installed, and the version in 22.04 is too old. We will create one as part installing the GRPC build below
# but that will also bring in a newer version of absl which stablediffusion cannot compile with. This version of protoc is only
@@ -280,33 +355,16 @@ RUN <<EOT bash
fi
EOT
- ###################################
- ###################################
- # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
- # Adjustments to the build process should likely be made here.
- FROM builder-base AS builder
+ # stablediffusion does not tolerate a newer version of abseil, build it first
+ RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
# Install the pre-built GRPC
COPY --from=grpc /opt/grpc /usr/local
# Rebuild with defaults backends
WORKDIR /build
- COPY . .
- COPY .git .
- RUN make prepare
- ## Build the binary
- ## If it's CUDA or hipblas, we want to skip some of the llama-compat backends to save space
- ## We only leave the most CPU-optimized variant and the fallback for the cublas/hipblas build
- ## (both will use CUDA or hipblas for the actual computation)
- RUN if [ "${BUILD_TYPE}" = "cublas" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
-     SKIP_GRPC_BACKEND="backend-assets/grpc/llama-cpp-avx512 backend-assets/grpc/llama-cpp-avx backend-assets/grpc/llama-cpp-avx2" make build; \
- else \
-     make build; \
- fi
+ # Need to build tts and stablediffusion separately first (?)
+ RUN make dist && rm release/*.sha256 && mv release/* local-ai
RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
    mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
@@ -316,41 +374,9 @@ RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
###################################
###################################
# The devcontainer target is not used on CI. It is a target for developers to use locally -
# rather than copying files it mounts them locally and leaves building to the developer
FROM builder-base AS devcontainer
ARG FFMPEG
COPY --from=grpc /opt/grpc /usr/local
COPY .devcontainer-scripts /.devcontainer-scripts
# Add FFmpeg
RUN if [ "${FFMPEG}" = "true" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ssh less wget
# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
RUN go install github.com/go-delve/delve/cmd/dlv@latest
RUN go install github.com/mikefarah/yq/v4@latest
###################################
###################################
# This is the final target. The result of this target will be the image uploaded to the registry.
# If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
- FROM requirements-drivers
+ FROM run-requirements-drivers
ARG FFMPEG
ARG BUILD_TYPE
@@ -395,30 +421,45 @@ RUN make prepare-sources
COPY --from=builder /build/local-ai ./
# Copy shared libraries for piper
- # TODO(mudler): bundle these libs in backend-assets/lib/ (like we do for llama.cpp deps)
COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
+ # do not let stablediffusion rebuild (requires an older version of absl)
+ COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
# Change the shell to bash so we can use [[ tests below
SHELL ["/bin/bash", "-c"]
# We try to strike a balance between individual layer size (as that affects total push time) and total image size
# Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
# Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
- RUN if [[ ( "${IMAGE_TYPE}" == "extras ")]]; then \
-     apt-get -qq -y install espeak-ng \
- ; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
    make -C backend/python/coqui \
; fi && \
-   if [[ ( "${EXTRA_BACKENDS}" =~ "faster-whisper" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
-     make -C backend/python/faster-whisper \
+   if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+     make -C backend/python/parler-tts \
; fi && \
    if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
    make -C backend/python/diffusers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/transformers-musicgen \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/exllama \
; fi ; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/kokoro \ make -C backend/python/vall-e-x \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/openvoice \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "petals" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/petals \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/sentencetransformers \
; fi && \ ; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/exllama2 \ make -C backend/python/exllama2 \
@@ -438,6 +479,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE
; fi && \ ; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \ if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/rerankers \ make -C backend/python/rerankers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/mamba \
; fi ; fi
# Make sure the models directory exists # Make sure the models directory exists
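The EXTRA_BACKENDS tests above make the Python backends opt-in at image build time: an empty value builds all of them, a space-separated list builds only the named ones. A hedged example, with the image tag purely illustrative:

    # Bake only the coqui and diffusers backends into an extras image
    docker build \
      --build-arg IMAGE_TYPE=extras \
      --build-arg EXTRA_BACKENDS="coqui diffusers" \
      -t localai-extras-slim .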

Makefile

@@ -3,45 +3,42 @@ GOTEST=$(GOCMD) test
 GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
-DETECT_LIBS?=true
 
 # llama.cpp versions
-GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=6152129d05870cb38162c422c6ba80434e021e9f
+GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
+CPPLLAMA_VERSION?=9ef07800622e4c371605f9419864d15667c3558f
+
+# gpt4all version
+GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
+GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
+
+# go-rwkv version
+RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
+RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
 
 # whisper.cpp version
-WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
-WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
+WHISPER_CPP_VERSION?=b29b3b29240aac8b71ce8e5a4360c1f1562ad66f
+
+# bert.cpp version
+BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
 
 # go-piper version
-PIPER_REPO?=https://github.com/mudler/go-piper
-PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
+PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
 
-# bark.cpp
-BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
-BARKCPP_VERSION?=v1.0.0
+# stablediffusion version
+STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
 
-# stablediffusion.cpp (ggml)
-STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
-
-ONNX_VERSION?=1.20.0
-ONNX_ARCH?=x64
-ONNX_OS?=linux
+# tinydream version
+TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
 
 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
 export CMAKE_ARGS?=
-export BACKEND_LIBS?=
 
 CGO_LDFLAGS?=
 CGO_LDFLAGS_WHISPER?=
-CGO_LDFLAGS_WHISPER+=-lggml
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=
-NATIVE?=false
 
 TEST_DIR=/tmp/test
@@ -51,9 +48,9 @@ RANDOM := $(shell bash -c 'echo $$RANDOM')
 VERSION?=$(shell git describe --always --tags || echo "dev" )
 # go tool nm ./local-ai | grep Commit
-LD_FLAGS?=-s -w
-override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Version=$(VERSION)"
-override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"
+LD_FLAGS?=
+override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Version=$(VERSION)"
+override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"
 
 OPTIONAL_TARGETS?=
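Both sides stamp the version and commit into the binary through -X linker flags; the comment above already hints at how to verify it. A quick check, assuming a completed build:

    make build
    # Inspect the symbols the linker injected
    go tool nm ./local-ai | grep internal.Version
    go tool nm ./local-ai | grep internal.Commit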
@@ -65,14 +62,6 @@ WHITE := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)
 
-UPX?=
-# check if upx exists
-ifeq (, $(shell which upx))
-	UPX=
-else
-	UPX=$(shell which upx)
-endif
 
 # Default Docker bridge IP
 E2E_BRIDGE_IP?=172.17.0.1
@@ -80,25 +69,7 @@ ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
 
-# IF native is false, we add -DGGML_NATIVE=OFF to CMAKE_ARGS
-ifeq ($(NATIVE),false)
-	CMAKE_ARGS+=-DGGML_NATIVE=OFF
-endif
-
-# Detect if we are running on arm64
-ifneq (,$(findstring aarch64,$(shell uname -m)))
-	ONNX_ARCH=aarch64
-endif
-
 ifeq ($(OS),Darwin)
-	ONNX_OS=osx
-	ifneq (,$(findstring aarch64,$(shell uname -m)))
-		ONNX_ARCH=arm64
-	else ifneq (,$(findstring arm64,$(shell uname -m)))
-		ONNX_ARCH=arm64
-	else
-		ONNX_ARCH=x86_64
-	endif
 
 	ifeq ($(OSX_SIGNING_IDENTITY),)
 		OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
@@ -111,25 +82,24 @@ ifeq ($(OS),Darwin)
 	else ifneq ($(BUILD_TYPE),metal)
 		CMAKE_ARGS+=-DGGML_METAL=OFF
 		export GGML_NO_ACCELERATE=1
+		export GGML_NO_METAL=1
 	endif
 
 	ifeq ($(BUILD_TYPE),metal)
 #	-lcblas removed: it seems to always be listed as a duplicate flag.
 		CGO_LDFLAGS += -framework Accelerate
 	endif
-else
-	CGO_LDFLAGS_WHISPER+=-lgomp
 endif
 
 ifeq ($(BUILD_TYPE),openblas)
 	CGO_LDFLAGS+=-lopenblas
-	export GGML_OPENBLAS=1
+	export WHISPER_OPENBLAS=1
 endif
 
 ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
 	export GGML_CUDA=1
+	export WHISPER_CUDA=1
 	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
 endif
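BUILD_TYPE is the single switch that selects the acceleration path and the matching CGO flags. Typical invocations, assuming the corresponding toolchains are installed:

    make build BUILD_TYPE=openblas
    make build BUILD_TYPE=cublas CUDA_LIBPATH=/usr/local/cuda/lib64
    make build BUILD_TYPE=metal    # Apple toolchain required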
@@ -137,14 +107,6 @@ ifeq ($(BUILD_TYPE),vulkan)
 	CMAKE_ARGS+=-DGGML_VULKAN=1
 endif
 
-ifneq (,$(findstring sycl,$(BUILD_TYPE)))
-	export GGML_SYCL=1
-endif
-
-ifeq ($(BUILD_TYPE),sycl_f16)
-	export GGML_SYCL_F16=1
-endif
-
 ifeq ($(BUILD_TYPE),hipblas)
 	ROCM_HOME ?= /opt/rocm
 	ROCM_PATH ?= /opt/rocm
@@ -153,26 +115,37 @@ ifeq ($(BUILD_TYPE),hipblas)
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
-	export GGML_HIP=1
+	export WHISPER_HIPBLAS=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif
 
 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
 	export GGML_METAL=1
+	export WHISPER_METAL=1
 endif
 
 ifeq ($(BUILD_TYPE),clblas)
 	CGO_LDFLAGS+=-lOpenCL -lclblast
-	export GGML_OPENBLAS=1
+	export WHISPER_CLBLAST=1
 endif
 
 # glibc-static or glibc-devel-static required
 ifeq ($(STATIC),true)
-	LD_FLAGS+=-linkmode external -extldflags -static
+	LD_FLAGS=-linkmode external -extldflags -static
+endif
+
+ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
+#	OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
+	OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
+endif
+
+ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
+#	OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
+	OPTIONAL_GRPC+=backend-assets/grpc/tinydream
 endif
 
 ifeq ($(findstring tts,$(GO_TAGS)),tts)
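On the old side the image backends are compiled in only when the matching Go build tag is present, which is what populates OPTIONAL_GRPC above. A minimal sketch:

    # Opt into the tag-gated backends at build time
    GO_TAGS="stablediffusion tinydream tts" make build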
@@ -184,27 +157,18 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
 endif
 
 ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
+ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
+ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
+ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
-ifeq ($(ONNX_OS),linux)
-	ifeq ($(ONNX_ARCH),x64)
-		ALL_GRPC_BACKENDS+=backend-assets/grpc/bark-cpp
-		ALL_GRPC_BACKENDS+=backend-assets/grpc/stablediffusion-ggml
-	endif
-endif
 ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
-ALL_GRPC_BACKENDS+=backend-assets/grpc/silero-vad
 ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
-# Use filter-out to remove the specified backends
-ALL_GRPC_BACKENDS := $(filter-out $(SKIP_GRPC_BACKEND),$(ALL_GRPC_BACKENDS))
 
 GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
 TEST_PATHS?=./api/... ./pkg/... ./core/...
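GRPC_BACKENDS can always be overridden wholesale to build a subset, and the new side additionally subtracts entries with $(filter-out ...) via SKIP_GRPC_BACKEND. Both usages sketched:

    # Build only two backends
    make GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2 backend-assets/grpc/whisper" build
    # New side only: build everything except whisper
    SKIP_GRPC_BACKEND="backend-assets/grpc/whisper" make build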
@@ -222,106 +186,92 @@ endif
 all: help
 
+## BERT embeddings
+sources/go-bert.cpp:
+	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert.cpp
+	cd sources/go-bert.cpp && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
+
+sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
+	$(MAKE) -C sources/go-bert.cpp libgobert.a
+
 ## go-llama.cpp
 sources/go-llama.cpp:
-	mkdir -p sources/go-llama.cpp
-	cd sources/go-llama.cpp && \
-	git init && \
-	git remote add origin $(GOLLAMA_REPO) && \
-	git fetch origin && \
-	git checkout $(GOLLAMA_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama.cpp
+	cd sources/go-llama.cpp && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
 
 sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
 	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 
-## bark.cpp
-sources/bark.cpp:
-	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
-	cd sources/bark.cpp && \
-	git checkout $(BARKCPP_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-sources/bark.cpp/build/libbark.a: sources/bark.cpp
-	cd sources/bark.cpp && \
-	mkdir -p build && \
-	cd build && \
-	cmake $(CMAKE_ARGS) .. && \
-	cmake --build . --config Release
-
-backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a
-	$(MAKE) -C backend/go/bark libbark.a
-
 ## go-piper
 sources/go-piper:
-	mkdir -p sources/go-piper
-	cd sources/go-piper && \
-	git init && \
-	git remote add origin $(PIPER_REPO) && \
-	git fetch origin && \
-	git checkout $(PIPER_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
+	git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
+	cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
 
 sources/go-piper/libpiper_binding.a: sources/go-piper
 	$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
 
-## stablediffusion (ggml)
-sources/stablediffusion-ggml.cpp:
-	git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
-	cd sources/stablediffusion-ggml.cpp && \
-	git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
-
-backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp
-	$(MAKE) -C backend/go/image/stablediffusion-ggml build/libstable-diffusion.a
-	$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
-
-backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/stablediffusion-ggml
-endif
-
-sources/onnxruntime:
-	mkdir -p sources/onnxruntime
-	curl -L https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
-	cd sources/onnxruntime && tar -xvf onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz && rm onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
-	cd sources/onnxruntime && mv onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION)/* ./
-
-backend-assets/lib/libonnxruntime.so.1: backend-assets/lib sources/onnxruntime
-	cp -rfv sources/onnxruntime/lib/* backend-assets/lib/
-ifeq ($(OS),Darwin)
-	mv backend-assets/lib/libonnxruntime.$(ONNX_VERSION).dylib backend-assets/lib/libonnxruntime.dylib
-else
-	mv backend-assets/lib/libonnxruntime.so.$(ONNX_VERSION) backend-assets/lib/libonnxruntime.so.1
-endif
+## GPT4ALL
+sources/gpt4all:
+	git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
+	cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
+
+sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
+	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
+
+## RWKV
+sources/go-rwkv.cpp:
+	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv.cpp
+	cd sources/go-rwkv.cpp && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
+
+sources/go-rwkv.cpp/librwkv.a: sources/go-rwkv.cpp
+	cd sources/go-rwkv.cpp && cd rwkv.cpp && cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF && cmake --build . && cp librwkv.a ..
+
+## stable diffusion
+sources/go-stable-diffusion:
+	git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
+	cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
+
+sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
+	CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
+
+## tiny-dream
+sources/go-tiny-dream:
+	git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
+	cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1
+
+sources/go-tiny-dream/libtinydream.a: sources/go-tiny-dream
+	$(MAKE) -C sources/go-tiny-dream libtinydream.a
 
 ## whisper
 sources/whisper.cpp:
-	mkdir -p sources/whisper.cpp
-	cd sources/whisper.cpp && \
-	git init && \
-	git remote add origin $(WHISPER_REPO) && \
-	git fetch origin && \
-	git checkout $(WHISPER_CPP_VERSION) && \
-	git submodule update --init --recursive --depth 1 --single-branch
+	git clone https://github.com/ggerganov/whisper.cpp sources/whisper.cpp
+	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
 
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
-	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
+	cd sources/whisper.cpp && $(MAKE) libwhisper.a
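The base side swaps `git clone && git checkout -b` for an init/fetch/checkout sequence, which pins an exact commit without pulling the whole default branch. The generic shape of the pattern, with a hypothetical repository and placeholder SHA:

    mkdir -p sources/example && cd sources/example
    git init
    git remote add origin https://github.com/example/example.git   # hypothetical repo
    git fetch origin
    git checkout <commit-sha>
    git submodule update --init --recursive --depth 1 --single-branch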
-get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream
 
 replace:
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
+	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
+	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
+	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 
 dropreplace:
+	$(GOCMD) mod edit -dropreplace github.com/donomii/go-rwkv.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
+	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
+	$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
+	$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
+	$(GOCMD) mod edit -dropreplace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 
 prepare-sources: get-sources replace
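The replace/dropreplace targets simply toggle go.mod redirections so the Go bindings resolve to the vendored checkouts under sources/. The whisper.cpp entry as a worked example:

    go mod edit -replace github.com/ggerganov/whisper.cpp=$PWD/sources/whisper.cpp
    grep replace go.mod    # confirm the redirection
    go mod edit -dropreplace github.com/ggerganov/whisper.cpp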
@@ -331,8 +281,13 @@ prepare-sources: get-sources replace
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C sources/go-llama.cpp clean
+	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
+	$(MAKE) -C sources/go-rwkv.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
+	$(MAKE) -C sources/go-stable-diffusion clean
+	$(MAKE) -C sources/go-bert.cpp clean
 	$(MAKE) -C sources/go-piper clean
+	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build
 
 prepare: prepare-sources $(OPTIONAL_TARGETS)
@@ -345,9 +300,7 @@ clean: ## Remove build related file
 	rm -rf release/
 	rm -rf backend-assets/*
 	$(MAKE) -C backend/cpp/grpc clean
-	$(MAKE) -C backend/go/bark clean
 	$(MAKE) -C backend/cpp/llama clean
-	$(MAKE) -C backend/go/image/stablediffusion-ggml clean
 	rm -rf backend/cpp/llama-* || true
 	$(MAKE) dropreplace
 	$(MAKE) protogen-clean
@@ -358,49 +311,40 @@ clean-tests:
 	rm -rf test-dir
 	rm -rf core/http/backend-assets
 
-clean-dc: clean
-	cp -r /build/backend-assets /workspace/backend-assets
-
 ## Build:
 
 build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
 	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
 	$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
-	$(info ${GREEN}I UPX: ${YELLOW}$(UPX)${RESET})
 ifneq ($(BACKEND_LIBS),)
 	$(MAKE) backend-assets/lib
-	cp -f $(BACKEND_LIBS) backend-assets/lib/
+	cp $(BACKEND_LIBS) backend-assets/lib/
 endif
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
 
 build-minimal:
-	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=p2p $(MAKE) build
+	BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build
 
 build-api:
-	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=p2p $(MAKE) build
+	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
 
 backend-assets/lib:
 	mkdir -p backend-assets/lib
 
 dist:
 	$(MAKE) backend-assets/grpc/llama-cpp-avx2
-ifeq ($(DETECT_LIBS),true)
-	scripts/prepare-libs.sh backend-assets/grpc/llama-cpp-avx2
-endif
 ifeq ($(OS),Darwin)
-	BUILD_TYPE=none $(MAKE) backend-assets/grpc/llama-cpp-fallback
+	$(info ${GREEN}I Skip CUDA/hipblas build on MacOS${RESET})
 else
+ifneq ($(ARCH),arm64)
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
 	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
 	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
 	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
+endif
 endif
-	GO_TAGS="tts p2p" $(MAKE) build
-ifeq ($(DETECT_LIBS),true)
-	scripts/prepare-libs.sh backend-assets/grpc/piper
-endif
-	GO_TAGS="tts p2p" STATIC=true $(MAKE) build
+	STATIC=true $(MAKE) build
 	mkdir -p release
 # if BUILD_ID is empty, then we don't append it to the binary name
 ifeq ($(BUILD_ID),)
@@ -412,7 +356,7 @@ else
 endif
 
 dist-cross-linux-arm64:
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" GO_TAGS="p2p" \
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
 	STATIC=true $(MAKE) build
 	mkdir -p release
 # if BUILD_ID is empty, then we don't append it to the binary name
@@ -438,6 +382,8 @@ test-models/testmodel.ggml:
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
+	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
+	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
 	cp tests/models_fixtures/* test-models
 
 prepare-test: grpcs
@@ -446,10 +392,11 @@ prepare-test: grpcs
 test: prepare test-models/testmodel.ggml grpcs
 	@echo 'Running tests'
-	export GO_TAGS="tts debug"
+	export GO_TAGS="tts stablediffusion debug"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(MAKE) test-gpt4all
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
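Each suite can also be run on its own; the label filter is what scopes ginkgo to a subset. A sketch, with TEST_FLAKES only meaningful on the side that defines it:

    make test-llama TEST_FLAKES=1
    # or drive ginkgo directly with a custom label filter
    go run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" -v -r ./core/...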
@@ -459,46 +406,50 @@ prepare-e2e:
 	mkdir -p $(TEST_DIR)
 	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
 	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
-	docker build --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=0 --build-arg FFMPEG=true -t localai-tests .
+	docker build --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=5 --build-arg FFMPEG=true -t localai-tests .
 
 run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
 	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
 
-run-e2e-aio: protogen-go
+run-e2e-aio:
 	@echo 'Running e2e AIO tests'
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
 
 test-e2e:
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
 	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
 
 teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 
+test-gpt4all: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)
+
 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
 
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
 
 test-tts: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
 
 test-stablediffusion: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
 
 test-stores: backend-assets/grpc/local-store
 	mkdir -p tests/integration/backend-assets/grpc
 	cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
 
 test-container:
 	docker build --target requirements -t local-ai-test-container .
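The e2e targets compose into one flow; a GPU host and Docker are assumed:

    make prepare-e2e      # fetch the fixture model and build the test image
    make run-e2e-image    # start the container on port 5390
    make test-e2e
    make teardown-e2e     # stop the container and remove the test dir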
@@ -534,10 +485,10 @@ protogen-go-clean:
 	$(RM) bin/*
 
 .PHONY: protogen-python
-protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
+protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
 
 .PHONY: protogen-python-clean
-protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
+protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
 
 .PHONY: autogptq-protogen
 autogptq-protogen:
@@ -571,13 +522,13 @@ diffusers-protogen:
 diffusers-protogen-clean:
 	$(MAKE) -C backend/python/diffusers protogen-clean
 
-.PHONY: faster-whisper-protogen
-faster-whisper-protogen:
-	$(MAKE) -C backend/python/faster-whisper protogen
+.PHONY: exllama-protogen
+exllama-protogen:
+	$(MAKE) -C backend/python/exllama protogen
 
-.PHONY: faster-whisper-protogen-clean
-faster-whisper-protogen-clean:
-	$(MAKE) -C backend/python/faster-whisper protogen-clean
+.PHONY: exllama-protogen-clean
+exllama-protogen-clean:
+	$(MAKE) -C backend/python/exllama protogen-clean
 
 .PHONY: exllama2-protogen
 exllama2-protogen:
@@ -587,6 +538,22 @@ exllama2-protogen:
 exllama2-protogen-clean:
 	$(MAKE) -C backend/python/exllama2 protogen-clean
 
+.PHONY: mamba-protogen
+mamba-protogen:
+	$(MAKE) -C backend/python/mamba protogen
+
+.PHONY: mamba-protogen-clean
+mamba-protogen-clean:
+	$(MAKE) -C backend/python/mamba protogen-clean
+
+.PHONY: petals-protogen
+petals-protogen:
+	$(MAKE) -C backend/python/petals protogen
+
+.PHONY: petals-protogen-clean
+petals-protogen-clean:
+	$(MAKE) -C backend/python/petals protogen-clean
+
 .PHONY: rerankers-protogen
 rerankers-protogen:
 	$(MAKE) -C backend/python/rerankers protogen
@@ -595,6 +562,14 @@ rerankers-protogen:
 rerankers-protogen-clean:
 	$(MAKE) -C backend/python/rerankers protogen-clean
 
+.PHONY: sentencetransformers-protogen
+sentencetransformers-protogen:
+	$(MAKE) -C backend/python/sentencetransformers protogen
+
+.PHONY: sentencetransformers-protogen-clean
+sentencetransformers-protogen-clean:
+	$(MAKE) -C backend/python/sentencetransformers protogen-clean
+
 .PHONY: transformers-protogen
 transformers-protogen:
 	$(MAKE) -C backend/python/transformers protogen
@@ -603,13 +578,37 @@ transformers-protogen:
 transformers-protogen-clean:
 	$(MAKE) -C backend/python/transformers protogen-clean
 
-.PHONY: kokoro-protogen
-kokoro-protogen:
-	$(MAKE) -C backend/python/kokoro protogen
+.PHONY: parler-tts-protogen
+parler-tts-protogen:
+	$(MAKE) -C backend/python/parler-tts protogen
 
-.PHONY: kokoro-protogen-clean
-kokoro-protogen-clean:
-	$(MAKE) -C backend/python/kokoro protogen-clean
+.PHONY: parler-tts-protogen-clean
+parler-tts-protogen-clean:
+	$(MAKE) -C backend/python/parler-tts protogen-clean
+
+.PHONY: transformers-musicgen-protogen
+transformers-musicgen-protogen:
+	$(MAKE) -C backend/python/transformers-musicgen protogen
+
+.PHONY: transformers-musicgen-protogen-clean
+transformers-musicgen-protogen-clean:
+	$(MAKE) -C backend/python/transformers-musicgen protogen-clean
+
+.PHONY: vall-e-x-protogen
+vall-e-x-protogen:
+	$(MAKE) -C backend/python/vall-e-x protogen
+
+.PHONY: vall-e-x-protogen-clean
+vall-e-x-protogen-clean:
+	$(MAKE) -C backend/python/vall-e-x protogen-clean
+
+.PHONY: openvoice-protogen
+openvoice-protogen:
+	$(MAKE) -C backend/python/openvoice protogen
+
+.PHONY: openvoice-protogen-clean
+openvoice-protogen-clean:
+	$(MAKE) -C backend/python/openvoice protogen-clean
 
 .PHONY: vllm-protogen
 vllm-protogen:
@@ -626,11 +625,17 @@ prepare-extra-conda-environments: protogen-python
 	$(MAKE) -C backend/python/bark
 	$(MAKE) -C backend/python/coqui
 	$(MAKE) -C backend/python/diffusers
-	$(MAKE) -C backend/python/faster-whisper
 	$(MAKE) -C backend/python/vllm
+	$(MAKE) -C backend/python/mamba
+	$(MAKE) -C backend/python/sentencetransformers
 	$(MAKE) -C backend/python/rerankers
 	$(MAKE) -C backend/python/transformers
-	$(MAKE) -C backend/python/kokoro
+	$(MAKE) -C backend/python/transformers-musicgen
+	$(MAKE) -C backend/python/parler-tts
+	$(MAKE) -C backend/python/vall-e-x
+	$(MAKE) -C backend/python/openvoice
+	$(MAKE) -C backend/python/exllama
+	$(MAKE) -C backend/python/petals
 	$(MAKE) -C backend/python/exllama2
 
 prepare-test-extra: protogen-python
@@ -651,14 +656,25 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
 	mkdir -p backend-assets/espeak-ng-data
 	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
 
+backend-assets/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
+	mkdir -p backend-assets/gpt4all
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
+	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
+
 backend-assets/grpc: protogen-go replace
 	mkdir -p backend-assets/grpc
 
+backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
+
+backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
+
 backend-assets/grpc/huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/huggingface
-endif
 
 backend/cpp/llama/llama.cpp:
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
@@ -686,70 +702,67 @@ else
 endif
 
 # This target is for manually building a variant with auto-detected flags
-backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/llama.cpp
+backend-assets/grpc/llama-cpp: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-cpp
 	$(MAKE) -C backend/cpp/llama-cpp purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
 	$(MAKE) VARIANT="llama-cpp" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cpp/grpc-server backend-assets/grpc/llama-cpp
 
-backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc backend/cpp/llama/llama.cpp
+backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-avx2
 	$(MAKE) -C backend/cpp/llama-avx2 purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 
-backend-assets/grpc/llama-cpp-avx512: backend-assets/grpc backend/cpp/llama/llama.cpp
-	cp -rf backend/cpp/llama backend/cpp/llama-avx512
-	$(MAKE) -C backend/cpp/llama-avx512 purge
-	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx512" build-llama-cpp-grpc-server
-	cp -rfv backend/cpp/llama-avx512/grpc-server backend-assets/grpc/llama-cpp-avx512
-
-backend-assets/grpc/llama-cpp-avx: backend-assets/grpc backend/cpp/llama/llama.cpp
+backend-assets/grpc/llama-cpp-avx: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
 	$(info ${GREEN}I llama-cpp build info:avx${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx
 
-backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/llama.cpp
+backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-fallback
 	$(MAKE) -C backend/cpp/llama-fallback purge
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
-# TODO: every binary should have its own folder instead, so can have different metal implementations
-ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/
-endif
 
-backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
+backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-cuda
 	$(MAKE) -C backend/cpp/llama-cuda purge
 	$(info ${GREEN}I llama-cpp build info:cuda${RESET})
 	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
 
-backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc backend/cpp/llama/llama.cpp
+backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-hipblas
 	$(MAKE) -C backend/cpp/llama-hipblas purge
 	$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
+	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
 
-backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc backend/cpp/llama/llama.cpp
+backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
 	$(MAKE) -C backend/cpp/llama-sycl_f16 purge
 	$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
 	BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
 
-backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc backend/cpp/llama/llama.cpp
+backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
 	$(MAKE) -C backend/cpp/llama-sycl_f32 purge
 	$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
 	BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
 
-backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc backend/cpp/llama/llama.cpp
+backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
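Because every variant is its own target that copies backend/cpp/llama into a variant directory before configuring it, a single flavour can be rebuilt in isolation:

    make backend-assets/grpc/llama-cpp-avx2
    BUILD_TYPE=hipblas make backend-assets/grpc/llama-cpp-hipblas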
@@ -763,43 +776,29 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \ CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/llama-ggml
endif
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/bark-cpp
endif
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \ CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/piper
endif
backend-assets/grpc/silero-vad: backend-assets/grpc backend-assets/lib/libonnxruntime.so.1 backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/onnxruntime/include/" LIBRARY_PATH=$(CURDIR)/backend-assets/lib \ CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/silero-vad ./backend/go/vad/silero $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/silero-vad backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
endif CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \ CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/whisper $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/whisper
endif
backend-assets/grpc/local-store: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/local-store ./backend/go/stores/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/local-store
endif
grpcs: prepare $(GRPC_BACKENDS)
@@ -841,7 +840,7 @@ docker-aio-all:
docker-image-intel:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \ --build-arg BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
@@ -849,7 +848,7 @@ docker-image-intel:
docker-image-intel-xpu:
docker build \
--build-arg BASE_IMAGE=intel/oneapi-basekit:2025.0.0-0-devel-ubuntu22.04 \ --build-arg BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 \
--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
--build-arg GO_TAGS="none" \
--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \

View File
@@ -38,13 +38,9 @@
</a>
</p>
<p align="center">
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
>
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) > [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
@@ -60,70 +56,47 @@ curl https://localai.io/install.sh | sh
Or run with docker:

```bash
# CPU only image:
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-cpu
# Nvidia GPU:
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
# CPU and GPU image (bigger size):
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
# AIO images (it will pre-download a set of models ready for use, see https://localai.io/basics/container/)
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
``` # Alternative images:
# - if you have an Nvidia GPU:
To load models: # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
# - without preconfigured models
```bash # docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
# From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io) # - without preconfigured models for Nvidia GPUs
local-ai run llama-3.2-1b-instruct:q4_k_m # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
# Start LocalAI with the phi-2 model directly from huggingface
local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
# Install and run a model from the Ollama OCI registry
local-ai run ollama://gemma:2b
# Run a model from a configuration file
local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
# Install and run a model from a standard OCI registry (e.g., Docker Hub)
local-ai run oci://localai/phi-2:latest
```

[💻 Getting started](https://localai.io/basics/getting_started/index.html)
## 📰 Latest project news ## 🔥🔥 Hot topics / Roadmap
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
- Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
- Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723. P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- May 2024: 🔥🔥 Decentralized P2P llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
- May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) - 🆕 You can browse now the model gallery without LocalAI! Check out https://models.localai.io
- 🔥🔥 Decentralized llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
- 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
- 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
- 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
- Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- Reranker API: https://github.com/mudler/LocalAI/pull/2121
## 🔥🔥 Hot topics (looking for help): Hot topics (looking for contributors):
- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
- Vulkan: https://github.com/mudler/LocalAI/issues/1647
- Anthropic API: https://github.com/mudler/LocalAI/issues/1808
If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

## 🚀 [Features](https://localai.io/features/)
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table)) - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation](https://localai.io/features/image-generation) - 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/)
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
@@ -131,8 +104,6 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- 🔊 Voice activity detection (Silero-VAD support)
- 🌍 Integrated WebUI!
## 💻 Usage
@@ -154,7 +125,6 @@ Model galleries
Other:
- Helm chart https://github.com/go-skynet/helm-charts
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
- Terminal utility https://github.com/djcopley/ShellOracle
- Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
@@ -162,10 +132,6 @@ Other:
- Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
- Shell-Pilot (interact with LLMs using LocalAI models via pure shell scripts on your Linux or macOS system) https://github.com/reid41/shell-pilot
- Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
- Another Telegram Bot https://github.com/JackBekket/Hellper
- Auto-documentation https://github.com/JackBekket/Reflexia
- Github bot which answer on issues, with code and documentation as context https://github.com/JackBekket/GitHelper
- Github Actions: https://github.com/marketplace/actions/start-localai
- Examples: https://github.com/mudler/LocalAI/tree/master/examples/
@@ -179,7 +145,6 @@ Other:
## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)
- [Run Visual studio code with LocalAI (SUSE)](https://www.suse.com/c/running-ai-locally/)
- 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/)
- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
- [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
@@ -239,6 +204,7 @@ LocalAI couldn't have been built without the help of great software already avai
- https://github.com/antimatter15/alpaca.cpp
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/ggerganov/whisper.cpp
- https://github.com/saharNooby/rwkv.cpp
- https://github.com/rhasspy/piper

## 🤗 Contributors

View File
@@ -1,7 +1,7 @@
name: text-embedding-ada-002
embeddings: true backend: bert-embeddings
parameters:
model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
usage: |
You can test this model with curl like this:

View File
@@ -1,17 +1,56 @@
name: stablediffusion
backend: stablediffusion-ggml backend: stablediffusion
cfg_scale: 4.5
options:
- sampler:euler
parameters:
model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf model: stablediffusion_assets
step: 25
license: "BSD-3"
urls:
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
description: |
Stable Diffusion in NCNN with c++, supported txt2img and img2img
download_files:
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf" - filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f" sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf" uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
- filename: "stablediffusion_assets/log_sigmas.bin"
sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
- filename: "stablediffusion_assets/vocab.txt"
sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
usage: |
curl http://localhost:8080/v1/images/generations \

View File
@@ -2,7 +2,7 @@ backend: llama-cpp
context_size: 4096
f16: true
mmap: true
name: gpt-4o name: gpt-4-vision-preview
roles:
user: "USER:"

View File
@@ -2,7 +2,7 @@ backend: llama-cpp
context_size: 4096
f16: true
mmap: true
name: gpt-4o name: gpt-4-vision-preview
roles:
user: "USER:"

View File
@@ -1,6 +1,6 @@
name: stablediffusion
parameters:
model: Lykon/dreamshaper-8 model: runwayml/stable-diffusion-v1-5
backend: diffusers
step: 25
f16: true

View File
@@ -2,7 +2,7 @@ backend: llama-cpp
context_size: 4096
mmap: false
f16: false
name: gpt-4o name: gpt-4-vision-preview
roles:
user: "USER:"

View File
@@ -16,7 +16,6 @@ service Backend {
rpc GenerateImage(GenerateImageRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc TTS(TTSRequest) returns (Result) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
rpc Status(HealthMessage) returns (StatusResponse) {}
@@ -26,21 +25,6 @@ service Backend {
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
rpc Rerank(RerankRequest) returns (RerankResult) {}
rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
rpc VAD(VADRequest) returns (VADResponse) {}
}
// Define the empty request
message MetricsRequest {}
message MetricsResponse {
int32 slot_id = 1;
string prompt_json_for_slot = 2; // Stores the prompt as a JSON string.
float tokens_per_second = 3;
int32 tokens_generated = 4;
int32 prompt_tokens_processed = 5;
}
message RerankRequest {
@@ -149,9 +133,6 @@ message PredictOptions {
repeated string Images = 42;
bool UseTokenizerTemplate = 43;
repeated Message Messages = 44;
repeated string Videos = 45;
repeated string Audios = 46;
string CorrelationId = 47;
}
// The response message containing the result
@@ -159,8 +140,6 @@ message Reply {
bytes message = 1;
int32 tokens = 2;
int32 prompt_tokens = 3;
double timing_prompt_processing = 4;
double timing_token_generation = 5;
}
message ModelOptions {
@@ -223,7 +202,6 @@ message ModelOptions {
int32 SwapSpace = 53;
int32 MaxModelLen = 54;
int32 TensorParallelSize = 55;
string LoadFormat = 58;
string MMProj = 41;
@@ -237,16 +215,6 @@ message ModelOptions {
bool FlashAttention = 56;
bool NoKVOffload = 57;
string ModelPath = 59;
repeated string LoraAdapters = 60;
repeated float LoraScales = 61;
repeated string Options = 62;
string CacheTypeKey = 63;
string CacheTypeValue = 64;
}
message Result {
@@ -302,30 +270,6 @@ message TTSRequest {
optional string language = 5;
}
message VADRequest {
repeated float audio = 1;
}
message VADSegment {
float start = 1;
float end = 2;
}
message VADResponse {
repeated VADSegment segments = 1;
}
message SoundGenerationRequest {
string text = 1;
string model = 2;
string dst = 3;
optional float duration = 4;
optional float temperature = 5;
optional bool sample = 6;
optional string src = 7;
optional int32 src_divisor = 8;
}
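For context on the RPCs being removed in this hunk, here is a minimal client-side sketch of driving the SoundGeneration call through the generated Go stubs. The stub package path, the generated `NewBackendClient` constructor name, and all literal values are assumptions for illustration, not code from this repository:

```go
package main

import (
	"context"
	"fmt"
	"log"

	pb "github.com/mudler/LocalAI/pkg/grpc/proto" // assumed stub location
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func main() {
	// Dial a per-model backend server (address illustrative).
	conn, err := grpc.Dial("localhost:50051",
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	client := pb.NewBackendClient(conn)
	dur := float32(8)
	// Field names mirror the SoundGenerationRequest message above.
	res, err := client.SoundGeneration(context.Background(), &pb.SoundGenerationRequest{
		Text:     "an upbeat synth jingle",
		Model:    "musicgen", // model name as configured on the backend
		Dst:      "/tmp/out.wav",
		Duration: &dur,
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(res) // Result message; its fields come from the full proto
}
```

The VAD RPC above follows the same pattern: send the raw samples as `repeated float audio` in a VADRequest and read the start/end pairs back from the VADResponse segments.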
message TokenizationResponse {
int32 length = 1;
repeated int32 tokens = 2;

View File
@@ -46,13 +46,8 @@ endif
$(INSTALLED_PACKAGES): grpc_build
$(GRPC_REPO):
mkdir -p $(GRPC_REPO)/grpc
cd $(GRPC_REPO)/grpc && \
git init && \
git remote add origin $(GIT_REPO_LIB_GRPC) && \
git fetch origin && \
git checkout $(TAG_LIB_GRPC) && \
git submodule update --init --recursive --depth 1 --single-branch
git clone --depth $(GIT_CLONE_DEPTH) -b $(TAG_LIB_GRPC) $(GIT_REPO_LIB_GRPC) $(GRPC_REPO)/grpc
cd $(GRPC_REPO)/grpc && git submodule update --jobs 2 --init --recursive --depth $(GIT_CLONE_DEPTH)
$(GRPC_BUILD): $(GRPC_REPO)
mkdir -p $(GRPC_BUILD)

View File
@@ -1,6 +1,5 @@
LLAMA_VERSION?=
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=
BUILD_TYPE?=
@@ -22,7 +21,7 @@ else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIP=ON CMAKE_ARGS+=-DGGML_HIPBLAS=ON
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
# But if it's OSX without metal, disable it here
else ifeq ($(OS),Darwin)
@@ -30,7 +29,9 @@ else ifeq ($(OS),Darwin)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON # Until this is tested properly, we disable embedded metal file
# as we already embed it as part of the LocalAI assets
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
TARGET+=--target ggml-metal
endif
endif
@@ -44,13 +45,11 @@ ifeq ($(BUILD_TYPE),sycl_f32)
endif
llama.cpp:
mkdir -p llama.cpp
cd llama.cpp && \
git init && \
git remote add origin $(LLAMA_REPO) && \
git fetch origin && \
git checkout -b build $(LLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
if [ -z "$(LLAMA_VERSION)" ]; then \
exit 1; \
fi
cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
llama.cpp/examples/grpc-server: llama.cpp
mkdir -p llama.cpp/examples/grpc-server
@@ -72,9 +71,9 @@ clean: purge
grpc-server: llama.cpp llama.cpp/examples/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \ bash -c "source $(ONEAPI_VARS); \
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET) cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
endif
cp llama.cpp/build/bin/grpc-server .
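For reference, a typical invocation of this target might be `make BUILD_TYPE=hipblas grpc-server` (or any other BUILD_TYPE handled above); the sycl variants additionally source $(ONEAPI_VARS) inside the recipe so the Intel oneAPI toolchain is available to CMake.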
View File

File diff suppressed because it is too large

View File
@@ -1,13 +0,0 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3cd0d2fa..6c5e811a 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
View File
@@ -1,12 +1,5 @@
#!/bin/bash
## Patches
## Apply patches from the `patches` directory
for patch in $(ls patches); do
echo "Applying patch $patch"
patch -d llama.cpp/ -p1 < patches/$patch
done
cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -rfv json.hpp llama.cpp/examples/grpc-server/

View File
@@ -481,3 +481,30 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
return ret;
}
//
// random string / id
//
static std::string random_string()
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::string result(32, ' ');
for (int i = 0; i < 32; ++i) {
result[i] = str[generator() % str.size()];
}
return result;
}
static std::string gen_chatcmplid()
{
std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
}
View File
@@ -1,25 +0,0 @@
INCLUDE_PATH := $(abspath ./)
LIBRARY_PATH := $(abspath ./)
AR?=ar
BUILD_TYPE?=
# keep standard at C11 and C++11
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
# warnings
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
gobark.o:
$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
libbark.a: gobark.o
cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
$(AR) rcs libbark.a gobark.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
clean:
rm -f gobark.o libbark.a
View File
@@ -1,85 +0,0 @@
#include <iostream>
#include <tuple>
#include "bark.h"
#include "gobark.h"
#include "common.h"
#include "ggml.h"
struct bark_context *c;
void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) {
if (step == bark_encoding_step::SEMANTIC) {
printf("\rGenerating semantic tokens... %d%%", progress);
} else if (step == bark_encoding_step::COARSE) {
printf("\rGenerating coarse tokens... %d%%", progress);
} else if (step == bark_encoding_step::FINE) {
printf("\rGenerating fine tokens... %d%%", progress);
}
fflush(stdout);
}
int load_model(char *model) {
// initialize bark context
struct bark_context_params ctx_params = bark_context_default_params();
bark_params params;
params.model_path = model;
// ctx_params.verbosity = verbosity;
ctx_params.progress_callback = bark_print_progress_callback;
ctx_params.progress_callback_user_data = nullptr;
struct bark_context *bctx = bark_load_model(params.model_path.c_str(), ctx_params, params.seed);
if (!bctx) {
fprintf(stderr, "%s: Could not load model\n", __func__);
return 1;
}
c = bctx;
return 0;
}
int tts(char *text,int threads, char *dst ) {
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
// generate audio
if (!bark_generate_audio(c, text, threads)) {
fprintf(stderr, "%s: An error occurred. If the problem persists, feel free to open an issue to report it.\n", __func__);
return 1;
}
const float *audio_data = bark_get_audio_data(c);
if (audio_data == NULL) {
fprintf(stderr, "%s: Could not get audio data\n", __func__);
return 1;
}
const int audio_arr_size = bark_get_audio_data_size(c);
std::vector<float> audio_arr(audio_data, audio_data + audio_arr_size);
write_wav_on_disk(audio_arr, dst);
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
const int64_t t_load_us = bark_get_load_time(c);
const int64_t t_eval_us = bark_get_eval_time(c);
printf("\n\n");
printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us / 1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
}
return 0;
}
int unload() {
bark_free(c);
return 0;
}

View File
@@ -1,52 +0,0 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
// #include <gobark.h>
// #include <stdlib.h>
import "C"
import (
"fmt"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type Bark struct {
base.SingleThread
threads int
}
func (sd *Bark) Load(opts *pb.ModelOptions) error {
sd.threads = int(opts.Threads)
modelFile := C.CString(opts.ModelFile)
defer C.free(unsafe.Pointer(modelFile))
ret := C.load_model(modelFile)
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}
func (sd *Bark) TTS(opts *pb.TTSRequest) error {
t := C.CString(opts.Text)
defer C.free(unsafe.Pointer(t))
dst := C.CString(opts.Dst)
defer C.free(unsafe.Pointer(dst))
threads := C.int(sd.threads)
ret := C.tts(t, threads, dst)
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}
View File
@@ -1,8 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
int load_model(char *model);
int tts(char *text,int threads, char *dst );
#ifdef __cplusplus
}
#endif
View File
@@ -1,96 +0,0 @@
INCLUDE_PATH := $(abspath ./)
LIBRARY_PATH := $(abspath ./)
AR?=ar
CMAKE_ARGS?=
BUILD_TYPE?=
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
# keep standard at C11 and C++11
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DGGML_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DGGML_HIP=ON
# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
# But if it's OSX without metal, disable it here
else ifeq ($(OS),Darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
TARGET+=--target ggml-metal
endif
endif
# ifeq ($(BUILD_TYPE),sycl_f16)
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DSD_SYCL=ON -DGGML_SYCL_F16=ON
# endif
# ifeq ($(BUILD_TYPE),sycl_f32)
# CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON
# endif
# warnings
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
# Find all .a archives in ARCHIVE_DIR
# (ggml can have different backends cpu, cuda, etc., each backend generates a .a archive)
GGML_ARCHIVE_DIR := build/ggml/src/
ALL_ARCHIVES := $(shell find $(GGML_ARCHIVE_DIR) -type f -name '*.a')
# Name of the single merged library
COMBINED_LIB := libggmlall.a
# Rule to merge all the .a files into one
$(COMBINED_LIB): $(ALL_ARCHIVES)
@echo "Merging all .a into $(COMBINED_LIB)"
rm -f $@
mkdir -p merge-tmp
for a in $(ALL_ARCHIVES); do \
( cd merge-tmp && ar x ../$$a ); \
done
( cd merge-tmp && ar rcs ../$@ *.o )
# Ensure we have a proper index
ranlib $@
# Clean up
rm -rf merge-tmp
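Merging everything into one `libggmlall.a` keeps the Go link line down to a single `-lggmlall` (as in the cgo LDFLAGS of the Go wrapper further down) instead of enumerating each per-backend archive that ggml emits.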
build/libstable-diffusion.a:
@echo "Building SD with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
cmake --build . --config Release"
else
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) ../../../../../sources/stablediffusion-ggml.cpp && \
cmake --build . --config Release
endif
$(MAKE) $(COMBINED_LIB)
gosd.o:
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
libsd.a: gosd.o
cp $(INCLUDE_PATH)/build/libstable-diffusion.a ./libsd.a
$(AR) rcs libsd.a gosd.o
clean:
rm -rf gosd.o libsd.a build $(COMBINED_LIB)
View File
@@ -1,228 +0,0 @@
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#include "gosd.h"
// #include "preprocessing.hpp"
#include "flux.hpp"
#include "stable-diffusion.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_STATIC
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_IMAGE_WRITE_STATIC
#include "stb_image_write.h"
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#define STB_IMAGE_RESIZE_STATIC
#include "stb_image_resize.h"
// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
"euler_a",
"euler",
"heun",
"dpm2",
"dpm++2s_a",
"dpm++2m",
"dpm++2mv2",
"ipndm",
"ipndm_v",
"lcm",
};
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
const char* schedule_str[] = {
"default",
"discrete",
"karras",
"exponential",
"ays",
"gits",
};
sd_ctx_t* sd_c;
sample_method_t sample_method;
int load_model(char *model, char* options[], int threads, int diff) {
fprintf (stderr, "Loading model!\n");
char *stableDiffusionModel = "";
if (diff == 1 ) {
stableDiffusionModel = model;
model = "";
}
// decode options. Options are in form optname:optval, or if booleans only optname.
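// Illustrative examples (values are placeholders, not shipped files):
//   "sampler:euler", "scheduler:karras", "vae_path:ae.safetensors",
//   or the bare boolean flag "diffusion_model".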
char *clip_l_path = "";
char *clip_g_path = "";
char *t5xxl_path = "";
char *vae_path = "";
char *scheduler = "";
char *sampler = "";
// If options is not NULL, parse options
for (int i = 0; options[i] != NULL; i++) {
char *optname = strtok(options[i], ":");
char *optval = strtok(NULL, ":");
if (optval == NULL) {
optval = "true";
}
if (!strcmp(optname, "clip_l_path")) {
clip_l_path = optval;
}
if (!strcmp(optname, "clip_g_path")) {
clip_g_path = optval;
}
if (!strcmp(optname, "t5xxl_path")) {
t5xxl_path = optval;
}
if (!strcmp(optname, "vae_path")) {
vae_path = optval;
}
if (!strcmp(optname, "scheduler")) {
scheduler = optval;
}
if (!strcmp(optname, "sampler")) {
sampler = optval;
}
}
int sample_method_found = -1;
for (int m = 0; m < N_SAMPLE_METHODS; m++) {
if (!strcmp(sampler, sample_method_str[m])) {
sample_method_found = m;
}
}
if (sample_method_found == -1) {
fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
sample_method_found = EULER_A;
}
sample_method = (sample_method_t)sample_method_found;
int schedule_found = -1;
for (int d = 0; d < N_SCHEDULES; d++) {
if (!strcmp(scheduler, schedule_str[d])) {
schedule_found = d;
fprintf (stderr, "Found scheduler: %s\n", scheduler);
}
}
if (schedule_found == -1) {
fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
schedule_found = DEFAULT;
}
schedule_t schedule = (schedule_t)schedule_found;
fprintf (stderr, "Creating context\n");
sd_ctx_t* sd_ctx = new_sd_ctx(model,
clip_l_path,
clip_g_path,
t5xxl_path,
stableDiffusionModel,
vae_path,
"",
"",
"",
"",
"",
false,
false,
false,
threads,
SD_TYPE_COUNT,
STD_DEFAULT_RNG,
schedule,
false,
false,
false,
false);
if (sd_ctx == NULL) {
fprintf (stderr, "failed loading model (generic error)\n");
return 1;
}
fprintf (stderr, "Created context: OK\n");
sd_c = sd_ctx;
return 0;
}
int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) {
sd_image_t* results;
std::vector<int> skip_layers = {7, 8, 9};
fprintf (stderr, "Generating image\n");
results = txt2img(sd_c,
text,
negativeText,
-1, //clip_skip
cfg_scale, // cfg_scale
3.5f,
width,
height,
sample_method,
steps,
seed,
1,
NULL,
0.9f,
20.f,
false,
"",
skip_layers.data(),
skip_layers.size(),
0,
0.01,
0.2);
if (results == NULL) {
fprintf (stderr, "NO results\n");
return 1;
}
if (results[0].data == NULL) {
fprintf (stderr, "Results with no data\n");
return 1;
}
fprintf (stderr, "Writing PNG\n");
fprintf (stderr, "DST: %s\n", dst);
fprintf (stderr, "Width: %d\n", results[0].width);
fprintf (stderr, "Height: %d\n", results[0].height);
fprintf (stderr, "Channel: %d\n", results[0].channel);
fprintf (stderr, "Data: %p\n", results[0].data);
stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
results[0].data, 0, NULL);
fprintf (stderr, "Saved resulting image to '%s'\n", dst);
// TODO: free results. Why does it crash?
free(results[0].data);
results[0].data = NULL;
free(results);
fprintf (stderr, "gen_image is done", dst);
return 0;
}
int unload() {
free_sd_ctx(sd_c);
return 0;
}

View File
@@ -1,96 +0,0 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
// #cgo LDFLAGS: -L${SRCDIR}/ -lsd -lstdc++ -lm -lggmlall -lgomp
// #include <gosd.h>
// #include <stdlib.h>
import "C"
import (
"fmt"
"os"
"path/filepath"
"strings"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/utils"
)
type SDGGML struct {
base.SingleThread
threads int
sampleMethod string
cfgScale float32
}
func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
sd.threads = int(opts.Threads)
modelFile := C.CString(opts.ModelFile)
defer C.free(unsafe.Pointer(modelFile))
var options **C.char
// prepare the options array to pass to C
size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
length := C.size_t(len(opts.Options))
options = (**C.char)(C.malloc(length * size))
view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0:len(opts.Options):len(opts.Options)]
var diffusionModel int
var oo []string
for _, op := range opts.Options {
if op == "diffusion_model" {
diffusionModel = 1
continue
}
// If it's an option path, we resolve absolute path from the model path
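// e.g. an illustrative "vae_path:ae.safetensors" becomes "vae_path:<model path>/ae.safetensors"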
if strings.Contains(op, ":") && strings.Contains(op, "path") {
data := strings.Split(op, ":")
data[1] = filepath.Join(opts.ModelPath, data[1])
if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
oo = append(oo, strings.Join(data, ":"))
}
} else {
oo = append(oo, op)
}
}
fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
for i, x := range oo {
view[i] = C.CString(x)
}
sd.cfgScale = opts.CFGScale
ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel))
if ret != 0 {
return fmt.Errorf("could not load model")
}
return nil
}
func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
t := C.CString(opts.PositivePrompt)
defer C.free(unsafe.Pointer(t))
dst := C.CString(opts.Dst)
defer C.free(unsafe.Pointer(dst))
negative := C.CString(opts.NegativePrompt)
defer C.free(unsafe.Pointer(negative))
ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale))
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}
View File
@@ -1,8 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
int load_model(char *model, char* options[], int threads, int diffusionModel);
int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale);
#ifdef __cplusplus
}
#endif
View File
@@ -1,6 +1,7 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
@@ -14,7 +15,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &SDGGML{}); err != nil { if err := grpc.StartServer(*addr, &Image{}); err != nil {
panic(err)
}
}

View File
@@ -0,0 +1,33 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/stablediffusion"
)
type Image struct {
base.SingleThread
stablediffusion *stablediffusion.StableDiffusion
}
func (image *Image) Load(opts *pb.ModelOptions) error {
var err error
// Note: the Model here is a path to a directory containing the model files
image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
return err
}
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
return image.stablediffusion.GenerateImage(
int(opts.Height),
int(opts.Width),
int(opts.Mode),
int(opts.Step),
int(opts.Seed),
opts.PositivePrompt,
opts.NegativePrompt,
opts.Dst)
}
View File
@@ -0,0 +1,21 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Image{}); err != nil {
panic(err)
}
}
View File
@@ -0,0 +1,32 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/tinydream"
)
type Image struct {
base.SingleThread
tinydream *tinydream.TinyDream
}
func (image *Image) Load(opts *pb.ModelOptions) error {
var err error
// Note: the Model here is a path to a directory containing the model files
image.tinydream, err = tinydream.New(opts.ModelFile)
return err
}
func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
return image.tinydream.GenerateImage(
int(opts.Height),
int(opts.Width),
int(opts.Step),
int(opts.Seed),
opts.PositivePrompt,
opts.NegativePrompt,
opts.Dst)
}
View File
@@ -0,0 +1,34 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
bert "github.com/go-skynet/go-bert.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type Embeddings struct {
base.SingleThread
bert *bert.Bert
}
func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
model, err := bert.New(opts.ModelFile)
llm.bert = model
return err
}
func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.bert.TokenEmbeddings(tokens, bert.SetThreads(int(opts.Threads)))
}
return llm.bert.Embeddings(opts.Embeddings, bert.SetThreads(int(opts.Threads)))
}
View File
@@ -0,0 +1,21 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
panic(err)
}
}
View File
@@ -0,0 +1,62 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
)
type LLM struct {
base.SingleThread
gpt4all *gpt4all.Model
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
model, err := gpt4all.New(opts.ModelFile,
gpt4all.SetThreads(int(opts.Threads)),
gpt4all.SetLibrarySearchPath(opts.LibrarySearchPath))
llm.gpt4all = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []gpt4all.PredictOption {
predictOptions := []gpt4all.PredictOption{
gpt4all.SetTemperature(float64(opts.Temperature)),
gpt4all.SetTopP(float64(opts.TopP)),
gpt4all.SetTopK(int(opts.TopK)),
gpt4all.SetTokens(int(opts.Tokens)),
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, gpt4all.SetBatch(int(opts.Batch)))
}
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.gpt4all.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
go func() {
llm.gpt4all.SetTokenCallback(func(token string) bool {
results <- token
return true
})
_, err := llm.gpt4all.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
llm.gpt4all.SetTokenCallback(nil)
close(results)
}()
return nil
}
View File
@@ -15,7 +15,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &VAD{}); err != nil { if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File
@@ -6,9 +6,9 @@ import (
"fmt" "fmt"
"path/filepath" "path/filepath"
pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
"github.com/go-skynet/go-llama.cpp" "github.com/go-skynet/go-llama.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base" "github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type LLM struct {

View File
@@ -1,6 +1,7 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
@@ -14,7 +15,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Bark{}); err != nil { if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File
@@ -0,0 +1,95 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"path/filepath"
"github.com/donomii/go-rwkv.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
const tokenizerSuffix = ".tokenizer.json"
type LLM struct {
base.SingleThread
rwkv *rwkv.RwkvState
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
tokenizerFile := opts.Tokenizer
if tokenizerFile == "" {
modelFile := filepath.Base(opts.ModelFile)
tokenizerFile = modelFile + tokenizerSuffix
}
modelPath := filepath.Dir(opts.ModelFile)
tokenizerPath := filepath.Join(modelPath, tokenizerFile)
model := rwkv.LoadFiles(opts.ModelFile, tokenizerPath, uint32(opts.GetThreads()))
if model == nil {
return fmt.Errorf("rwkv could not load model")
}
llm.rwkv = model
return nil
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
stopWord := "\n"
if len(opts.StopPrompts) > 0 {
stopWord = opts.StopPrompts[0]
}
if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
return "", err
}
response := llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), nil)
return response, nil
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
go func() {
stopWord := "\n"
if len(opts.StopPrompts) > 0 {
stopWord = opts.StopPrompts[0]
}
if err := llm.rwkv.ProcessInput(opts.Prompt); err != nil {
fmt.Println("Error processing input: ", err)
return
}
llm.rwkv.GenerateResponse(int(opts.Tokens), stopWord, float32(opts.Temperature), float32(opts.TopP), func(s string) bool {
results <- s
return true
})
close(results)
}()
return nil
}
func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
tokens, err := llm.rwkv.Tokenizer.Encode(opts.Prompt)
if err != nil {
return pb.TokenizationResponse{}, err
}
l := len(tokens)
i32Tokens := make([]int32, l)
for i, t := range tokens {
i32Tokens[i] = int32(t.ID)
}
return pb.TokenizationResponse{
Length: int32(l),
Tokens: i32Tokens,
}, nil
}
View File
@@ -311,16 +311,12 @@ func (s *Store) StoresGet(opts *pb.StoresGetOptions) (pb.StoresGetResult, error)
}
func isNormalized(k []float32) bool {
var sum float64 var sum float32
for _, v := range k {
v64 := float64(v) sum += v
sum += v64*v64
}
s := math.Sqrt(sum) return sum == 1.0
return s >= 0.99 && s <= 1.01
}
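The new implementation accumulates squared components in float64 and accepts a ±1% band around unit magnitude instead of the old exact `sum == 1.0` test. A self-contained sketch of the same logic (vector values illustrative) shows why the tolerance matters:

```go
package main

import (
	"fmt"
	"math"
)

// isNormalized mirrors the fixed check above: sum squares in float64,
// take the magnitude, and allow ±1% tolerance instead of exact equality.
func isNormalized(k []float32) bool {
	var sum float64
	for _, v := range k {
		v64 := float64(v)
		sum += v64 * v64
	}
	s := math.Sqrt(sum)
	return s >= 0.99 && s <= 1.01
}

func main() {
	c := float32(1.0 / math.Sqrt(3)) // each component of a unit 3-vector
	v := []float32{c, c, c}
	// The squared sum is only approximately 1 after float32 rounding,
	// so an exact equality test would reject this normalized vector.
	fmt.Println(isNormalized(v)) // true
}
```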
// TODO: This we could replace with handwritten SIMD code
@@ -332,7 +328,7 @@ func normalizedCosineSimilarity(k1, k2 []float32) float32 {
dot += k1[i] * k2[i]
}
assert(dot >= -1.01 && dot <= 1.01, fmt.Sprintf("dot = %f", dot)) assert(dot >= -1 && dot <= 1, fmt.Sprintf("dot = %f", dot))
// 2.0 * (1.0 - dot) would be the Euclidean distance
return dot
@@ -422,7 +418,7 @@ func cosineSimilarity(k1, k2 []float32, mag1 float64) float32 {
sim := float32(dot / (mag1 * math.Sqrt(mag2)))
assert(sim >= -1.01 && sim <= 1.01, fmt.Sprintf("sim = %f", sim)) assert(sim >= -1 && sim <= 1, fmt.Sprintf("sim = %f", sim))
return sim
}

View File
@@ -0,0 +1,104 @@
package main
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
"github.com/go-audio/wav"
"github.com/mudler/LocalAI/core/schema"
)
func ffmpegCommand(args []string) (string, error) {
cmd := exec.Command("ffmpeg", args...) // Constrain this to ffmpeg to permit security scanner to see that the command is safe.
cmd.Env = os.Environ()
out, err := cmd.CombinedOutput()
return string(out), err
}
// AudioToWav converts audio to wav for transcribe.
// TODO: use https://github.com/mccoyst/ogg?
func audioToWav(src, dst string) error {
commandArgs := []string{"-i", src, "-format", "s16le", "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", dst}
out, err := ffmpegCommand(commandArgs)
if err != nil {
return fmt.Errorf("error: %w out: %s", err, out)
}
return nil
}
func Transcript(model whisper.Model, audiopath, language string, translate bool, threads uint) (schema.TranscriptionResult, error) {
res := schema.TranscriptionResult{}
dir, err := os.MkdirTemp("", "whisper")
if err != nil {
return res, err
}
defer os.RemoveAll(dir)
convertedPath := filepath.Join(dir, "converted.wav")
if err := audioToWav(audiopath, convertedPath); err != nil {
return res, err
}
// Open samples
fh, err := os.Open(convertedPath)
if err != nil {
return res, err
}
defer fh.Close()
// Read samples
d := wav.NewDecoder(fh)
buf, err := d.FullPCMBuffer()
if err != nil {
return res, err
}
data := buf.AsFloat32Buffer().Data
// Process samples
context, err := model.NewContext()
if err != nil {
return res, err
}
context.SetThreads(threads)
if language != "" {
context.SetLanguage(language)
} else {
context.SetLanguage("auto")
}
if translate {
context.SetTranslate(true)
}
if err := context.Process(data, nil, nil); err != nil {
return res, err
}
for {
s, err := context.NextSegment()
if err != nil {
break
}
var tokens []int
for _, t := range s.Tokens {
tokens = append(tokens, t.Id)
}
segment := schema.Segment{Id: s.Num, Text: s.Text, Start: s.Start, End: s.End, Tokens: tokens}
res.Segments = append(res.Segments, segment)
res.Text += s.Text
}
return res, nil
}
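A minimal caller for the helper above could look like the following sketch; it assumes the same package and imports, and the model and audio paths are placeholders, not files shipped with the project:

```go
// Hypothetical usage of Transcript, callable from this same package.
func exampleTranscribe() {
	// whisper.New comes from the bindings imported at the top of the file.
	m, err := whisper.New("models/ggml-base.en.bin") // placeholder model path
	if err != nil {
		fmt.Println(err)
		return
	}
	res, err := Transcript(m, "speech.ogg", "en", false, 4) // placeholder audio path
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(res.Text) // full transcription text; res.Segments has timings
}
```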
View File
@@ -0,0 +1,26 @@
package main
// This is a wrapper to satisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type Whisper struct {
base.SingleThread
whisper whisper.Model
}
func (sd *Whisper) Load(opts *pb.ModelOptions) error {
// Note: the Model here is a path to a directory containing the model files
w, err := whisper.New(opts.ModelFile)
sd.whisper = w
return err
}
func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
return Transcript(sd.whisper, opts.Dst, opts.Language, opts.Translate, uint(opts.Threads))
}

View File

@@ -1,105 +0,0 @@
package main

// This is a wrapper to satisfy the gRPC service interface.
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)

import (
	"os"
	"path/filepath"

	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
	"github.com/go-audio/wav"
	"github.com/mudler/LocalAI/pkg/grpc/base"
	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
	"github.com/mudler/LocalAI/pkg/utils"
)

type Whisper struct {
	base.SingleThread
	whisper whisper.Model
}

func (sd *Whisper) Load(opts *pb.ModelOptions) error {
	// Note: the Model here is a path to a directory containing the model files
	w, err := whisper.New(opts.ModelFile)
	sd.whisper = w
	return err
}

func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (pb.TranscriptResult, error) {
	dir, err := os.MkdirTemp("", "whisper")
	if err != nil {
		return pb.TranscriptResult{}, err
	}
	defer os.RemoveAll(dir)

	convertedPath := filepath.Join(dir, "converted.wav")
	if err := utils.AudioToWav(opts.Dst, convertedPath); err != nil {
		return pb.TranscriptResult{}, err
	}

	// Open samples
	fh, err := os.Open(convertedPath)
	if err != nil {
		return pb.TranscriptResult{}, err
	}
	defer fh.Close()

	// Read samples
	d := wav.NewDecoder(fh)
	buf, err := d.FullPCMBuffer()
	if err != nil {
		return pb.TranscriptResult{}, err
	}
	data := buf.AsFloat32Buffer().Data

	// Process samples
	context, err := sd.whisper.NewContext()
	if err != nil {
		return pb.TranscriptResult{}, err
	}
	context.SetThreads(uint(opts.Threads))

	if opts.Language != "" {
		context.SetLanguage(opts.Language)
	} else {
		context.SetLanguage("auto")
	}

	if opts.Translate {
		context.SetTranslate(true)
	}

	if err := context.Process(data, nil, nil); err != nil {
		return pb.TranscriptResult{}, err
	}

	segments := []*pb.TranscriptSegment{}
	text := ""
	for {
		s, err := context.NextSegment()
		if err != nil {
			break
		}

		var tokens []int32
		for _, t := range s.Tokens {
			tokens = append(tokens, int32(t.Id))
		}

		segment := &pb.TranscriptSegment{Id: int32(s.Num), Text: s.Text, Start: int64(s.Start), End: int64(s.End), Tokens: tokens}
		segments = append(segments, segment)

		text += s.Text
	}

	return pb.TranscriptResult{
		Segments: segments,
		Text:     text,
	}, nil
}

View File

@@ -1,54 +0,0 @@
package main

// This is a wrapper to satisfy the gRPC service interface.
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/grpc/base"
	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
	"github.com/streamer45/silero-vad-go/speech"
)

type VAD struct {
	base.SingleThread
	detector *speech.Detector
}

func (vad *VAD) Load(opts *pb.ModelOptions) error {
	v, err := speech.NewDetector(speech.DetectorConfig{
		ModelPath:  opts.ModelFile,
		SampleRate: 16000,
		//WindowSize:           1024,
		Threshold:            0.5,
		MinSilenceDurationMs: 0,
		SpeechPadMs:          0,
	})
	if err != nil {
		return fmt.Errorf("create silero detector: %w", err)
	}

	vad.detector = v
	return err
}

func (vad *VAD) VAD(req *pb.VADRequest) (pb.VADResponse, error) {
	audio := req.Audio

	segments, err := vad.detector.Detect(audio)
	if err != nil {
		return pb.VADResponse{}, fmt.Errorf("detect: %w", err)
	}

	vadSegments := []*pb.VADSegment{}
	for _, s := range segments {
		vadSegments = append(vadSegments, &pb.VADSegment{
			Start: float32(s.SpeechStartAt),
			End:   float32(s.SpeechEndAt),
		})
	}

	return pb.VADResponse{
		Segments: vadSegments,
	}, nil
}
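For reference, a minimal standalone sketch of the silero detector calls this wrapper was built on. The model path and input are placeholders; the config fields and the Detect call mirror the deleted code above, and Detect expects 16 kHz mono float32 PCM, matching the SampleRate setting:

package main

import (
	"fmt"
	"log"

	"github.com/streamer45/silero-vad-go/speech"
)

func main() {
	// Placeholder model path; the config mirrors the Load method above.
	det, err := speech.NewDetector(speech.DetectorConfig{
		ModelPath:            "models/silero_vad.onnx",
		SampleRate:           16000,
		Threshold:            0.5,
		MinSilenceDurationMs: 0,
		SpeechPadMs:          0,
	})
	if err != nil {
		log.Fatal(err)
	}

	// One second of silence as stand-in input; real callers pass decoded audio.
	samples := make([]float32, 16000)

	segments, err := det.Detect(samples)
	if err != nil {
		log.Fatal(err)
	}
	for _, s := range segments {
		fmt.Printf("speech from %.2fs to %.2fs\n", s.SpeechStartAt, s.SpeechEndAt)
	}
}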

View File

@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118

View File

@@ -1 +0,0 @@
torch==2.4.1

View File

@@ -1,2 +1,2 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.4.1+rocm6.0
+torch

View File

@@ -1,6 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
-torch==2.3.1+cxx11.abi
-oneccl_bind_pt==2.3.100+xpu
+intel-extension-for-pytorch
+torch
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -1,6 +1,7 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.69.0
+grpcio==1.64.0
 protobuf
+torch
 certifi
 transformers

View File

@@ -1,4 +0,0 @@
transformers
accelerate
torch==2.4.1
torchaudio==2.4.1

View File

@@ -1,5 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
torchaudio==2.4.1+cu118
transformers
accelerate

View File

@@ -1,4 +0,0 @@
torch==2.4.1
torchaudio==2.4.1
transformers
accelerate

View File

@@ -1,5 +1,3 @@
 --extra-index-url https://download.pytorch.org/whl/rocm6.0
-torch==2.4.1+rocm6.0
-torchaudio==2.4.1+rocm6.0
+torch
+torchaudio
-transformers
-accelerate

View File

@@ -1,9 +1,6 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
-torch==2.3.1+cxx11.abi
-torchaudio==2.3.1+cxx11.abi
-oneccl_bind_pt==2.3.100+xpu
+intel-extension-for-pytorch
+torch
+torchaudio
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
-transformers
-accelerate

View File

@@ -1,4 +1,6 @@
+accelerate
 bark==0.1.5
-grpcio==1.69.0
+grpcio==1.64.0
 protobuf
 certifi
+transformers

View File

@@ -17,27 +17,11 @@
 # LIMIT_TARGETS="cublas12"
 # source $(dirname $0)/../common/libbackend.sh
 #
-PYTHON_VERSION="3.10"
-
 function init() {
-    # Name of the backend (directory name)
     BACKEND_NAME=${PWD##*/}
-
-    # Path where all backends files are
     MY_DIR=$(realpath `dirname $0`)
-
-    # Build type
     BUILD_PROFILE=$(getBuildProfile)
-
-    # Environment directory
-    EDIR=${MY_DIR}
-
-    # Allow to specify a custom env dir for shared environments
-    if [ "x${ENV_DIR}" != "x" ]; then
-        EDIR=${ENV_DIR}
-    fi
-
     # If a backend has defined a list of valid build profiles...
     if [ ! -z "${LIMIT_TARGETS}" ]; then
         isValidTarget=$(checkTargets ${LIMIT_TARGETS})
@@ -90,14 +74,13 @@ function getBuildProfile() {
 # This function is idempotent, so you can call it as many times as you want and it will
 # always result in an activated virtual environment
 function ensureVenv() {
-    if [ ! -d "${EDIR}/venv" ]; then
-        uv venv --python ${PYTHON_VERSION} ${EDIR}/venv
+    if [ ! -d "${MY_DIR}/venv" ]; then
+        uv venv ${MY_DIR}/venv
         echo "virtualenv created"
     fi
 
-    # Source if we are not already in a Virtual env
-    if [ "x${VIRTUAL_ENV}" != "x${EDIR}/venv" ]; then
-        source ${EDIR}/venv/bin/activate
+    if [ "x${VIRTUAL_ENV}" != "x${MY_DIR}/venv" ]; then
+        source ${MY_DIR}/venv/bin/activate
         echo "virtualenv activated"
     fi
@@ -130,24 +113,13 @@ function installRequirements() {
     # These are the requirements files we will attempt to install, in order
     declare -a requirementFiles=(
-        "${EDIR}/requirements-install.txt"
-        "${EDIR}/requirements.txt"
-        "${EDIR}/requirements-${BUILD_TYPE}.txt"
+        "${MY_DIR}/requirements-install.txt"
+        "${MY_DIR}/requirements.txt"
+        "${MY_DIR}/requirements-${BUILD_TYPE}.txt"
     )
 
     if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
-        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
-    fi
-
-    # if BUILD_TYPE is empty, we are a CPU build, so we should try to install the CPU requirements
-    if [ "x${BUILD_TYPE}" == "x" ]; then
-        requirementFiles+=("${EDIR}/requirements-cpu.txt")
-    fi
-
-    requirementFiles+=("${EDIR}/requirements-after.txt")
-
-    if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
-        requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
+        requirementFiles+=("${MY_DIR}/requirements-${BUILD_PROFILE}.txt")
     fi
 
     for reqFile in ${requirementFiles[@]}; do
@@ -176,13 +148,13 @@ function startBackend() {
     ensureVenv
 
     if [ ! -z ${BACKEND_FILE} ]; then
-        exec python ${BACKEND_FILE} $@
+        python ${BACKEND_FILE} $@
     elif [ -e "${MY_DIR}/server.py" ]; then
-        exec python ${MY_DIR}/server.py $@
+        python ${MY_DIR}/server.py $@
     elif [ -e "${MY_DIR}/backend.py" ]; then
-        exec python ${MY_DIR}/backend.py $@
+        python ${MY_DIR}/backend.py $@
     elif [ -e "${MY_DIR}/${BACKEND_NAME}.py" ]; then
-        exec python ${MY_DIR}/${BACKEND_NAME}.py $@
+        python ${MY_DIR}/${BACKEND_NAME}.py $@
     fi
 }

View File

@@ -1,9 +1,8 @@
 .DEFAULT_GOAL := install
 
 .PHONY: install
-install:
+install: protogen
 	bash install.sh
-	$(MAKE) protogen
 
 .PHONY: protogen
 protogen: backend_pb2_grpc.py backend_pb2.py
@@ -13,7 +12,7 @@ protogen-clean:
 	$(RM) backend_pb2_grpc.py backend_pb2.py
 
 backend_pb2_grpc.py backend_pb2.py:
-	bash protogen.sh
+	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
 
 .PHONY: clean
 clean: protogen-clean

View File

@@ -1,6 +0,0 @@
#!/bin/bash
set -e
source $(dirname $0)/../common/libbackend.sh
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto

View File

@@ -1,5 +1,4 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
-intel-extension-for-pytorch==2.3.110+xpu
-torch==2.3.1+cxx11.abi
-oneccl_bind_pt==2.3.100+xpu
+intel-extension-for-pytorch
+torch
 optimum[openvino]

Some files were not shown because too many files have changed in this diff.