Compare commits

..

1 commit

Author: Ettore Di Giacinto
SHA1: f41a519a2c
Message: tests: try to get logs

    Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

Date: 2024-06-25 09:24:55 +02:00
904 changed files with 52037 additions and 295385 deletions


@@ -1,17 +0,0 @@
#!/bin/bash

cd /workspace

# Get the files into the volume without a bind mount
if [ ! -d ".git" ]; then
    git clone https://github.com/mudler/LocalAI.git .
else
    git fetch
fi

echo "Standard Post-Create script completed."

if [ -f "/devcontainer-customization/postcreate.sh" ]; then
    echo "Launching customization postcreate.sh"
    bash "/devcontainer-customization/postcreate.sh"
fi


@@ -1,13 +0,0 @@
#!/bin/bash

cd /workspace

# Ensures generated source files are present upon load
make prepare

echo "Standard Post-Start script completed."

if [ -f "/devcontainer-customization/poststart.sh" ]; then
    echo "Launching customization poststart.sh"
    bash "/devcontainer-customization/poststart.sh"
fi


@@ -1,55 +0,0 @@
#!/bin/bash

# This file contains some really simple functions that are useful when building up customization scripts.

# Checks if the git config has a user registered - and sets it up if not.
#
# Param 1: name
# Param 2: email
#
config_user() {
    echo "Configuring git for $1 <$2>"
    local gcn=$(git config --global user.name)
    if [ -z "${gcn}" ]; then
        echo "Setting up git user / remote"
        git config --global user.name "$1"
        git config --global user.email "$2"
    fi
}

# Checks if the git remote is configured - and sets it up if not. Fetches either way.
#
# Param 1: remote name
# Param 2: remote url
#
config_remote() {
    echo "Adding git remote and fetching $2 as $1"
    local gr=$(git remote -v | grep $1)
    if [ -z "${gr}" ]; then
        git remote add $1 $2
    fi
    git fetch $1
}

# Setup special .ssh files
# Prints out lines of text to make things pretty
#
# Param 1: bash array, filenames relative to the customization directory that should be copied to ~/.ssh
setup_ssh() {
    echo "starting ~/.ssh directory setup..."
    mkdir -p "${HOME}/.ssh"
    chmod 0700 "${HOME}/.ssh"
    echo "-----"
    local files=("$@")
    for file in "${files[@]}" ; do
        local cfile="/devcontainer-customization/${file}"
        local hfile="${HOME}/.ssh/${file}"
        if [ ! -f "${hfile}" ]; then
            echo "copying \"${file}\""
            cp "${cfile}" "${hfile}"
            chmod 600 "${hfile}"
        fi
    done
    echo "~/.ssh directory setup complete!"
}


@@ -1,25 +0,0 @@
Place any additional resources your environment requires in this directory.

Script hooks are currently called for:
`postcreate.sh` and `poststart.sh`

If files with those names exist here, they will be called at the end of the normal script.

This is a good place to set things like `git config --global user.name` - and to handle any other files that are mounted via this directory.

To assist in doing so, `source /.devcontainer-scripts/utils.sh` will provide utility functions that may be useful - for example:

```
#!/bin/bash
source "/.devcontainer-scripts/utils.sh"
sshfiles=("config" "key.pub")
setup_ssh "${sshfiles[@]}"
config_user "YOUR NAME" "YOUR EMAIL"
config_remote "REMOTE NAME" "REMOTE URL"
```


@@ -1,24 +0,0 @@
{
    "$schema": "https://raw.githubusercontent.com/devcontainers/spec/main/schemas/devContainer.schema.json",
    "name": "LocalAI",
    "workspaceFolder": "/workspace",
    "dockerComposeFile": [ "./docker-compose-devcontainer.yml" ],
    "service": "api",
    "shutdownAction": "stopCompose",
    "customizations": {
        "vscode": {
            "extensions": [
                "golang.go",
                "ms-vscode.makefile-tools",
                "ms-azuretools.vscode-docker",
                "ms-python.python",
                "ms-python.debugpy",
                "wayou.vscode-todo-highlight",
                "waderyan.gitblame"
            ]
        }
    },
    "forwardPorts": [8080, 3000],
    "postCreateCommand": "bash /.devcontainer-scripts/postcreate.sh",
    "postStartCommand": "bash /.devcontainer-scripts/poststart.sh"
}


@@ -1,44 +0,0 @@
services:
  api:
    build:
      context: ..
      dockerfile: Dockerfile
      target: devcontainer
    env_file:
      - ../.env
    ports:
      - 8080:8080
    volumes:
      - localai_workspace:/workspace
      - ../models:/host-models
      - ./customization:/devcontainer-customization
    command: /bin/sh -c "while sleep 1000; do :; done"
    cap_add:
      - SYS_PTRACE
    security_opt:
      - seccomp:unconfined
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - 9090:9090
    restart: unless-stopped
    volumes:
      - ./prometheus:/etc/prometheus
      - prom_data:/prometheus
  grafana:
    image: grafana/grafana
    container_name: grafana
    ports:
      - 3000:3000
    restart: unless-stopped
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=grafana
    volumes:
      - ./grafana:/etc/grafana/provisioning/datasources
volumes:
  prom_data:
  localai_workspace:


@@ -1,10 +0,0 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    isDefault: true
    access: proxy
    editable: true


@@ -1,21 +0,0 @@
global:
  scrape_interval: 15s
  scrape_timeout: 10s
  evaluation_interval: 15s
alerting:
  alertmanagers:
    - static_configs:
        - targets: []
      scheme: http
      timeout: 10s
      api_version: v1
scrape_configs:
  - job_name: prometheus
    honor_timestamps: true
    scrape_interval: 15s
    scrape_timeout: 10s
    metrics_path: /metrics
    scheme: http
    static_configs:
      - targets:
          - localhost:9090


@@ -1,11 +1,8 @@
 .idea
 .github
 .vscode
-.devcontainer
 models
-backends
 examples/chatbot-ui/models
-backend/go/image/stablediffusion-ggml/build/
 examples/rwkv/models
 examples/**/models
 Dockerfile*
@@ -16,4 +13,4 @@ __pycache__
 # backend virtual environments
 **/venv
 backend/python/**/source

.env

@@ -29,9 +29,6 @@
 ## Enable/Disable single backend (useful if only one GPU is available)
 # LOCALAI_SINGLE_ACTIVE_BACKEND=true
-# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
-# LOCALAI_FORCE_BACKEND_SHUTDOWN=true
 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
 ## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
@@ -41,6 +38,13 @@
 ## Uncomment and set to true to enable rebuilding from source
 # REBUILD=true
+## Enable go tags, available: stablediffusion, tts
+## stablediffusion: image generation with stablediffusion
+## tts: enables text-to-speech with go-piper
+## (requires REBUILD=true)
+#
+# GO_TAGS=stablediffusion
 ## Path where to store generated images
 # LOCALAI_IMAGE_PATH=/tmp/generated/images
@@ -69,24 +73,12 @@
 ### Define a list of GRPC Servers for llama-cpp workers to distribute the load
 # https://github.com/ggerganov/llama.cpp/pull/6829
-# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
 # LLAMACPP_GRPC_SERVERS=""
 ### Enable to run parallel requests
 # LOCALAI_PARALLEL_REQUESTS=true
-# Enable to allow p2p mode
-# LOCALAI_P2P=true
-# Enable to use federated mode
-# LOCALAI_FEDERATED=true
-# Enable to start federation server
-# FEDERATED_SERVER=true
-# Define to use federation token
-# TOKEN=""
 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time

.gitattributes

@@ -1,2 +1 @@
 *.sh text eol=lf
-backend/cpp/llama/*.hpp linguist-vendored

.github/bump_deps.sh

@@ -3,25 +3,7 @@ set -xe
 REPO=$1
 BRANCH=$2
 VAR=$3
-FILE=$4
-
-if [ -z "$FILE" ]; then
-    FILE="Makefile"
-fi
 
 LAST_COMMIT=$(curl -s -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")
 
-# Read $VAR from Makefile (only first match)
-set +e
-CURRENT_COMMIT="$(grep -m1 "^$VAR?=" $FILE | cut -d'=' -f2)"
-set -e
-
-sed -i $FILE -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
-
-if [ -z "$CURRENT_COMMIT" ]; then
-    echo "Could not find $VAR in Makefile."
-    exit 0
-fi
-
-echo "Changes: https://github.com/$REPO/compare/${CURRENT_COMMIT}..${LAST_COMMIT}" >> "${VAR}_message.txt"
-echo "${LAST_COMMIT}" >> "${VAR}_commit.txt"
+sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"


@@ -1,85 +0,0 @@
import hashlib
from huggingface_hub import hf_hub_download, get_paths_info
import requests
import sys
import os

uri = sys.argv[1]
file_name = uri.split('/')[-1]

# Function to parse the URI and determine download method
def parse_uri(uri):
    if uri.startswith('huggingface://'):
        repo_id = uri.split('://')[1]
        return 'huggingface', repo_id.rsplit('/', 1)[0]
    elif 'huggingface.co' in uri:
        parts = uri.split('/resolve/')
        if len(parts) > 1:
            repo_path = parts[0].split('https://huggingface.co/')[-1]
            return 'huggingface', repo_path
    return 'direct', uri

def calculate_sha256(file_path):
    sha256_hash = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for byte_block in iter(lambda: f.read(4096), b''):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

def manual_safety_check_hf(repo_id):
    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
    scan = scanResponse.json()
    # Check if 'hasUnsafeFile' exists in the response
    if 'hasUnsafeFile' in scan:
        if scan['hasUnsafeFile']:
            return scan
        else:
            return None
    else:
        return None

download_type, repo_id_or_url = parse_uri(uri)

new_checksum = None
file_path = None

# Decide download method based on URI type
if download_type == 'huggingface':
    # Check if the repo is flagged as dangerous by HF
    hazard = manual_safety_check_hf(repo_id_or_url)
    if hazard != None:
        print(f'Error: HuggingFace has detected security problems for {repo_id_or_url}: {str(hazard)}', file=sys.stderr)
        sys.exit(5)
    # Use HF API to pull sha
    for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
        try:
            new_checksum = file.lfs.sha256
            break
        except Exception as e:
            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
            sys.exit(2)
    if new_checksum is None:
        try:
            file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
        except Exception as e:
            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
            sys.exit(2)
else:
    response = requests.get(repo_id_or_url)
    if response.status_code == 200:
        with open(file_name, 'wb') as f:
            f.write(response.content)
        file_path = file_name
    elif response.status_code == 404:
        print(f'File not found: {response.status_code}', file=sys.stderr)
        sys.exit(2)
    else:
        print(f'Error downloading file: {response.status_code}', file=sys.stderr)
        sys.exit(1)

if new_checksum is None:
    new_checksum = calculate_sha256(file_path)
    print(new_checksum)
    os.remove(file_path)
else:
    print(new_checksum)
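The checksum checker script in the next hunk drives this helper and branches on its exit status; a minimal sketch of that contract, with a purely illustrative model URI:

```
# Illustrative URI - any model URI from the gallery would do
uri="huggingface://SomeOrg/SomeModel-GGUF/somemodel.Q4_K_M.gguf"

new_checksum=$(python3 ./.github/check_and_update.py "$uri")
result=$?

# Exit codes used by the script: 2 = file not found / Hugging Face Hub error, 5 = repository flagged unsafe by HF.
# On success, the file's SHA256 is printed on stdout and captured above.
```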


@@ -14,14 +14,77 @@ function check_and_update_checksum() {
idx="$5" idx="$5"
# Download the file and calculate new checksum using Python # Download the file and calculate new checksum using Python
new_checksum=$(python3 ./.github/check_and_update.py $uri) new_checksum=$(python3 -c "
result=$? import hashlib
from huggingface_hub import hf_hub_download, get_paths_info
import requests
import sys
import os
if [[ $result -eq 5 ]]; then uri = '$uri'
echo "Contaminated entry detected, deleting entry for $model_name..." file_name = uri.split('/')[-1]
yq eval -i "del([$idx])" "$input_yaml"
return # Function to parse the URI and determine download method
fi # Function to parse the URI and determine download method
def parse_uri(uri):
if uri.startswith('huggingface://'):
repo_id = uri.split('://')[1]
return 'huggingface', repo_id.rsplit('/', 1)[0]
elif 'huggingface.co' in uri:
parts = uri.split('/resolve/')
if len(parts) > 1:
repo_path = parts[0].split('https://huggingface.co/')[-1]
return 'huggingface', repo_path
return 'direct', uri
def calculate_sha256(file_path):
sha256_hash = hashlib.sha256()
with open(file_path, 'rb') as f:
for byte_block in iter(lambda: f.read(4096), b''):
sha256_hash.update(byte_block)
return sha256_hash.hexdigest()
download_type, repo_id_or_url = parse_uri(uri)
new_checksum = None
# Decide download method based on URI type
if download_type == 'huggingface':
# Use HF API to pull sha
for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
try:
new_checksum = file.lfs.sha256
break
except Exception as e:
print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
sys.exit(2)
if new_checksum is None:
try:
file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
except Exception as e:
print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
sys.exit(2)
else:
response = requests.get(repo_id_or_url)
if response.status_code == 200:
with open(file_name, 'wb') as f:
f.write(response.content)
file_path = file_name
elif response.status_code == 404:
print(f'File not found: {response.status_code}', file=sys.stderr)
sys.exit(2)
else:
print(f'Error downloading file: {response.status_code}', file=sys.stderr)
sys.exit(1)
if new_checksum is None:
new_checksum = calculate_sha256(file_path)
print(new_checksum)
os.remove(file_path)
else:
print(new_checksum)
")
if [[ "$new_checksum" == "" ]]; then if [[ "$new_checksum" == "" ]]; then
echo "Error calculating checksum for $file_name. Skipping..." echo "Error calculating checksum for $file_name. Skipping..."
@@ -31,7 +94,7 @@ function check_and_update_checksum() {
echo "Checksum for $file_name: $new_checksum" echo "Checksum for $file_name: $new_checksum"
# Compare and update the YAML file if checksums do not match # Compare and update the YAML file if checksums do not match
result=$?
if [[ $result -eq 2 ]]; then if [[ $result -eq 2 ]]; then
echo "File not found, deleting entry for $file_name..." echo "File not found, deleting entry for $file_name..."
# yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\"))" "$input_yaml" # yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\"))" "$input_yaml"


@@ -6,7 +6,6 @@ import (
"io/ioutil" "io/ioutil"
"os" "os"
"github.com/microcosm-cc/bluemonday"
"gopkg.in/yaml.v3" "gopkg.in/yaml.v3"
) )
@@ -280,12 +279,6 @@ func main() {
return return
} }
// Ensure that all arbitrary text content is sanitized before display
for i, m := range models {
models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
}
// render the template // render the template
data := struct { data := struct {
Models []*GalleryModel Models []*GalleryModel


@@ -1,16 +1,10 @@
 # https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
 version: 2
 updates:
-  - package-ecosystem: "gitsubmodule"
-    directory: "/"
-    schedule:
-      interval: "weekly"
   - package-ecosystem: "gomod"
     directory: "/"
     schedule:
       interval: "weekly"
-    ignore:
-      - dependency-name: "github.com/mudler/LocalAI/pkg/grpc/proto"
   - package-ecosystem: "github-actions"
     # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
     directory: "/"
@@ -29,91 +23,3 @@ updates:
     schedule:
       # Check for updates to GitHub Actions every weekday
       interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/bark"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/common/template"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/coqui"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/diffusers"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/exllama"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/exllama2"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/mamba"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/openvoice"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/rerankers"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/sentencetransformers"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/transformers"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/backend/python/vllm"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/chainlit"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/functions"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/langchain/langchainpy-localai-example"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/langchain-chroma"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "pip"
-    directory: "/examples/streamlit-bot"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "docker"
-    directory: "/examples/k8sgpt"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "docker"
-    directory: "/examples/kubernetes"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "docker"
-    directory: "/examples/langchain"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "gomod"
-    directory: "/examples/semantic-todo"
-    schedule:
-      interval: "weekly"
-  - package-ecosystem: "docker"
-    directory: "/examples/telegram-bot"
-    schedule:
-      interval: "weekly"

.github/labeler.yml

@@ -1,15 +1,6 @@
-enhancement:
+enhancements:
   - head-branch: ['^feature', 'feature']
 
-dependencies:
-  - any:
-    - changed-files:
-        - any-glob-to-any-file: 'Makefile'
-    - changed-files:
-        - any-glob-to-any-file: '*.mod'
-    - changed-files:
-        - any-glob-to-any-file: '*.sum'
 
 kind/documentation:
   - any:
     - changed-files:

.github/release.yml

@@ -13,9 +13,6 @@ changelog:
       labels:
         - bug
         - regression
-    - title: "🖧 P2P area"
-      labels:
-        - area/p2p
     - title: Exciting New Features 🎉
       labels:
         - Semver-Minor


File diff suppressed because it is too large.


@@ -1,241 +0,0 @@
---
name: 'build python backend container images (reusable)'
on:
workflow_call:
inputs:
base-image:
description: 'Base image'
required: true
type: string
build-type:
description: 'Build type'
default: ''
type: string
cuda-major-version:
description: 'CUDA major version'
default: "12"
type: string
cuda-minor-version:
description: 'CUDA minor version'
default: "1"
type: string
platforms:
description: 'Platforms'
default: ''
type: string
tag-latest:
description: 'Tag latest'
default: ''
type: string
tag-suffix:
description: 'Tag suffix'
default: ''
type: string
runs-on:
description: 'Runs on'
required: true
default: ''
type: string
backend:
description: 'Backend to build'
required: true
type: string
context:
description: 'Build context'
required: true
type: string
dockerfile:
description: 'Build Dockerfile'
required: true
type: string
skip-drivers:
description: 'Skip drivers'
default: 'false'
type: string
secrets:
dockerUsername:
required: true
dockerPassword:
required: true
quayUsername:
required: true
quayPassword:
required: true
jobs:
backend-build:
runs-on: ${{ inputs.runs-on }}
steps:
- name: Free Disk Space (Ubuntu)
if: inputs.runs-on == 'ubuntu-latest'
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Force Install GIT latest
run: |
sudo apt-get update \
&& sudo apt-get install -y software-properties-common \
&& sudo apt-get update \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update \
&& sudo apt-get install -y git
- name: Checkout
uses: actions/checkout@v4
- name: Release space from worker
if: inputs.runs-on == 'ubuntu-latest'
run: |
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
df -h
echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools snapd || true
sudo apt-get purge --auto-remove android-sdk-platform-tools snapd || true
sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet
sudo apt-get remove -y '^mono-.*' || true
sudo apt-get remove -y '^ghc-.*' || true
sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
sudo apt-get remove -y 'php.*' || true
sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
sudo apt-get remove -y '^google-.*' || true
sudo apt-get remove -y azure-cli || true
sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
sudo apt-get remove -y '^gfortran-.*' || true
sudo apt-get remove -y microsoft-edge-stable || true
sudo apt-get remove -y firefox || true
sudo apt-get remove -y powershell || true
sudo apt-get remove -y r-base-core || true
sudo apt-get autoremove -y
sudo apt-get clean
echo
echo "Listing top largest packages"
pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
head -n 30 <<< "${pkgs}"
echo
sudo rm -rfv build || true
sudo rm -rf /usr/share/dotnet || true
sudo rm -rf /opt/ghc || true
sudo rm -rf "/usr/local/share/boost" || true
sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
df -h
- name: Docker meta
id: meta
if: github.event_name != 'pull_request'
uses: docker/metadata-action@v5
with:
images: |
quay.io/go-skynet/local-ai-backends
localai/localai-backends
tags: |
type=ref,event=branch
type=semver,pattern={{raw}}
type=sha
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }},onlatest=true
- name: Docker meta for PR
id: meta_pull_request
if: github.event_name == 'pull_request'
uses: docker/metadata-action@v5
with:
images: |
quay.io/go-skynet/ci-tests
tags: |
type=ref,event=branch,suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
type=semver,pattern={{raw}},suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
type=sha,suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
flavor: |
latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }},onlatest=true
## End testing image
- name: Set up QEMU
uses: docker/setup-qemu-action@master
with:
platforms: all
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@master
- name: Login to DockerHub
if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
username: ${{ secrets.dockerUsername }}
password: ${{ secrets.dockerPassword }}
- name: Login to Quay.io
# if: github.event_name != 'pull_request'
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ secrets.quayUsername }}
password: ${{ secrets.quayPassword }}
- name: Build and push
uses: docker/build-push-action@v6
if: github.event_name != 'pull_request'
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ inputs.build-type }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
BASE_IMAGE=${{ inputs.base-image }}
BACKEND=${{ inputs.backend }}
context: ${{ inputs.context }}
file: ${{ inputs.dockerfile }}
cache-from: type=gha
platforms: ${{ inputs.platforms }}
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Build and push (PR)
uses: docker/build-push-action@v6
if: github.event_name == 'pull_request'
with:
builder: ${{ steps.buildx.outputs.name }}
build-args: |
BUILD_TYPE=${{ inputs.build-type }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
BASE_IMAGE=${{ inputs.base-image }}
BACKEND=${{ inputs.backend }}
context: ${{ inputs.context }}
file: ${{ inputs.dockerfile }}
cache-from: type=gha
platforms: ${{ inputs.platforms }}
push: true
tags: ${{ steps.meta_pull_request.outputs.tags }}
labels: ${{ steps.meta_pull_request.outputs.labels }}
- name: job summary
run: |
echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY


@@ -1,23 +0,0 @@
name: Build test

on:
  push:
    branches:
      - master
  pull_request:

jobs:
  build-test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: 1.23
      - name: Run GoReleaser
        run: |
          make dev-dist


@@ -9,54 +9,54 @@ jobs:
     fail-fast: false
     matrix:
       include:
-        - repository: "ggml-org/llama.cpp"
-          variable: "LLAMA_VERSION"
-          branch: "master"
-          file: "backend/cpp/llama-cpp/Makefile"
-        - repository: "ggml-org/whisper.cpp"
-          variable: "WHISPER_CPP_VERSION"
-          branch: "master"
-          file: "backend/go/whisper/Makefile"
-        - repository: "PABannier/bark.cpp"
-          variable: "BARKCPP_VERSION"
-          branch: "main"
-          file: "Makefile"
-        - repository: "leejet/stable-diffusion.cpp"
-          variable: "STABLEDIFFUSION_GGML_VERSION"
-          branch: "master"
-          file: "backend/go/stablediffusion-ggml/Makefile"
-        - repository: "mudler/go-piper"
-          variable: "PIPER_VERSION"
-          branch: "master"
-          file: "backend/go/piper/Makefile"
+        - repository: "go-skynet/go-llama.cpp"
+          variable: "GOLLAMA_VERSION"
+          branch: "master"
+        - repository: "ggerganov/llama.cpp"
+          variable: "CPPLLAMA_VERSION"
+          branch: "master"
+        - repository: "go-skynet/go-ggml-transformers.cpp"
+          variable: "GOGGMLTRANSFORMERS_VERSION"
+          branch: "master"
+        - repository: "donomii/go-rwkv.cpp"
+          variable: "RWKV_VERSION"
+          branch: "main"
+        - repository: "ggerganov/whisper.cpp"
+          variable: "WHISPER_CPP_VERSION"
+          branch: "master"
+        - repository: "go-skynet/go-bert.cpp"
+          variable: "BERT_VERSION"
+          branch: "master"
+        - repository: "go-skynet/bloomz.cpp"
+          variable: "BLOOMZ_VERSION"
+          branch: "main"
+        - repository: "nomic-ai/gpt4all"
+          variable: "GPT4ALL_VERSION"
+          branch: "main"
+        - repository: "mudler/go-ggllm.cpp"
+          variable: "GOGGLLM_VERSION"
+          branch: "master"
+        - repository: "mudler/go-stable-diffusion"
+          variable: "STABLEDIFFUSION_VERSION"
+          branch: "master"
+        - repository: "mudler/go-piper"
+          variable: "PIPER_VERSION"
+          branch: "master"
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - name: Bump dependencies 🔧
-        id: bump
         run: |
-          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }} ${{ matrix.file }}
-          {
-            echo 'message<<EOF'
-            cat "${{ matrix.variable }}_message.txt"
-            echo EOF
-          } >> "$GITHUB_OUTPUT"
-          {
-            echo 'commit<<EOF'
-            cat "${{ matrix.variable }}_commit.txt"
-            echo EOF
-          } >> "$GITHUB_OUTPUT"
-          rm -rfv ${{ matrix.variable }}_message.txt
-          rm -rfv ${{ matrix.variable }}_commit.txt
+          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
       - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
         with:
           token: ${{ secrets.UPDATE_BOT_TOKEN }}
           push-to-fork: ci-forks/LocalAI
           commit-message: ':arrow_up: Update ${{ matrix.repository }}'
-          title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
+          title: ':arrow_up: Update ${{ matrix.repository }}'
           branch: "update/${{ matrix.variable }}"
-          body: ${{ steps.bump.outputs.message }}
+          body: Bump of ${{ matrix.repository }} version
           signoff: true


@@ -17,12 +17,12 @@ jobs:
         run: |
           bash .github/bump_docs.sh ${{ matrix.repository }}
       - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
         with:
           token: ${{ secrets.UPDATE_BOT_TOKEN }}
           push-to-fork: ci-forks/LocalAI
           commit-message: ':arrow_up: Update docs version ${{ matrix.repository }}'
-          title: 'docs: :arrow_up: update docs version ${{ matrix.repository }}'
+          title: ':arrow_up: Update docs version ${{ matrix.repository }}'
           branch: "update/docs"
           body: Bump of ${{ matrix.repository }} version inside docs
           signoff: true


@@ -5,7 +5,7 @@ on:
   workflow_dispatch:
 
 jobs:
   checksum_check:
-    runs-on: ubuntu-latest
+    runs-on: arc-runner-set
     steps:
       - name: Force Install GIT latest
         run: |
@@ -20,11 +20,12 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install -y pip wget
+          sudo pip install --upgrade pip
           pip install huggingface_hub
       - name: 'Setup yq'
-        uses: dcarbone/install-yq-action@v1.3.1
+        uses: dcarbone/install-yq-action@v1.1.1
         with:
-          version: 'v4.44.2'
+          version: 'v4.43.1'
           download-compressed: true
           force: true
@@ -35,12 +36,12 @@ jobs:
           sudo chmod 777 /hf_cache
           bash .github/checksum_checker.sh gallery/index.yaml
       - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v7
+        uses: peter-evans/create-pull-request@v6
         with:
           token: ${{ secrets.UPDATE_BOT_TOKEN }}
           push-to-fork: ci-forks/LocalAI
           commit-message: ':arrow_up: Checksum updates in gallery/index.yaml'
-          title: 'chore(model-gallery): :arrow_up: update checksum'
+          title: 'models(gallery): :arrow_up: update checksum'
           branch: "update/checksum"
           body: Updating checksums in gallery/index.yaml
           signoff: true


@@ -14,7 +14,7 @@ jobs:
     steps:
       - name: Dependabot metadata
         id: metadata
-        uses: dependabot/fetch-metadata@v2.4.0
+        uses: dependabot/fetch-metadata@v2.1.0
         with:
           github-token: "${{ secrets.GITHUB_TOKEN }}"
           skip-commit-verification: true


@@ -1,64 +0,0 @@
name: Explorer deployment
on:
push:
branches:
- master
tags:
- 'v*'
concurrency:
group: ci-deploy-${{ github.head_ref || github.ref }}-${{ github.repository }}
jobs:
build-linux:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
make protogen-go
- name: Build api
run: |
CGO_ENABLED=0 make build
- name: rm
uses: appleboy/ssh-action@v1.2.2
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
key: ${{ secrets.EXPLORER_SSH_KEY }}
port: ${{ secrets.EXPLORER_SSH_PORT }}
script: |
sudo rm -rf local-ai/ || true
- name: copy file via ssh
uses: appleboy/scp-action@v1.0.0
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
key: ${{ secrets.EXPLORER_SSH_KEY }}
port: ${{ secrets.EXPLORER_SSH_PORT }}
source: "local-ai"
overwrite: true
rm: true
target: ./local-ai
- name: restarting
uses: appleboy/ssh-action@v1.2.2
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
key: ${{ secrets.EXPLORER_SSH_KEY }}
port: ${{ secrets.EXPLORER_SSH_PORT }}
script: |
sudo cp -rfv local-ai/local-ai /usr/bin/local-ai
sudo systemctl restart local-ai


@@ -1,83 +0,0 @@
name: Comment PRs
on:
pull_request_target:
jobs:
comment-pr:
env:
MODEL_NAME: hermes-2-theta-llama-3-8b
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
with:
ref: "${{ github.event.pull_request.merge_commit_sha }}"
fetch-depth: 0 # needed to checkout all branches for this Action to work
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.7.0
id: git-diff-action
with:
json_diff_file_output: diff.json
raw_diff_file_output: diff.txt
file_output_only: "true"
base_branch: ${{ github.event.pull_request.base.sha }}
- name: Show diff
env:
DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
run: |
cat $DIFF
- name: Summarize
env:
DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
id: summarize
run: |
input="$(cat $DIFF)"
# Define the LocalAI API endpoint
API_URL="http://localhost:8080/chat/completions"
# Create a JSON payload using jq to handle special characters
json_payload=$(jq -n --arg input "$input" '{
model: "'$MODEL_NAME'",
messages: [
{
role: "system",
content: "You are LocalAI-bot in Github that helps understanding PRs and assess complexity. Explain what has changed in this PR diff and why"
},
{
role: "user",
content: $input
}
]
}')
# Send the request to LocalAI
response=$(curl -s -X POST $API_URL \
-H "Content-Type: application/json" \
-d "$json_payload")
# Extract the summary from the response
summary="$(echo $response | jq -r '.choices[0].message.content')"
# Print the summary
# -H "Authorization: Bearer $API_KEY" \
echo "Summary:"
echo "$summary"
echo "payload sent"
echo "$json_payload"
{
echo 'message<<EOF'
echo "$summary"
echo EOF
} >> "$GITHUB_OUTPUT"
docker logs --tail 10 local-ai
- uses: mshick/add-pr-comment@v2
if: always()
with:
repo-token: ${{ secrets.UPDATE_BOT_TOKEN }}
message: ${{ steps.summarize.outputs.message }}
message-failure: |
Uh oh! Could not analyze this PR, maybe it's too big?


@@ -2,10 +2,9 @@ name: 'generate and publish GRPC docker caches'
 on:
   workflow_dispatch:
 
-  schedule:
-    # daily at midnight
-    - cron: '0 0 * * *'
+  push:
+    branches:
+      - master
 
 concurrency:
   group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -76,7 +75,7 @@ jobs:
         uses: actions/checkout@v4
       - name: Cache GRPC
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
         with:
           builder: ${{ steps.buildx.outputs.name }}
           # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
@@ -85,11 +84,11 @@ jobs:
           build-args: |
             GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
             GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
-            GRPC_VERSION=v1.65.0
+            GRPC_VERSION=v1.64.0
           context: .
           file: ./Dockerfile
           cache-to: type=gha,ignore-error=true
           cache-from: type=gha
           target: grpc
           platforms: ${{ matrix.platforms }}
           push: false


@@ -15,7 +15,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - base-image: intel/oneapi-basekit:2025.2.0-0-devel-ubuntu22.04
+          - base-image: intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
             runs-on: 'ubuntu-latest'
             platforms: 'linux/amd64'
     runs-on: ${{matrix.runs-on}}
@@ -46,7 +46,7 @@ jobs:
         uses: actions/checkout@v4
       - name: Cache Intel images
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@v5
         with:
           builder: ${{ steps.buildx.outputs.name }}
           build-args: |


@@ -9,11 +9,13 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
image-build: extras-image-build:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml
with: with:
tag-latest: ${{ matrix.tag-latest }} tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }} tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }} build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }} cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }} cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -31,38 +33,107 @@ jobs:
# Pushing with all jobs in parallel # Pushing with all jobs in parallel
# eats the bandwidth of all the nodes # eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }} max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
fail-fast: false
matrix: matrix:
include: include:
- build-type: 'cublas' - build-type: ''
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-gpu-nvidia-cuda12' tag-suffix: '-ffmpeg'
runs-on: 'ubuntu-latest' ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "5"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas' - build-type: 'hipblas'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-hipblas' tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1" base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest' runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16' - build-type: 'sycl_f16'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest" base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
tag-suffix: 'sycl-f16' tag-suffix: 'sycl-f16-ffmpeg'
runs-on: 'ubuntu-latest' ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
- build-type: 'vulkan' core-image-build:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:
- build-type: ''
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'false' tag-latest: 'false'
tag-suffix: '-vulkan-core' tag-suffix: '-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: 'sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "5"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'ubuntu-latest' runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"


@@ -13,11 +13,13 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
hipblas-jobs: self-hosted-jobs:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml
with: with:
tag-latest: ${{ matrix.tag-latest }} tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }} tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }} build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }} cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }} cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -27,29 +29,217 @@ jobs:
grpc-base-image: ${{ matrix.grpc-base-image }} grpc-base-image: ${{ matrix.grpc-base-image }}
aio: ${{ matrix.aio }} aio: ${{ matrix.aio }}
makeflags: ${{ matrix.makeflags }} makeflags: ${{ matrix.makeflags }}
latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets: secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }} dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }} dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }} quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }} quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy: strategy:
# Pushing with all jobs in parallel
# eats the bandwidth of all the nodes
max-parallel: ${{ github.event_name != 'pull_request' && 6 || 12 }}
matrix: matrix:
include: include:
# Extra images
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "5"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12'
ffmpeg: ''
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda11-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-11"
latest-image: 'latest-gpu-nvidia-cuda-11'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "5"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
aio: "-aio-gpu-nvidia-cuda-12"
latest-image: 'latest-gpu-nvidia-cuda-12'
latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
makeflags: "--jobs=3 --output-sync=target"
- build-type: ''
#platforms: 'linux/amd64,linux/arm64'
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: ''
ffmpeg: ''
image-type: 'extras'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas' - build-type: 'hipblas'
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'auto' tag-latest: 'auto'
tag-suffix: '-gpu-hipblas' tag-suffix: '-hipblas-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
aio: "-aio-gpu-hipblas"
base-image: "rocm/dev-ubuntu-22.04:6.1" base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04" grpc-base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest' latest-image: 'latest-gpu-hipblas'
latest-image-aio: 'latest-aio-gpu-hipblas'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas'
ffmpeg: 'false'
image-type: 'extras'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f16"
latest-image: 'latest-gpu-intel-f16'
latest-image-aio: 'latest-aio-gpu-intel-f16'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
aio: "-aio-gpu-intel-f32"
latest-image: 'latest-gpu-intel-f32'
latest-image-aio: 'latest-aio-gpu-intel-f32'
makeflags: "--jobs=3 --output-sync=target"
# Core images
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-core'
ffmpeg: 'false'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f16-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-sycl-f32-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-hipblas-core'
ffmpeg: 'false'
image-type: 'core'
base-image: "rocm/dev-ubuntu-22.04:6.1"
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target" makeflags: "--jobs=3 --output-sync=target"
aio: "-aio-gpu-hipblas"
core-image-build: core-image-build:
uses: ./.github/workflows/image_build.yml uses: ./.github/workflows/image_build.yml
with: with:
tag-latest: ${{ matrix.tag-latest }} tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }} tag-suffix: ${{ matrix.tag-suffix }}
ffmpeg: ${{ matrix.ffmpeg }}
image-type: ${{ matrix.image-type }}
build-type: ${{ matrix.build-type }} build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }} cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }} cuda-minor-version: ${{ matrix.cuda-minor-version }}
@@ -59,105 +249,78 @@ jobs:
base-image: ${{ matrix.base-image }} base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }} grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }} makeflags: ${{ matrix.makeflags }}
skip-drivers: ${{ matrix.skip-drivers }} latest-image: ${{ matrix.latest-image }}
latest-image-aio: ${{ matrix.latest-image-aio }}
secrets: secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }} dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }} dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }} quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }} quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy: strategy:
#max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
matrix: matrix:
include: include:
- build-type: '' - build-type: ''
platforms: 'linux/amd64,linux/arm64' platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto' tag-latest: 'auto'
tag-suffix: '' tag-suffix: '-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
runs-on: 'ubuntu-latest' runs-on: 'arc-runner-set'
aio: "-aio-cpu" aio: "-aio-cpu"
latest-image: 'latest-cpu'
latest-image-aio: 'latest-aio-cpu'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
- build-type: 'cublas' - build-type: 'cublas'
cuda-major-version: "11" cuda-major-version: "11"
cuda-minor-version: "7" cuda-minor-version: "8"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'auto' tag-latest: 'false'
tag-suffix: '-gpu-nvidia-cuda11' tag-suffix: '-cublas-cuda11-core'
runs-on: 'ubuntu-latest' ffmpeg: ''
image-type: 'core'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
aio: "-aio-gpu-nvidia-cuda-11"
- build-type: 'cublas' - build-type: 'cublas'
cuda-major-version: "12" cuda-major-version: "12"
cuda-minor-version: "0" cuda-minor-version: "5"
platforms: 'linux/amd64' platforms: 'linux/amd64'
tag-latest: 'auto' tag-latest: 'false'
tag-suffix: '-gpu-nvidia-cuda12' tag-suffix: '-cublas-cuda12-core'
runs-on: 'ubuntu-latest' ffmpeg: ''
image-type: 'core'
base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "11"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda11-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "5"
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-cublas-cuda12-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target" makeflags: "--jobs=4 --output-sync=target"
aio: "-aio-gpu-nvidia-cuda-12"
- build-type: 'vulkan' - build-type: 'vulkan'
platforms: 'linux/amd64' platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto' tag-latest: 'false'
tag-suffix: '-gpu-vulkan' tag-suffix: '-vulkan-ffmpeg-core'
runs-on: 'ubuntu-latest' ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04" base-image: "ubuntu:22.04"
skip-drivers: 'false' makeflags: "--jobs=4 --output-sync=target"
makeflags: "--jobs=4 --output-sync=target"
aio: "-aio-gpu-vulkan"
- build-type: 'sycl_f16'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-gpu-intel-f16'
runs-on: 'ubuntu-latest'
makeflags: "--jobs=3 --output-sync=target"
aio: "-aio-gpu-intel-f16"
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'auto'
base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
grpc-base-image: "ubuntu:22.04"
tag-suffix: '-gpu-intel-f32'
runs-on: 'ubuntu-latest'
makeflags: "--jobs=3 --output-sync=target"
aio: "-aio-gpu-intel-f32"
gh-runner:
uses: ./.github/workflows/image_build.yml
with:
tag-latest: ${{ matrix.tag-latest }}
tag-suffix: ${{ matrix.tag-suffix }}
build-type: ${{ matrix.build-type }}
cuda-major-version: ${{ matrix.cuda-major-version }}
cuda-minor-version: ${{ matrix.cuda-minor-version }}
platforms: ${{ matrix.platforms }}
runs-on: ${{ matrix.runs-on }}
aio: ${{ matrix.aio }}
base-image: ${{ matrix.base-image }}
grpc-base-image: ${{ matrix.grpc-base-image }}
makeflags: ${{ matrix.makeflags }}
skip-drivers: ${{ matrix.skip-drivers }}
secrets:
dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
strategy:
matrix:
include:
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'true'

View File

@@ -23,7 +23,7 @@ on:
type: string type: string
cuda-minor-version: cuda-minor-version:
description: 'CUDA minor version' description: 'CUDA minor version'
default: "4" default: "5"
type: string type: string
platforms: platforms:
description: 'Platforms' description: 'Platforms'
@@ -33,13 +33,25 @@ on:
description: 'Tag latest' description: 'Tag latest'
default: '' default: ''
type: string type: string
latest-image:
description: 'Tag latest'
default: ''
type: string
latest-image-aio:
description: 'Tag latest'
default: ''
type: string
tag-suffix: tag-suffix:
description: 'Tag suffix' description: 'Tag suffix'
default: '' default: ''
type: string type: string
skip-drivers: ffmpeg:
description: 'Skip drivers by default' description: 'FFMPEG'
default: 'false' default: ''
type: string
image-type:
description: 'Image type'
default: ''
type: string type: string
runs-on: runs-on:
description: 'Runs on' description: 'Runs on'
@@ -69,22 +81,6 @@ jobs:
reusable_image-build: reusable_image-build:
runs-on: ${{ inputs.runs-on }} runs-on: ${{ inputs.runs-on }}
steps: steps:
- name: Free Disk Space (Ubuntu)
if: inputs.runs-on == 'ubuntu-latest'
uses: jlumbroso/free-disk-space@main
with:
# this might remove tools that are actually needed,
# if set to "true" but frees about 6 GB
tool-cache: true
# all of these default to true, but feel free to set to
# "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Force Install GIT latest - name: Force Install GIT latest
run: | run: |
sudo apt-get update \ sudo apt-get update \
@@ -106,8 +102,8 @@ jobs:
df -h df -h
echo echo
sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
sudo apt-get remove --auto-remove android-sdk-platform-tools snapd || true sudo apt-get remove --auto-remove android-sdk-platform-tools || true
sudo apt-get purge --auto-remove android-sdk-platform-tools snapd || true sudo apt-get purge --auto-remove android-sdk-platform-tools || true
sudo rm -rf /usr/local/lib/android sudo rm -rf /usr/local/lib/android
sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
sudo rm -rf /usr/share/dotnet sudo rm -rf /usr/share/dotnet
@@ -152,18 +148,18 @@ jobs:
type=sha type=sha
flavor: | flavor: |
latest=${{ inputs.tag-latest }} latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }},onlatest=true suffix=${{ inputs.tag-suffix }}
- name: Docker meta for PR - name: Docker meta for PR
id: meta_pull_request id: meta_pull_request
if: github.event_name == 'pull_request' if: github.event_name == 'pull_request'
uses: docker/metadata-action@v5 uses: docker/metadata-action@v5
with: with:
images: | images: |
quay.io/go-skynet/ci-tests ttl.sh/localai-ci-pr-${{ github.event.number }}
tags: | tags: |
type=ref,event=branch,suffix=localai${{ github.event.number }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }} type=ref,event=branch
type=semver,pattern={{raw}},suffix=localai${{ github.event.number }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }} type=semver,pattern={{raw}}
type=sha,suffix=localai${{ github.event.number }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }} type=sha
flavor: | flavor: |
latest=${{ inputs.tag-latest }} latest=${{ inputs.tag-latest }}
suffix=${{ inputs.tag-suffix }} suffix=${{ inputs.tag-suffix }}
@@ -179,7 +175,7 @@ jobs:
type=semver,pattern={{raw}} type=semver,pattern={{raw}}
flavor: | flavor: |
latest=${{ inputs.tag-latest }} latest=${{ inputs.tag-latest }}
suffix=${{ inputs.aio }},onlatest=true suffix=${{ inputs.aio }}
- name: Docker meta AIO (dockerhub) - name: Docker meta AIO (dockerhub)
if: inputs.aio != '' if: inputs.aio != ''
@@ -192,8 +188,7 @@ jobs:
type=ref,event=branch type=ref,event=branch
type=semver,pattern={{raw}} type=semver,pattern={{raw}}
flavor: | flavor: |
latest=${{ inputs.tag-latest }} suffix=${{ inputs.aio }}
suffix=${{ inputs.aio }},onlatest=true
- name: Set up QEMU - name: Set up QEMU
uses: docker/setup-qemu-action@master uses: docker/setup-qemu-action@master
@@ -220,7 +215,7 @@ jobs:
password: ${{ secrets.quayPassword }} password: ${{ secrets.quayPassword }}
- name: Build and push - name: Build and push
uses: docker/build-push-action@v6 uses: docker/build-push-action@v5
if: github.event_name != 'pull_request' if: github.event_name != 'pull_request'
with: with:
builder: ${{ steps.buildx.outputs.name }} builder: ${{ steps.buildx.outputs.name }}
@@ -232,12 +227,13 @@ jobs:
BUILD_TYPE=${{ inputs.build-type }} BUILD_TYPE=${{ inputs.build-type }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }} CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }} CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
FFMPEG=${{ inputs.ffmpeg }}
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }} BASE_IMAGE=${{ inputs.base-image }}
GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }} GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.65.0 GRPC_VERSION=v1.64.0
MAKEFLAGS=${{ inputs.makeflags }} MAKEFLAGS=${{ inputs.makeflags }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
context: . context: .
file: ./Dockerfile file: ./Dockerfile
cache-from: type=gha cache-from: type=gha
@@ -247,7 +243,7 @@ jobs:
labels: ${{ steps.meta.outputs.labels }} labels: ${{ steps.meta.outputs.labels }}
### Start testing image ### Start testing image
- name: Build and push - name: Build and push
uses: docker/build-push-action@v6 uses: docker/build-push-action@v5
if: github.event_name == 'pull_request' if: github.event_name == 'pull_request'
with: with:
builder: ${{ steps.buildx.outputs.name }} builder: ${{ steps.buildx.outputs.name }}
@@ -259,23 +255,28 @@ jobs:
BUILD_TYPE=${{ inputs.build-type }} BUILD_TYPE=${{ inputs.build-type }}
CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }} CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }} CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
FFMPEG=${{ inputs.ffmpeg }}
IMAGE_TYPE=${{ inputs.image-type }}
BASE_IMAGE=${{ inputs.base-image }} BASE_IMAGE=${{ inputs.base-image }}
GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }} GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
GRPC_MAKEFLAGS=--jobs=4 --output-sync=target GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
GRPC_VERSION=v1.65.0 GRPC_VERSION=v1.64.0
MAKEFLAGS=${{ inputs.makeflags }} MAKEFLAGS=${{ inputs.makeflags }}
SKIP_DRIVERS=${{ inputs.skip-drivers }}
context: . context: .
file: ./Dockerfile file: ./Dockerfile
cache-from: type=gha cache-from: type=gha
platforms: ${{ inputs.platforms }} platforms: ${{ inputs.platforms }}
#push: true push: true
tags: ${{ steps.meta_pull_request.outputs.tags }} tags: ${{ steps.meta_pull_request.outputs.tags }}
labels: ${{ steps.meta_pull_request.outputs.labels }} labels: ${{ steps.meta_pull_request.outputs.labels }}
- name: Testing image
if: github.event_name == 'pull_request'
run: |
echo "Image is available at ttl.sh/localai-ci-pr-${{ github.event.number }}:${{ steps.meta_pull_request.outputs.version }}" >> $GITHUB_STEP_SUMMARY
## End testing image ## End testing image
- name: Build and push AIO image - name: Build and push AIO image
if: inputs.aio != '' if: inputs.aio != ''
uses: docker/build-push-action@v6 uses: docker/build-push-action@v5
with: with:
builder: ${{ steps.buildx.outputs.name }} builder: ${{ steps.buildx.outputs.name }}
build-args: | build-args: |
@@ -290,7 +291,7 @@ jobs:
- name: Build and push AIO image (dockerhub) - name: Build and push AIO image (dockerhub)
if: inputs.aio != '' if: inputs.aio != ''
uses: docker/build-push-action@v6 uses: docker/build-push-action@v5
with: with:
builder: ${{ steps.buildx.outputs.name }} builder: ${{ steps.buildx.outputs.name }}
build-args: | build-args: |
@@ -303,6 +304,27 @@ jobs:
tags: ${{ steps.meta_aio_dockerhub.outputs.tags }} tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
labels: ${{ steps.meta_aio_dockerhub.outputs.labels }} labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
- name: Latest tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'
run: |
docker pull localai/localai:${{ steps.meta.outputs.version }}
docker tag localai/localai:${{ steps.meta.outputs.version }} localai/localai:${{ inputs.latest-image }}
docker push localai/localai:${{ inputs.latest-image }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
docker tag quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
- name: Latest AIO tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image-aio != '' && github.ref_type == 'tag'
run: |
docker pull localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }}
docker tag localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }} localai/localai:${{ inputs.latest-image-aio }}
docker push localai/localai:${{ inputs.latest-image-aio }}
docker pull quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }}
docker tag quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
- name: job summary - name: job summary
run: | run: |
echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY

View File

@@ -1,168 +0,0 @@
name: Notifications for new models
on:
pull_request:
types:
- closed
jobs:
notify-discord:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # needed to checkout all branches for this Action to work
- uses: mudler/localai-github-action@v1
with:
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.1
id: git-diff-action
with:
json_diff_file_output: diff.json
raw_diff_file_output: diff.txt
file_output_only: "true"
- name: Summarize
env:
DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
id: summarize
run: |
input="$(cat $DIFF)"
# Define the LocalAI API endpoint
API_URL="http://localhost:8080/chat/completions"
# Create a JSON payload using jq to handle special characters
json_payload=$(jq -n --arg input "$input" '{
model: "'$MODEL_NAME'",
messages: [
{
role: "system",
content: "You are LocalAI-bot. Write a discord message to notify everyone about the new model from the git diff. Make it informal. An example can include: the URL of the model, the name, and a brief description of the model if exists. Also add an hint on how to install it in LocalAI and that can be browsed over https://models.localai.io. For example: local-ai run model_name_here"
},
{
role: "user",
content: $input
}
]
}')
# Send the request to LocalAI
response=$(curl -s -X POST $API_URL \
-H "Content-Type: application/json" \
-d "$json_payload")
# Extract the summary from the response
summary="$(echo $response | jq -r '.choices[0].message.content')"
# Print the summary
# -H "Authorization: Bearer $API_KEY" \
echo "Summary:"
echo "$summary"
echo "payload sent"
echo "$json_payload"
{
echo 'message<<EOF'
echo "$summary"
echo EOF
} >> "$GITHUB_OUTPUT"
docker logs --tail 10 local-ai
- name: Discord notification
env:
DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK_URL }}
DISCORD_USERNAME: "LocalAI-Bot"
DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
uses: Ilshidur/action-discord@master
with:
args: ${{ steps.summarize.outputs.message }}
- name: Setup tmate session if fails
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
notify-twitter:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # needed to checkout all branches for this Action to work
- name: Start LocalAI
run: |
echo "Starting LocalAI..."
docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master run --debug $MODEL_NAME
until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.1
id: git-diff-action
with:
json_diff_file_output: diff.json
raw_diff_file_output: diff.txt
file_output_only: "true"
- name: Summarize
env:
DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
id: summarize
run: |
input="$(cat $DIFF)"
# Define the LocalAI API endpoint
API_URL="http://localhost:8080/chat/completions"
# Create a JSON payload using jq to handle special characters
json_payload=$(jq -n --arg input "$input" '{
model: "'$MODEL_NAME'",
messages: [
{
role: "system",
content: "You are LocalAI-bot. Write a twitter message to notify everyone about the new model from the git diff. Make it informal and really short. An example can include: the name, and a brief description of the model if exists. Also add an hint on how to install it in LocalAI. For example: local-ai run model_name_here"
},
{
role: "user",
content: $input
}
]
}')
# Send the request to LocalAI
response=$(curl -s -X POST $API_URL \
-H "Content-Type: application/json" \
-d "$json_payload")
# Extract the summary from the response
summary="$(echo $response | jq -r '.choices[0].message.content')"
# Print the summary
# -H "Authorization: Bearer $API_KEY" \
echo "Summary:"
echo "$summary"
echo "payload sent"
echo "$json_payload"
{
echo 'message<<EOF'
echo "$summary"
echo EOF
} >> "$GITHUB_OUTPUT"
docker logs --tail 10 local-ai
- uses: Eomm/why-don-t-you-tweet@v2
with:
tweet-message: ${{ steps.summarize.outputs.message }}
env:
# Get your tokens from https://developer.twitter.com/apps
TWITTER_CONSUMER_API_KEY: ${{ secrets.TWITTER_APP_KEY }}
TWITTER_CONSUMER_API_SECRET: ${{ secrets.TWITTER_APP_SECRET }}
TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }}
TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
- name: Setup tmate session if fails
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
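
Note: both notification jobs above pass the model-generated summary between steps through a multi-line `$GITHUB_OUTPUT` heredoc (`message<<EOF ... EOF`). A minimal sketch of that mechanism in isolation; the step id `produce` and output name `text` are illustrative, not names used by the workflow:

```
- name: Produce a multi-line output
  id: produce
  run: |
    body="$(printf 'line one\nline two')"
    {
      echo 'text<<EOF'
      echo "$body"
      echo EOF
    } >> "$GITHUB_OUTPUT"
- name: Consume it
  run: echo "${{ steps.produce.outputs.text }}"
```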

View File

@@ -1,63 +0,0 @@
name: Release notifications
on:
release:
types:
- published
jobs:
notify-discord:
runs-on: ubuntu-latest
env:
RELEASE_BODY: ${{ github.event.release.body }}
RELEASE_TITLE: ${{ github.event.release.name }}
RELEASE_TAG_NAME: ${{ github.event.release.tag_name }}
steps:
- uses: mudler/localai-github-action@v1
with:
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
- name: Summarize
id: summarize
run: |
input="$RELEASE_TITLE\b$RELEASE_BODY"
# Define the LocalAI API endpoint
API_URL="http://localhost:8080/chat/completions"
# Create a JSON payload using jq to handle special characters
json_payload=$(jq -n --arg input "$input" '{
model: "'$MODEL_NAME'",
messages: [
{
role: "system",
content: "Write a discord message with a bullet point summary of the release notes."
},
{
role: "user",
content: $input
}
]
}')
# Send the request to LocalAI API
response=$(curl -s -X POST $API_URL \
-H "Content-Type: application/json" \
-d "$json_payload")
# Extract the summary from the response
summary=$(echo $response | jq -r '.choices[0].message.content')
# Print the summary
# -H "Authorization: Bearer $API_KEY" \
{
echo 'message<<EOF'
echo "$summary"
echo EOF
} >> "$GITHUB_OUTPUT"
- name: Discord notification
env:
DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK_URL_RELEASE }}
DISCORD_USERNAME: "LocalAI-Bot"
DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
uses: Ilshidur/action-discord@master
with:
args: ${{ steps.summarize.outputs.message }}
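
Note: the notification jobs build the request body with `jq -n --arg` so that quotes and newlines in the diff or release notes stay valid JSON, then POST it to the local `/chat/completions` endpoint. A condensed sketch of that call; `some-model` is a placeholder, and `--arg model` is used here instead of the shell string splicing in the steps above:

```
- name: Query LocalAI (sketch)
  run: |
    json_payload=$(jq -n --arg model "some-model" --arg input "hello from CI" '{
      model: $model,
      messages: [{ role: "user", content: $input }]
    }')
    curl -s -X POST http://localhost:8080/chat/completions \
      -H "Content-Type: application/json" \
      -d "$json_payload" | jq -r '.choices[0].message.content'
```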

View File

@@ -1,28 +0,0 @@
name: Check PR style
on:
pull_request_target:
types:
- opened
- reopened
- edited
- synchronize
jobs:
title-lint:
runs-on: ubuntu-latest
permissions:
statuses: write
steps:
- uses: aslafy-z/conventional-pr-title-action@v3
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# check-pr-description:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v2
# - uses: jadrol/pr-description-checker-action@v1.0.0
# id: description-checker
# with:
# repo-token: ${{ secrets.GITHUB_TOKEN }}
# exempt-labels: no qa

View File

@@ -1,26 +1,312 @@
name: goreleaser name: Build and Release
on: on:
push: - push
tags: - pull_request
- 'v*'
env:
GRPC_VERSION: v1.64.0
permissions:
contents: write
concurrency:
group: ci-releases-${{ github.head_ref || github.ref }}-${{ github.repository }}
cancel-in-progress: true
jobs: jobs:
goreleaser:
build-linux-arm:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout - name: Clone
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
fetch-depth: 0 submodules: true
- name: Set up Go - uses: actions/setup-go@v5
uses: actions/setup-go@v5
with: with:
go-version: 1.23 go-version: '1.21.x'
- name: Run GoReleaser cache: false
uses: goreleaser/goreleaser-action@v6
with: - name: Dependencies
version: v2.11.0 run: |
args: release --clean sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cuda-cross-aarch64 cuda-nvcc-cross-aarch64-${CUDA_VERSION} libcublas-cross-aarch64-${CUDA_VERSION}
env: env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} CUDA_VERSION: 12-5
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v4
with:
path: grpc
key: ${{ runner.os }}-arm-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make --jobs 5 --output-sync=target
- name: Install gRPC
run: |
GNU_HOST=aarch64-linux-gnu
C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
CROSS_TOOLCHAIN=/usr/$GNU_HOST
CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
# https://cmake.org/cmake/help/v3.13/manual/cmake-toolchains.7.html#cross-compiling-for-linux
echo "set(CMAKE_SYSTEM_NAME Linux)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_SYSTEM_PROCESSOR arm)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_STAGING_PREFIX $CROSS_STAGING_PREFIX)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_SYSROOT ${CROSS_TOOLCHAIN}/sysroot)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_C_COMPILER /usr/bin/$C_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_CXX_COMPILER /usr/bin/$CXX_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
echo "set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN
GRPC_DIR=$PWD/grpc
cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install && \
GRPC_CROSS_BUILD_DIR=$GRPC_DIR/cmake/cross_build && \
mkdir -p $GRPC_CROSS_BUILD_DIR && \
cd $GRPC_CROSS_BUILD_DIR && \
cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$CROSS_TOOLCHAIN/grpc_install \
../.. && \
sudo make -j`nproc` install
- name: Build
id: build
run: |
GNU_HOST=aarch64-linux-gnu
C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
CROSS_TOOLCHAIN=/usr/$GNU_HOST
CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
export PATH=$PATH:$GOPATH/bin
export PATH=/usr/local/cuda/bin:$PATH
sudo rm -rf /usr/aarch64-linux-gnu/lib/libstdc++.so.6
sudo cp -rf /usr/aarch64-linux-gnu/lib/libstdc++.so* /usr/aarch64-linux-gnu/lib/libstdc++.so.6
sudo cp /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 ld.so
GO_TAGS=p2p \
BACKEND_LIBS="./grpc/cmake/cross_build/third_party/re2/libre2.a ./grpc/cmake/cross_build/libgrpc.a ./grpc/cmake/cross_build/libgrpc++.a ./grpc/cmake/cross_build/third_party/protobuf/libprotobuf.a /usr/aarch64-linux-gnu/lib/libc.so.6 /usr/aarch64-linux-gnu/lib/libstdc++.so.6 /usr/aarch64-linux-gnu/lib/libgomp.so.1 /usr/aarch64-linux-gnu/lib/libm.so.6 /usr/aarch64-linux-gnu/lib/libgcc_s.so.1 /usr/aarch64-linux-gnu/lib/libdl.so.2 /usr/aarch64-linux-gnu/lib/libpthread.so.0 ./ld.so" \
GOOS=linux \
GOARCH=arm64 \
CMAKE_ARGS="-DProtobuf_INCLUDE_DIRS=$CROSS_STAGING_PREFIX/include -DProtobuf_DIR=$CROSS_STAGING_PREFIX/lib/cmake/protobuf -DgRPC_DIR=$CROSS_STAGING_PREFIX/lib/cmake/grpc -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++" make dist-cross-linux-arm64
- uses: actions/upload-artifact@v4
with:
name: LocalAI-linux-arm64
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
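
Note: the cross-compilation step above assembles `/tmp/arm.toolchain.cmake` out of chained `echo` calls. An equivalent sketch using a single heredoc, included only to make the resulting toolchain file easier to read; it is not the step the workflow actually runs:

```
- name: Write aarch64 CMake toolchain file (sketch)
  run: |
    GNU_HOST=aarch64-linux-gnu
    CROSS_TOOLCHAIN=/usr/$GNU_HOST
    cat > /tmp/arm.toolchain.cmake <<EOF
    set(CMAKE_SYSTEM_NAME Linux)
    set(CMAKE_SYSTEM_PROCESSOR arm)
    set(CMAKE_STAGING_PREFIX $CROSS_TOOLCHAIN/stage)
    set(CMAKE_SYSROOT $CROSS_TOOLCHAIN/sysroot)
    set(CMAKE_C_COMPILER /usr/bin/$GNU_HOST-gcc)
    set(CMAKE_CXX_COMPILER /usr/bin/$GNU_HOST-g++)
    set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
    set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
    set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
    set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
    EOF
```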
build-linux:
runs-on: arc-runner-set
steps:
- name: Force Install GIT latest
run: |
sudo apt-get update \
&& sudo apt-get install -y software-properties-common \
&& sudo apt-get update \
&& sudo add-apt-repository -y ppa:git-core/ppa \
&& sudo apt-get update \
&& sudo apt-get install -y git
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache cmake
- name: Intel Dependencies
run: |
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
sudo apt update
sudo apt install -y intel-basekit
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
env:
CUDA_VERSION: 12-3
- name: "Install Hipblas"
env:
ROCM_VERSION: "6.1"
AMDGPU_VERSION: "6.1"
run: |
set -ex
sudo apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg
curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | sudo tee /etc/apt/sources.list.d/rocm.list
printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt-get update
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \
hipblas-dev rocm-dev \
rocblas-dev
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/*
sudo ldconfig
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v4
with:
path: grpc
key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make --jobs 5 --output-sync=target
- name: Install gRPC
run: |
cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
# BACKEND_LIBS needed for gpu-workload: /opt/intel/oneapi/*/lib/libiomp5.so /opt/intel/oneapi/*/lib/libmkl_core.so /opt/intel/oneapi/*/lib/libmkl_core.so.2 /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so.2 /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so.4 /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so.2 /opt/intel/oneapi/*/lib/libsycl.so /opt/intel/oneapi/*/lib/libsycl.so.7 /opt/intel/oneapi/*/lib/libsycl.so.7.1.0 /opt/rocm-*/lib/libamdhip64.so /opt/rocm-*/lib/libamdhip64.so.5 /opt/rocm-*/lib/libamdhip64.so.6 /opt/rocm-*/lib/libamdhip64.so.6.1.60100 /opt/rocm-*/lib/libhipblas.so /opt/rocm-*/lib/libhipblas.so.2 /opt/rocm-*/lib/libhipblas.so.2.1.60100 /opt/rocm-*/lib/librocblas.so /opt/rocm-*/lib/librocblas.so.4 /opt/rocm-*/lib/librocblas.so.4.1.60100 /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 /usr/lib/x86_64-linux-gnu/libc.so.6 /usr/lib/x86_64-linux-gnu/librt.so.1 /usr/local/cuda-*/targets/x86_64-linux/lib/libcublas.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcublasLt.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcudart.so /usr/local/cuda-*/targets/x86_64-linux/lib/stubs/libcuda.so
- name: Build
id: build
run: |
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
export PATH=$PATH:$GOPATH/bin
export PATH=/usr/local/cuda/bin:$PATH
export PATH=/opt/rocm/bin:$PATH
source /opt/intel/oneapi/setvars.sh
sudo cp /lib64/ld-linux-x86-64.so.2 ld.so
GO_TAGS=p2p \
BACKEND_LIBS="./ld.so /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 /usr/lib/x86_64-linux-gnu/libc.so.6 /usr/lib/x86_64-linux-gnu/libgomp.so.1" \
make -j4 dist
- uses: actions/upload-artifact@v4
with:
name: LocalAI-linux
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
build-stablediffusion:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
- name: Build stablediffusion
run: |
export PATH=$PATH:$GOPATH/bin
make backend-assets/grpc/stablediffusion
mkdir -p release && cp backend-assets/grpc/stablediffusion release
env:
GO_TAGS: stablediffusion
- uses: actions/upload-artifact@v4
with:
name: stablediffusion
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
build-macOS-arm64:
runs-on: macos-14
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- uses: actions/setup-go@v5
with:
go-version: '1.21.x'
cache: false
- name: Dependencies
run: |
brew install protobuf grpc
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
- name: Build
id: build
run: |
export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include
export PATH=$PATH:$GOPATH/bin
BACKEND_LIBS="$(ls /opt/homebrew/opt/grpc/lib/*.dylib /opt/homebrew/opt/re2/lib/*.dylib /opt/homebrew/opt/openssl@3/lib/*.dylib /opt/homebrew/opt/protobuf/lib/*.dylib /opt/homebrew/opt/abseil/lib/*.dylib | xargs)" GO_TAGS=p2p make dist
- uses: actions/upload-artifact@v4
with:
name: LocalAI-MacOS-arm64
path: release/
- name: Release
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/')
with:
files: |
release/*
- name: Setup tmate session if tests fail
if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.18
with:
detached: true
connect-timeout-seconds: 180
limit-access-to-actor: true
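
Note: every build job in this release workflow ends with the same two steps: upload the contents of `release/` as a workflow artifact, then attach the same files to the GitHub release when the ref is a tag. The shared tail, with a placeholder artifact name:

```
- uses: actions/upload-artifact@v4
  with:
    name: LocalAI-<target>   # placeholder; each job above uses its own name
    path: release/
- name: Release
  uses: softprops/action-gh-release@v2
  if: startsWith(github.ref, 'refs/tags/')
  with:
    files: |
      release/*
```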

View File

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }} if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner - name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }} if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.22.7 uses: securego/gosec@master
with: with:
# we let the report trigger content trigger a failure using the GitHub Security features. # we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...' args: '-no-fail -fmt sarif -out results.sarif ./...'

View File

@@ -1,24 +0,0 @@
name: 'Close stale issues and PRs'
permissions:
issues: write
pull-requests: write
on:
schedule:
- cron: '30 1 * * *'
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9
with:
stale-issue-message: 'This issue is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
stale-pr-message: 'This PR is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 10 days.'
close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.'
close-pr-message: 'This PR was closed because it has been stalled for 10 days with no activity.'
days-before-issue-stale: 90
days-before-pr-stale: 90
days-before-issue-close: 5
days-before-pr-close: 10
exempt-issue-labels: 'roadmap'
exempt-pr-labels: 'roadmap'

View File

@@ -14,34 +14,12 @@ concurrency:
cancel-in-progress: true cancel-in-progress: true
jobs: jobs:
# Requires CUDA
# tests-chatterbox-tts:
# runs-on: ubuntu-latest
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# submodules: true
# - name: Dependencies
# run: |
# sudo apt-get update
# sudo apt-get install build-essential ffmpeg
# # Install UV
# curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1
# - name: Test chatterbox-tts
# run: |
# make --jobs=5 --output-sync=target -C backend/python/chatterbox
# make --jobs=5 --output-sync=target -C backend/python/chatterbox test
tests-transformers: tests-transformers:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Clone - name: Clone
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
submodules: true submodules: true
- name: Dependencies - name: Dependencies
run: | run: |
@@ -51,18 +29,42 @@ jobs:
curl -LsSf https://astral.sh/uv/install.sh | sh curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1 pip install --user grpcio-tools==1.64.0
- name: Test transformers - name: Test transformers
run: | run: |
make --jobs=5 --output-sync=target -C backend/python/transformers make --jobs=5 --output-sync=target -C backend/python/transformers
make --jobs=5 --output-sync=target -C backend/python/transformers test make --jobs=5 --output-sync=target -C backend/python/transformers test
tests-sentencetransformers:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.64.0
- name: Test sentencetransformers
run: |
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
tests-rerankers: tests-rerankers:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Clone - name: Clone
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
submodules: true submodules: true
- name: Dependencies - name: Dependencies
run: | run: |
@@ -72,7 +74,7 @@ jobs:
curl -LsSf https://astral.sh/uv/install.sh | sh curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev sudo apt-get install -y libopencv-dev
pip install --user --no-cache-dir grpcio-tools==1.64.1 pip install --user grpcio-tools==1.64.0
- name: Test rerankers - name: Test rerankers
run: | run: |
@@ -84,7 +86,7 @@ jobs:
steps: steps:
- name: Clone - name: Clone
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
submodules: true submodules: true
- name: Dependencies - name: Dependencies
run: | run: |
@@ -94,38 +96,86 @@ jobs:
sudo apt-get install -y libopencv-dev sudo apt-get install -y libopencv-dev
# Install UV # Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh curl -LsSf https://astral.sh/uv/install.sh | sh
pip install --user --no-cache-dir grpcio-tools==1.64.1 pip install --user grpcio-tools==1.64.0
- name: Test diffusers - name: Test diffusers
run: | run: |
make --jobs=5 --output-sync=target -C backend/python/diffusers make --jobs=5 --output-sync=target -C backend/python/diffusers
make --jobs=5 --output-sync=target -C backend/python/diffusers test make --jobs=5 --output-sync=target -C backend/python/diffusers test
#tests-vllm: tests-parler-tts:
# runs-on: ubuntu-latest runs-on: ubuntu-latest
# steps: steps:
# - name: Clone - name: Clone
# uses: actions/checkout@v4 uses: actions/checkout@v4
# with: with:
# submodules: true submodules: true
# - name: Dependencies - name: Dependencies
# run: | run: |
# sudo apt-get update sudo apt-get update
# sudo apt-get install -y build-essential ffmpeg sudo apt-get install build-essential ffmpeg
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip # Install UV
# sudo apt-get install -y libopencv-dev curl -LsSf https://astral.sh/uv/install.sh | sh
# # Install UV sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# curl -LsSf https://astral.sh/uv/install.sh | sh sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1 pip install --user grpcio-tools==1.64.0
# - name: Test vllm backend
# run: | - name: Test parler-tts
# make --jobs=5 --output-sync=target -C backend/python/vllm run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm test make --jobs=5 --output-sync=target -C backend/python/parler-tts
# tests-transformers-musicgen: make --jobs=5 --output-sync=target -C backend/python/parler-tts test
tests-openvoice:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.64.0
- name: Test openvoice
run: |
make --jobs=5 --output-sync=target -C backend/python/openvoice
make --jobs=5 --output-sync=target -C backend/python/openvoice test
tests-transformers-musicgen:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.64.0
- name: Test transformers-musicgen
run: |
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
# tests-petals:
# runs-on: ubuntu-latest # runs-on: ubuntu-latest
# steps: # steps:
# - name: Clone # - name: Clone
# uses: actions/checkout@v4 # uses: actions/checkout@v4
# with: # with:
# submodules: true # submodules: true
# - name: Dependencies # - name: Dependencies
# run: | # run: |
@@ -135,12 +185,14 @@ jobs:
# curl -LsSf https://astral.sh/uv/install.sh | sh # curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip # sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev # sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1 # pip install --user grpcio-tools==1.64.0
# - name: Test transformers-musicgen # - name: Test petals
# run: | # run: |
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen # make --jobs=5 --output-sync=target -C backend/python/petals
# make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test # make --jobs=5 --output-sync=target -C backend/python/petals test
# tests-bark: # tests-bark:
# runs-on: ubuntu-latest # runs-on: ubuntu-latest
@@ -187,7 +239,7 @@ jobs:
# df -h # df -h
# - name: Clone # - name: Clone
# uses: actions/checkout@v4 # uses: actions/checkout@v4
# with: # with:
# submodules: true # submodules: true
# - name: Dependencies # - name: Dependencies
# run: | # run: |
@@ -197,14 +249,14 @@ jobs:
# curl -LsSf https://astral.sh/uv/install.sh | sh # curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip # sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev # sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1 # pip install --user grpcio-tools==1.64.0
# - name: Test bark # - name: Test bark
# run: | # run: |
# make --jobs=5 --output-sync=target -C backend/python/bark # make --jobs=5 --output-sync=target -C backend/python/bark
# make --jobs=5 --output-sync=target -C backend/python/bark test # make --jobs=5 --output-sync=target -C backend/python/bark test
# Below tests needs GPU. Commented out for now # Below tests needs GPU. Commented out for now
# TODO: Re-enable as soon as we have GPU nodes # TODO: Re-enable as soon as we have GPU nodes
# tests-vllm: # tests-vllm:
@@ -212,7 +264,7 @@ jobs:
# steps: # steps:
# - name: Clone # - name: Clone
# uses: actions/checkout@v4 # uses: actions/checkout@v4
# with: # with:
# submodules: true # submodules: true
# - name: Dependencies # - name: Dependencies
# run: | # run: |
@@ -222,18 +274,38 @@ jobs:
# curl -LsSf https://astral.sh/uv/install.sh | sh # curl -LsSf https://astral.sh/uv/install.sh | sh
# sudo apt-get install -y ca-certificates cmake curl patch python3-pip # sudo apt-get install -y ca-certificates cmake curl patch python3-pip
# sudo apt-get install -y libopencv-dev # sudo apt-get install -y libopencv-dev
# pip install --user --no-cache-dir grpcio-tools==1.64.1 # pip install --user grpcio-tools==1.64.0
# - name: Test vllm # - name: Test vllm
# run: | # run: |
# make --jobs=5 --output-sync=target -C backend/python/vllm # make --jobs=5 --output-sync=target -C backend/python/vllm
# make --jobs=5 --output-sync=target -C backend/python/vllm test # make --jobs=5 --output-sync=target -C backend/python/vllm test
tests-vallex:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg
# Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake curl patch python3-pip
sudo apt-get install -y libopencv-dev
pip install --user grpcio-tools==1.64.0
- name: Test vall-e-x
run: |
make --jobs=5 --output-sync=target -C backend/python/vall-e-x
make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
tests-coqui: tests-coqui:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Clone - name: Clone
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
submodules: true submodules: true
- name: Dependencies - name: Dependencies
run: | run: |
@@ -242,8 +314,8 @@ jobs:
sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
# Install UV # Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh curl -LsSf https://astral.sh/uv/install.sh | sh
pip install --user --no-cache-dir grpcio-tools==1.64.1 pip install --user grpcio-tools==1.64.0
- name: Test coqui - name: Test coqui
run: | run: |
make --jobs=5 --output-sync=target -C backend/python/coqui make --jobs=5 --output-sync=target -C backend/python/coqui
make --jobs=5 --output-sync=target -C backend/python/coqui test make --jobs=5 --output-sync=target -C backend/python/coqui test
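
Note: the Python backend jobs in this file all follow one template: checkout with submodules, install ffmpeg/build tools/uv/grpcio-tools, then run the backend's Makefile twice (build, then test). A sketch of that template for a hypothetical backend; `somebackend` is a placeholder, not an actual backend directory:

```
tests-somebackend:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
      with:
        submodules: true
    - name: Dependencies
      run: |
        sudo apt-get update
        sudo apt-get install -y build-essential ffmpeg ca-certificates cmake curl patch python3-pip libopencv-dev
        # Install UV
        curl -LsSf https://astral.sh/uv/install.sh | sh
        pip install --user grpcio-tools==1.64.0
    - name: Test somebackend
      run: |
        make --jobs=5 --output-sync=target -C backend/python/somebackend
        make --jobs=5 --output-sync=target -C backend/python/somebackend test
```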

View File

@@ -10,7 +10,7 @@ on:
- '*' - '*'
env: env:
GRPC_VERSION: v1.65.0 GRPC_VERSION: v1.64.0
concurrency: concurrency:
group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }} group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -67,20 +67,17 @@ jobs:
# You can test your matrix by printing the current Go version # You can test your matrix by printing the current Go version
- name: Display Go version - name: Display Go version
run: go version run: go version
- name: Proto Dependencies
run: |
# Install protoc
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Dependencies - name: Dependencies
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg sudo apt-get install build-essential curl ffmpeg
sudo apt-get install -y libgmock-dev clang curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
sudo apt-get update && \
sudo apt-get install -y conda
# Install UV # Install UV
curl -LsSf https://astral.sh/uv/install.sh | sh curl -LsSf https://astral.sh/uv/install.sh | sh
sudo apt-get install -y ca-certificates cmake patch python3-pip unzip sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
@@ -96,21 +93,45 @@ jobs:
sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION} sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
export CUDACXX=/usr/local/cuda/bin/nvcc export CUDACXX=/usr/local/cuda/bin/nvcc
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
# The python3-grpc-tools package in 22.04 is too old # The python3-grpc-tools package in 22.04 is too old
pip install --user grpcio-tools==1.71.0 grpcio==1.71.0 pip install --user grpcio-tools
make -C backend/python/transformers sudo rm -rfv /usr/bin/conda || true
PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
make backends/huggingface backends/llama-cpp backends/local-store backends/silero-vad backends/piper backends/whisper backends/stablediffusion-ggml # Pre-build piper before we start tests in order to have shared libraries in place
make sources/go-piper && \
GO_TAGS="tts" make -C sources/go-piper piper.o && \
sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
# Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
env: env:
CUDA_VERSION: 12-4 CUDA_VERSION: 12-3
- name: Cache grpc
id: cache-grpc
uses: actions/cache@v4
with:
path: grpc
key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
if: steps.cache-grpc.outputs.cache-hit != 'true'
run: |
git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --jobs 5 --shallow-submodules https://github.com/grpc/grpc && \
cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
../.. && sudo make --jobs 5
- name: Install gRPC
run: |
cd grpc && cd cmake/build && sudo make --jobs 5 install
- name: Test - name: Test
run: | run: |
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -155,21 +176,17 @@ jobs:
uses: actions/checkout@v4 uses: actions/checkout@v4
with: with:
submodules: true submodules: true
- name: Dependencies - name: Build images
run: | run: |
# Install protoc docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \ BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Test - name: Test
run: | run: |
PATH="$PATH:$HOME/go/bin" make backends/local-store backends/silero-vad backends/llama-cpp backends/whisper backends/piper backends/stablediffusion-ggml docker-build-aio e2e-aio LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
make run-e2e-aio
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
@@ -195,28 +212,18 @@ jobs:
run: go version run: go version
- name: Dependencies - name: Dependencies
run: | run: |
brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
pip install --user --no-cache-dir grpcio-tools==1.71.0 grpcio==1.71.0 pip install --user grpcio-tools==1.64.0
- name: Build llama-cpp-darwin
run: |
make protogen-go
make build
bash scripts/build-llama-cpp-darwin.sh
ls -la build/darwin.tar
mv build/darwin.tar build/llama-cpp.tar
./local-ai backends install "ocifile://$PWD/build/llama-cpp.tar"
- name: Test - name: Test
run: | run: |
export C_INCLUDE_PATH=/usr/local/include export C_INCLUDE_PATH=/usr/local/include
export CPLUS_INCLUDE_PATH=/usr/local/include export CPLUS_INCLUDE_PATH=/usr/local/include
export CC=/opt/homebrew/opt/llvm/bin/clang
# Used to run the newer GNUMake version from brew that supports --output-sync # Used to run the newer GNUMake version from brew that supports --output-sync
export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH" export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
PATH="$PATH:$HOME/go/bin" make protogen-go BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
PATH="$PATH:$HOME/go/bin" BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
- name: Setup tmate session if tests fail - name: Setup tmate session if tests fail
if: ${{ failure() }} if: ${{ failure() }}
uses: mxschmitt/action-tmate@v3.22 uses: mxschmitt/action-tmate@v3.18
with: with:
detached: true detached: true
connect-timeout-seconds: 180 connect-timeout-seconds: 180
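
Note: several jobs in this workflow and in the release workflow avoid rebuilding gRPC on every run by caching the checkout/build tree under a key derived from `GRPC_VERSION`, and only running the cmake build on a cache miss. A condensed sketch of that pattern as used above:

```
- name: Cache grpc
  id: cache-grpc
  uses: actions/cache@v4
  with:
    path: grpc
    key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
- name: Build grpc
  if: steps.cache-grpc.outputs.cache-hit != 'true'
  run: |
    git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc
    cd grpc && mkdir -p cmake/build && cd cmake/build
    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF ../..
    sudo make --jobs 5
- name: Install gRPC
  run: cd grpc/cmake/build && sudo make --jobs 5 install
```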

View File

@@ -13,19 +13,13 @@ jobs:
- uses: actions/setup-go@v5 - uses: actions/setup-go@v5
with: with:
go-version: 'stable' go-version: 'stable'
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install protobuf-compiler
- run: | - run: |
go install github.com/swaggo/swag/cmd/swag@latest go install github.com/swaggo/swag/cmd/swag@latest
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
- name: Bump swagger 🔧 - name: Bump swagger 🔧
run: | run: |
make protogen-go swagger make swagger
- name: Create Pull Request - name: Create Pull Request
uses: peter-evans/create-pull-request@v7 uses: peter-evans/create-pull-request@v6
with: with:
token: ${{ secrets.UPDATE_BOT_TOKEN }} token: ${{ secrets.UPDATE_BOT_TOKEN }}
push-to-fork: ci-forks/LocalAI push-to-fork: ci-forks/LocalAI

View File

@@ -8,7 +8,7 @@ jobs:
steps: steps:
- name: 'Checkout' - name: 'Checkout'
uses: actions/checkout@master uses: actions/checkout@master
- name: 'Yamllint model gallery' - name: 'Yamllint'
uses: karancode/yamllint-github-action@master uses: karancode/yamllint-github-action@master
with: with:
yamllint_file_or_dir: 'gallery' yamllint_file_or_dir: 'gallery'
@@ -16,11 +16,3 @@ jobs:
yamllint_comment: true yamllint_comment: true
env: env:
GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: 'Yamllint Backend gallery'
uses: karancode/yamllint-github-action@master
with:
yamllint_file_or_dir: 'backend'
yamllint_strict: false
yamllint_comment: true
env:
GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}

.gitignore vendored (14 changed lines)
View File

@@ -2,22 +2,17 @@
/sources/ /sources/
__pycache__/ __pycache__/
*.a *.a
*.o
get-sources get-sources
prepare-sources prepare-sources
/backend/cpp/llama-cpp/grpc-server /backend/cpp/llama/grpc-server
/backend/cpp/llama-cpp/llama.cpp /backend/cpp/llama/llama.cpp
/backend/cpp/llama-* /backend/cpp/llama-*
!backend/cpp/llama-cpp
/backends
/backend-images
/result.yaml
protoc
*.log *.log
go-ggml-transformers go-ggml-transformers
go-gpt2 go-gpt2
go-rwkv
whisper.cpp whisper.cpp
/bloomz /bloomz
go-bert go-bert
@@ -59,6 +54,3 @@ docs/static/gallery.html
# backend virtual environments # backend virtual environments
**/venv **/venv
# per-developer customization files for the development container
.devcontainer/customization/*

View File

@@ -1,33 +0,0 @@
version: 2
before:
hooks:
- make protogen-go
- go mod tidy
dist: release
source:
enabled: true
name_template: '{{ .ProjectName }}-{{ .Tag }}-source'
builds:
-
env:
- CGO_ENABLED=0
ldflags:
- -s -w
- -X "github.com/mudler/LocalAI/internal.Version={{ .Tag }}"
- -X "github.com/mudler/LocalAI/internal.Commit={{ .FullCommit }}"
goos:
- linux
- darwin
#- windows
goarch:
- amd64
- arm64
archives:
- formats: [ 'binary' ] # this removes the tar of the archives, leaving the binaries alone
name_template: local-ai-{{ .Tag }}-{{ .Os }}-{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}
checksum:
name_template: '{{ .ProjectName }}-{{ .Tag }}-checksums.txt'
snapshot:
version_template: "{{ .Tag }}-next"
changelog:
use: github-native
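
Note: a GoReleaser configuration like the one shown here can be exercised without publishing by running a snapshot build, which validates the `builds`/`archives` sections before tagging. A sketch only, assuming GoReleaser v2 is available on the runner or locally; this step is not part of any workflow above:

```
- name: GoReleaser dry run (sketch)
  run: goreleaser release --snapshot --clean
```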

.vscode/launch.json vendored (21 changed lines)
View File

@@ -3,12 +3,12 @@
"configurations": [ "configurations": [
{ {
"name": "Python: Current File", "name": "Python: Current File",
"type": "debugpy", "type": "python",
"request": "launch", "request": "launch",
"program": "${file}", "program": "${file}",
"console": "integratedTerminal", "console": "integratedTerminal",
"justMyCode": false, "justMyCode": false,
"cwd": "${fileDirname}", "cwd": "${workspaceFolder}/examples/langchain-chroma",
"env": { "env": {
"OPENAI_API_BASE": "http://localhost:8080/v1", "OPENAI_API_BASE": "http://localhost:8080/v1",
"OPENAI_API_KEY": "abc" "OPENAI_API_KEY": "abc"
@@ -19,16 +19,15 @@
"type": "go", "type": "go",
"request": "launch", "request": "launch",
"mode": "debug", "mode": "debug",
"program": "${workspaceRoot}", "program": "${workspaceFolder}/main.go",
"args": [], "args": [
"api"
],
"env": { "env": {
"LOCALAI_LOG_LEVEL": "debug", "C_INCLUDE_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
"LOCALAI_P2P": "true", "LIBRARY_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
"LOCALAI_FEDERATED": "true" "DEBUG": "true"
}, }
"buildFlags": ["-tags", "", "-v"],
"envFile": "${workspaceFolder}/.env",
"cwd": "${workspaceRoot}"
} }
] ]
} }

View File

@@ -15,6 +15,8 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
- [Documentation](#documentation) - [Documentation](#documentation)
- [Community and Communication](#community-and-communication) - [Community and Communication](#community-and-communication)
## Getting Started ## Getting Started
### Prerequisites ### Prerequisites
@@ -52,7 +54,7 @@ If you find a bug, have a feature request, or encounter any issues, please check
## Coding Guidelines ## Coding Guidelines
- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here. - No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
## Testing ## Testing
@@ -82,3 +84,5 @@ We are welcome the contribution of the documents, please open new PR or create a
- You can reach out via the Github issue tracker. - You can reach out via the Github issue tracker.
- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions) - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy) - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
---
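The coding-guidelines hunk above points contributors at `golangci-lint`. A minimal local pre-PR check could look like the sketch below; the `make test` target is an assumption about the repository Makefile, not something shown in this diff.

```bash
# Hedged sketch: local checks before opening a PR.
golangci-lint run ./...   # linter referenced in the contributing guide
make test                 # assumed Makefile target that runs the Go test suite
```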


@@ -1,61 +1,156 @@
ARG IMAGE_TYPE=extras
ARG BASE_IMAGE=ubuntu:22.04 ARG BASE_IMAGE=ubuntu:22.04
ARG GRPC_BASE_IMAGE=${BASE_IMAGE} ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
ARG INTEL_BASE_IMAGE=${BASE_IMAGE} ARG INTEL_BASE_IMAGE=${BASE_IMAGE}
FROM ${BASE_IMAGE} AS requirements # The requirements-core target is common to all images. It should not be placed in requirements-core unless every single build will use it.
FROM ${BASE_IMAGE} AS requirements-core
USER root
ARG GO_VERSION=1.22.4
ARG TARGETARCH
ARG TARGETVARIANT
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
ca-certificates curl wget espeak-ng libgomp1 \ build-essential \
python3 python-is-python3 ffmpeg && \ ccache \
ca-certificates \
cmake \
curl \
git \
unzip && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH $PATH:/root/go/bin:/usr/local/go/bin
# Install grpc compilers
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates
# Use the variables in subsequent instructions
RUN echo "Target Architecture: $TARGETARCH"
RUN echo "Target Variant: $TARGETVARIANT"
# Cuda
ENV PATH /usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH /opt/rocm/bin:${PATH}
# OpenBLAS requirements and stable diffusion
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libopenblas-dev \
libopencv-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Set up OpenCV
RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
WORKDIR /build
RUN test -n "$TARGETARCH" \
|| (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
###################################
###################################
# The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
FROM requirements-core AS requirements-extras
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.cargo/bin:${PATH}"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
RUN apt-get update && \
apt-get install -y --no-install-recommends \
espeak-ng \
espeak \
python3-pip \
python-is-python3 \
python3-dev \
python3-venv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
pip install --upgrade pip
# Install grpcio-tools (the version in 22.04 is too old)
RUN pip install --user grpcio-tools
###################################
###################################
# The requirements-drivers target is for BUILD_TYPE specific items. If you need to install something specific to CUDA, or specific to ROCM, it goes here. # The requirements-drivers target is for BUILD_TYPE specific items. If you need to install something specific to CUDA, or specific to ROCM, it goes here.
FROM requirements AS requirements-drivers # This target will be built on top of requirements-core or requirements-extras as determined by the IMAGE_TYPE build-arg
FROM requirements-${IMAGE_TYPE} AS requirements-drivers
ARG BUILD_TYPE ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=12 ARG CUDA_MAJOR_VERSION=12
ARG CUDA_MINOR_VERSION=0 ARG CUDA_MINOR_VERSION=5
ARG SKIP_DRIVERS=false
ARG TARGETARCH
ARG TARGETVARIANT
ENV BUILD_TYPE=${BUILD_TYPE}
RUN mkdir -p /run/localai ENV BUILD_TYPE=${BUILD_TYPE}
RUN echo "default" > /run/localai/capability
# Vulkan requirements # Vulkan requirements
RUN <<EOT bash RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then if [ "${BUILD_TYPE}" = "vulkan" ]; then
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \ software-properties-common pciutils wget gpg-agent && \
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt-get update && \ apt-get update && \
apt-get install -y \ apt-get install -y \
vulkan-sdk && \ vulkan-sdk && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* && \ rm -rf /var/lib/apt/lists/*
echo "vulkan" > /run/localai/capability
fi fi
EOT EOT
# CuBLAS requirements # CuBLAS requirements
RUN <<EOT bash RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then if [ "${BUILD_TYPE}" = "cublas" ]; then
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
software-properties-common pciutils software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
fi fi
if [ "arm64" = "$TARGETARCH" ]; then if [ "arm64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
fi fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils && \
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
dpkg -i cuda-keyring_1.1-1_all.deb && \ dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \ rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \ apt-get update && \
@@ -67,19 +162,11 @@ RUN <<EOT bash
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \ libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \ libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* && \ rm -rf /var/lib/apt/lists/* \
echo "nvidia" > /run/localai/capability ; fi
fi
EOT
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
echo "nvidia-l4t" > /run/localai/capability
fi
EOT
# If we are building with clblas support, we need the libraries for the builds # If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \ RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
libclblast-dev && \ libclblast-dev && \
@@ -87,95 +174,18 @@ RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
rm -rf /var/lib/apt/lists/* \ rm -rf /var/lib/apt/lists/* \
; fi ; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
apt-get update && \ apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
hipblas-dev \ hipblas-dev \
rocblas-dev && \ rocblas-dev && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* && \ rm -rf /var/lib/apt/lists/* && \
echo "amd" > /run/localai/capability && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig \ ldconfig \
; fi ; fi
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
###################################
###################################
# The requirements-core target is common to all images. It should not be placed in requirements-core unless every single build will use it.
FROM requirements-drivers AS build-requirements
ARG GO_VERSION=1.22.6
ARG CMAKE_VERSION=3.26.4
ARG CMAKE_FROM_SOURCE=false
ARG TARGETARCH
ARG TARGETVARIANT
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates espeak-ng \
curl libssl-dev \
git \
git-lfs \
unzip upx-ucl python3 python-is-python3 && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
# Install grpc compilers
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates
# OpenBLAS requirements and stable diffusion
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libopenblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN test -n "$TARGETARCH" \
|| (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
# Use the variables in subsequent instructions
RUN echo "Target Architecture: $TARGETARCH"
RUN echo "Target Variant: $TARGETVARIANT"
WORKDIR /build
################################### ###################################
################################### ###################################
@@ -186,39 +196,71 @@ FROM ${INTEL_BASE_IMAGE} AS intel
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \ RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" > /etc/apt/sources.list.d/intel-graphics.list RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" > /etc/apt/sources.list.d/intel-graphics.list
###################################
###################################
# The grpc target does one thing: it builds and installs GRPC. This is in its own layer so that it can be effectively cached by CI.
# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
FROM ${GRPC_BASE_IMAGE} AS grpc
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
ARG GRPC_VERSION=v1.64.2
ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
WORKDIR /build
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
intel-oneapi-runtime-libs && \ ca-certificates \
build-essential \
cmake \
git && \
apt-get clean && \ apt-get clean && \
rm -rf /var/lib/apt/lists/* rm -rf /var/lib/apt/lists/*
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
# and running make install in the target container
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
mkdir -p /build/grpc/cmake/build && \
cd /build/grpc/cmake/build && \
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
make && \
make install && \
rm -rf /build
################################### ###################################
################################### ###################################
# The builder-base target has the arguments, variables, and copies shared between full builder images and the uncompiled devcontainer # The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
# Adjustments to the build process should likely be made here.
FROM requirements-drivers AS builder
FROM build-requirements AS builder-base ARG GO_TAGS="stablediffusion tts p2p"
ARG GO_TAGS=""
ARG GRPC_BACKENDS ARG GRPC_BACKENDS
ARG MAKEFLAGS ARG MAKEFLAGS
ARG LD_FLAGS="-s -w"
ARG TARGETARCH
ARG TARGETVARIANT
ENV GRPC_BACKENDS=${GRPC_BACKENDS} ENV GRPC_BACKENDS=${GRPC_BACKENDS}
ENV GO_TAGS=${GO_TAGS} ENV GO_TAGS=${GO_TAGS}
ENV MAKEFLAGS=${MAKEFLAGS} ENV MAKEFLAGS=${MAKEFLAGS}
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0" ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
ENV NVIDIA_VISIBLE_DEVICES=all ENV NVIDIA_VISIBLE_DEVICES=all
ENV LD_FLAGS=${LD_FLAGS}
RUN echo "GO_TAGS: $GO_TAGS" && echo "TARGETARCH: $TARGETARCH"
WORKDIR /build WORKDIR /build
COPY . .
COPY .git .
RUN echo "GO_TAGS: $GO_TAGS"
# We need protoc installed, and the version in 22.04 is too old. RUN make prepare
# We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below
# but that will also being in a newer version of absl which stablediffusion cannot compile with. This version of protoc is only
# here so that we can generate the grpc code for the stablediffusion build
RUN <<EOT bash RUN <<EOT bash
if [ "amd64" = "$TARGETARCH" ]; then if [ "amd64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \ curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
@@ -232,61 +274,20 @@ RUN <<EOT bash
fi fi
EOT EOT
################################### # stablediffusion does not tolerate a newer version of abseil, build it first
################################### RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
# Compile backends first in a separate stage # Install the pre-built GRPC
FROM builder-base AS builder-backends COPY --from=grpc /opt/grpc /usr/local
ARG TARGETARCH
ARG TARGETVARIANT
# Rebuild with defaults backends
WORKDIR /build WORKDIR /build
COPY ./Makefile .
COPY ./backend ./backend
COPY ./go.mod .
COPY ./go.sum .
COPY ./.git ./.git
# Some of the Go backends use libs from the main src, we could further optimize the caching by building the CPP backends before here
COPY ./pkg/grpc ./pkg/grpc
COPY ./pkg/utils ./pkg/utils
COPY ./pkg/langchain ./pkg/langchain
RUN ls -l ./
RUN make protogen-go
# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
# Adjustments to the build process should likely be made here.
FROM builder-backends AS builder
WORKDIR /build
COPY . .
## Build the binary
## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
## Otherwise just run the normal build
RUN make build RUN make build
################################### RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
################################### mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
# The devcontainer target is not used on CI. It is a target for developers to use locally - ; fi
# rather than copying files it mounts them locally and leaves building to the developer
FROM builder-base AS devcontainer
COPY .devcontainer-scripts /.devcontainer-scripts
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ssh less
# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
RUN go install github.com/go-delve/delve/cmd/dlv@latest
RUN go install github.com/mikefarah/yq/v4@latest
################################### ###################################
################################### ###################################
@@ -295,27 +296,118 @@ RUN go install github.com/mikefarah/yq/v4@latest
# If you cannot find a more suitable place for an addition, this layer is a suitable place for it. # If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
FROM requirements-drivers FROM requirements-drivers
ARG FFMPEG
ARG BUILD_TYPE
ARG TARGETARCH
ARG IMAGE_TYPE=extras
ARG EXTRA_BACKENDS
ARG MAKEFLAGS
ENV BUILD_TYPE=${BUILD_TYPE}
ENV REBUILD=false
ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
ENV MAKEFLAGS=${MAKEFLAGS}
ARG CUDA_MAJOR_VERSION=12 ARG CUDA_MAJOR_VERSION=12
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0" ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
ENV NVIDIA_VISIBLE_DEVICES=all ENV NVIDIA_VISIBLE_DEVICES=all
WORKDIR / # Add FFmpeg
RUN if [ "${FFMPEG}" = "true" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
ffmpeg && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
COPY ./entrypoint.sh . WORKDIR /build
# we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
# so when `entrypoint.sh` runs `make build` again (which it does by default), the build would fail
# see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
# https://github.com/go-skynet/LocalAI/pull/434
COPY . .
COPY --from=builder /build/sources ./sources/
COPY --from=grpc /opt/grpc /usr/local
RUN make prepare-sources
# Copy the binary # Copy the binary
COPY --from=builder /build/local-ai ./ COPY --from=builder /build/local-ai ./
# Copy shared libraries for piper
COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
# do not let stablediffusion rebuild (requires an older version of absl)
COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
# Change the shell to bash so we can use [[ tests below
SHELL ["/bin/bash", "-c"]
# We try to strike a balance between individual layer size (as that affects total push time) and total image size
# Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
# Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/coqui \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/parler-tts \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/diffusers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/transformers-musicgen \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/exllama \
; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vall-e-x \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/openvoice \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "petals" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/petals \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/sentencetransformers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/exllama2 \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/transformers \
; fi
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vllm \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/autogptq \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/bark \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/rerankers \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/mamba \
; fi
# Make sure the models directory exists # Make sure the models directory exists
RUN mkdir -p /models /backends RUN mkdir -p /build/models
# Define the health check command # Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \ HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1 CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1
VOLUME /models /backends VOLUME /build/models
EXPOSE 8080 EXPOSE 8080
ENTRYPOINT [ "/entrypoint.sh" ] ENTRYPOINT [ "/build/entrypoint.sh" ]
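The Dockerfile above is driven entirely by build arguments (`IMAGE_TYPE`, `BUILD_TYPE`, the CUDA version pair, `GO_TAGS`, `MAKEFLAGS`, and so on). A hedged sketch of a local image build with those arguments, assuming BuildKit/buildx so that `TARGETARCH` is populated automatically; the values are illustrative only:

```bash
# Every --build-arg below maps to an ARG declared in the Dockerfile above.
docker buildx build \
  --build-arg IMAGE_TYPE=extras \
  --build-arg BUILD_TYPE=cublas \
  --build-arg CUDA_MAJOR_VERSION=12 \
  --build-arg CUDA_MINOR_VERSION=0 \
  --build-arg GO_TAGS="p2p tts" \
  -t localai:custom --load .
```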

Earthfile (new file, 5 lines)

@@ -0,0 +1,5 @@
VERSION 0.7
build:
FROM DOCKERFILE -f Dockerfile .
SAVE ARTIFACT /usr/bin/local-ai AS LOCAL local-ai
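The Earthfile above just wraps the Dockerfile and exports the compiled binary. With the Earthly CLI installed, the target can be invoked as in this sketch:

```bash
# Runs the +build target defined above and saves ./local-ai as a local artifact.
earthly +build
```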


@@ -1,6 +1,6 @@
MIT License MIT License
Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io) Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

Makefile (884 lines changed)

File diff suppressed because it is too large.

README.md (205 lines changed)

@@ -1,6 +1,7 @@
<h1 align="center"> <h1 align="center">
<br> <br>
<img width="300" src="./core/http/static/logo.png"> <br> <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
LocalAI
<br> <br>
</h1> </h1>
@@ -30,197 +31,72 @@
<p align="center"> <p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank"> <a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/> <img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
</a> </a>
<a href="https://discord.gg/uJAeKSAGDy" target="blank"> <a href="https://discord.gg/uJAeKSAGDy" target="blank">
<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/> <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
</a> </a>
</p> </p>
<p align="center">
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/) > :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
> >
> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🥽 Demo](https://demo.localai.io) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on > [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/localaiofficial_bot)
[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai) [![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler). **LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API thats compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).
![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)
## 📚🆕 Local Stack Family
🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:
<table>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalAGI">
<img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
<p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
</td>
</tr>
<tr>
<td width="50%" valign="top">
<a href="https://github.com/mudler/LocalRecall">
<img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
</a>
</td>
<td width="50%" valign="top">
<h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
<p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
</td>
</tr>
</table>
## Screenshots
| Talk Interface | Generate Audio |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
| Models Overview | Generate Images |
| --- | --- |
| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
| Chat Interface | Home |
| --- | --- |
| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
| Login | Swarm |
| --- | --- |
|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
## 💻 Quickstart
Run the installer script: Run the installer script:
```bash ```bash
# Basic installation
curl https://localai.io/install.sh | sh curl https://localai.io/install.sh | sh
``` ```
For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).
Or run with docker: Or run with docker:
### CPU only image:
```bash ```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
```
### NVIDIA GPU Images:
```bash
# CUDA 12.0
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
# CUDA 11.7
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11
# NVIDIA Jetson (L4T) ARM64
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64
```
### AMD GPU Images (ROCm):
```bash
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas
```
### Intel GPU Images (oneAPI):
```bash
# Intel GPU with FP16 support
docker run -ti --name local-ai -p 8080:8080 --device=/dev/dri/card1 --device=/dev/dri/renderD128 localai/localai:latest-gpu-intel-f16
# Intel GPU with FP32 support
docker run -ti --name local-ai -p 8080:8080 --device=/dev/dri/card1 --device=/dev/dri/renderD128 localai/localai:latest-gpu-intel-f32
```
### Vulkan GPU Images:
```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
```
### AIO Images (pre-downloaded models):
```bash
# CPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
# Alternative images:
# NVIDIA CUDA 12 version # - if you have an Nvidia GPU:
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12 # docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
# - without preconfigured models
# NVIDIA CUDA 11 version # docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-11 # - without preconfigured models for Nvidia GPUs
# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
# Intel GPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel-f16
# AMD GPU version
docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-aio-gpu-hipblas
``` ```
For more information about the AIO images and pre-downloaded models, see [Container Documentation](https://localai.io/basics/container/). [💻 Getting started](https://localai.io/basics/getting_started/index.html)
To load models: ## 🔥🔥 Hot topics / Roadmap
```bash [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
# From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io)
local-ai run llama-3.2-1b-instruct:q4_k_m
# Start LocalAI with the phi-2 model directly from huggingface
local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
# Install and run a model from the Ollama OCI registry
local-ai run ollama://gemma:2b
# Run a model from a configuration file
local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
# Install and run a model from a standard OCI registry (e.g., Docker Hub)
local-ai run oci://localai/phi-2:latest
```
For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html) - 🆕 You can browse now the model gallery without LocalAI! Check out https://models.localai.io
- 🔥🔥 Decentralized llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
- 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
- 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
- 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
- Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- Reranker API: https://github.com/mudler/LocalAI/pull/2121
## 📰 Latest project news Hot topics (looking for contributors):
- July 2025: All backends migrated outside of the main binary. LocalAI is now more lightweight, small, and automatically downloads the required backend to run the model. [Read the release notes](https://github.com/mudler/LocalAI/releases/tag/v3.2.0) - WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- June 2025: [Backend management](https://github.com/mudler/LocalAI/pull/5607) has been added. Attention: extras images are going to be deprecated from the next release! Read [the backend management PR](https://github.com/mudler/LocalAI/pull/5607). - Backends v2: https://github.com/mudler/LocalAI/issues/1126
- May 2025: [Audio input](https://github.com/mudler/LocalAI/pull/5466) and [Reranking](https://github.com/mudler/LocalAI/pull/5396) in llama.cpp backend, [Realtime API](https://github.com/mudler/LocalAI/pull/5392), Support to Gemma, SmollVLM, and more multimodal models (available in the gallery). - Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- May 2025: Important: image name changes [See release](https://github.com/mudler/LocalAI/releases/tag/v2.29.0) - Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Apr 2025: Rebrand, WebUI enhancements - Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack. - Vulkan: https://github.com/mudler/LocalAI/issues/1647
- Apr 2025: WebUI overhaul, AIO images updates
- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
- Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
- Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723. P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- May 2024: 🔥🔥 Decentralized P2P llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
- May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
## 🚀 [Features](https://localai.io/features/) ## 🚀 [Features](https://localai.io/features/)
- 🧩 [Backend Gallery](https://localai.io/backends/): Install/remove backends on the fly, powered by OCI images — fully customizable and API-driven. - 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/) - 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`) - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation](https://localai.io/features/image-generation) - 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/) - 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/)
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/) - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/) - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
@@ -228,10 +104,10 @@ Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3A
- 🥽 [Vision API](https://localai.io/features/gpt-vision/) - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 📈 [Reranker API](https://localai.io/features/reranker/) - 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/) - 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
- [Agentic capabilities](https://github.com/mudler/LocalAGI)
- 🔊 Voice activity detection (Silero-VAD support)
- 🌍 Integrated WebUI!
## 💻 Usage
Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
### 🔗 Community and integrations ### 🔗 Community and integrations
@@ -249,7 +125,6 @@ Model galleries
Other: Other:
- Helm chart https://github.com/go-skynet/helm-charts - Helm chart https://github.com/go-skynet/helm-charts
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin - VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
- Terminal utility https://github.com/djcopley/ShellOracle - Terminal utility https://github.com/djcopley/ShellOracle
- Local Smart assistant https://github.com/mudler/LocalAGI - Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision - Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
@@ -257,10 +132,6 @@ Other:
- Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
- Shell-Pilot(Interact with LLM using LocalAI models via pure shell scripts on your Linux or MacOS system) https://github.com/reid41/shell-pilot - Shell-Pilot(Interact with LLM using LocalAI models via pure shell scripts on your Linux or MacOS system) https://github.com/reid41/shell-pilot
- Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
- Another Telegram Bot https://github.com/JackBekket/Hellper
- Auto-documentation https://github.com/JackBekket/Reflexia
- Github bot which answer on issues, with code and documentation as context https://github.com/JackBekket/GitHelper
- Github Actions: https://github.com/marketplace/actions/start-localai
- Examples: https://github.com/mudler/LocalAI/tree/master/examples/ - Examples: https://github.com/mudler/LocalAI/tree/master/examples/
@@ -274,7 +145,6 @@ Other:
## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social) ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)
- [Run Visual studio code with LocalAI (SUSE)](https://www.suse.com/c/running-ai-locally/)
- 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/) - 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/)
- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/) - [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
- [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance) - [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
@@ -307,7 +177,7 @@ A huge thank you to our generous sponsors who support this project covering CI e
<p align="center"> <p align="center">
<a href="https://www.spectrocloud.com/" target="blank"> <a href="https://www.spectrocloud.com/" target="blank">
<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962"> <img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
</a> </a>
<a href="https://www.premai.io/" target="blank"> <a href="https://www.premai.io/" target="blank">
<img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br> <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
@@ -334,6 +204,7 @@ LocalAI couldn't have been built without the help of great software already avai
- https://github.com/antimatter15/alpaca.cpp - https://github.com/antimatter15/alpaca.cpp
- https://github.com/EdVince/Stable-Diffusion-NCNN - https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/ggerganov/whisper.cpp - https://github.com/ggerganov/whisper.cpp
- https://github.com/saharNooby/rwkv.cpp
- https://github.com/rhasspy/piper - https://github.com/rhasspy/piper
## 🤗 Contributors ## 🤗 Contributors


@@ -1,8 +1,7 @@
embeddings: true
name: text-embedding-ada-002 name: text-embedding-ada-002
backend: llama-cpp backend: bert-embeddings
parameters: parameters:
model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
usage: | usage: |
You can test this model with curl like this: You can test this model with curl like this:
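The usage text above is truncated in this view. A typical call against the OpenAI-compatible embeddings endpoint, assuming the config name shown (`text-embedding-ada-002`), would look roughly like this hedged sketch:

```bash
curl http://localhost:8080/v1/embeddings -H "Content-Type: application/json" \
  -d '{"model": "text-embedding-ada-002", "input": "LocalAI embeddings test"}'
```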


@@ -1,17 +1,56 @@
name: stablediffusion name: stablediffusion
backend: stablediffusion-ggml backend: stablediffusion
cfg_scale: 4.5
options:
- sampler:euler
parameters: parameters:
model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf model: stablediffusion_assets
step: 25
license: "BSD-3"
urls:
- https://github.com/EdVince/Stable-Diffusion-NCNN
- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
description: |
Stable Diffusion in NCNN with c++, supported txt2img and img2img
download_files: download_files:
- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf" - filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f" sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf" uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
- filename: "stablediffusion_assets/log_sigmas.bin"
sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
- filename: "stablediffusion_assets/vocab.txt"
sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
usage: | usage: |
curl http://localhost:8080/v1/images/generations \ curl http://localhost:8080/v1/images/generations \


@@ -1,13 +1,7 @@
name: jina-reranker-v1-base-en name: jina-reranker-v1-base-en
reranking: true backend: rerankers
f16: true
parameters: parameters:
model: jina-reranker-v1-tiny-en.f16.gguf model: cross-encoder
backend: llama-cpp
download_files:
- filename: jina-reranker-v1-tiny-en.f16.gguf
sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
uri: huggingface://mradermacher/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en.f16.gguf
usage: | usage: |
You can test this model with curl like this: You can test this model with curl like this:
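As above, the usage snippet is cut off in this view. LocalAI advertises a reranker API; the route and field names below are assumptions for illustration, not taken from this diff:

```bash
# Hedged sketch: Jina-style rerank request (endpoint and payload assumed).
curl http://localhost:8080/v1/rerank -H "Content-Type: application/json" \
  -d '{"model": "jina-reranker-v1-base-en", "query": "organic skincare", "documents": ["doc one", "doc two"], "top_n": 2}'
```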


@@ -2,7 +2,7 @@ name: tts-1
download_files: download_files:
- filename: voice-en-us-amy-low.tar.gz - filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
backend: piper
parameters: parameters:
model: en-us-amy-low.onnx model: en-us-amy-low.onnx
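A hedged sketch of exercising the piper voice configured above; the `/tts` route and its payload fields are assumptions about LocalAI's TTS API rather than something shown in this diff:

```bash
curl http://localhost:8080/tts -H "Content-Type: application/json" \
  -d '{"model": "tts-1", "input": "Hello from LocalAI"}' \
  -o hello.wav
```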


@@ -1,58 +1,101 @@
context_size: 8192
f16: true
backend: llama-cpp
function:
grammar:
no_mixed_free_string: true
schema_type: llama3.1 # or JSON is supported too (json)
response_regex:
- <function=(?P<name>\w+)>(?P<arguments>.*)</function>
mmap: true
name: gpt-4 name: gpt-4
mmap: true
parameters: parameters:
model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords: stopwords:
- <|im_end|> - "<|im_end|>"
- <dummy32000> - "<dummy32000>"
- <|eot_id|> - "</tool_call>"
- <|end_of_text|> - "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template: template:
chat: | chat: |
<|begin_of_text|><|start_header_id|>system<|end_header_id|> {{.Input -}}
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|> <|im_start|>assistant
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: | chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|> <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{ if .FunctionCall -}} {{- if .FunctionCall }}
{{ else if eq .RoleName "tool" -}} <tool_call>
The Function was executed and the response was: {{- else if eq .RoleName "tool" }}
{{ end -}} <tool_response>
{{ if .Content -}} {{- end }}
{{.Content -}} {{- if .Content}}
{{ else if .FunctionCall -}} {{.Content }}
{{ range .FunctionCall }} {{- end }}
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})] {{- if .FunctionCall}}
{{ end }} {{toJson .FunctionCall}}
{{ end -}} {{- end }}
<|eot_id|> {{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: | completion: |
{{.Input}} {{.Input}}
function: | function: |-
<|start_header_id|>system<|end_header_id|> <|im_start|>system
You are an expert in composing functions. You are given a question and a set of possible functions. You are a function calling AI model.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose. Here are the available tools:
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections. <tools>
If you decide to invoke any of the function(s), you MUST put it in the format as follows: {{range .Functions}}
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)] {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
You SHOULD NOT include any other text in the response. {{end}}
Here is a list of functions in JSON format that you can invoke. </tools>
{{toJson .Functions}} You should call the tools provided to you sequentially
<|eot_id|><|start_header_id|>user<|end_header_id|> Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
{{.Input}} <scratchpad>
<|eot_id|><|start_header_id|>assistant<|end_header_id|> {step-by-step reasoning and plan in bullet points}
</scratchpad>
download_files: For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf <tool_call>
sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5 {"arguments": <args-dict>, "name": <function-name>}
uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf </tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
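For reference, a request like the one below exercises the function-calling path that the grammar and json_regex_match settings above are meant to handle. This is only a sketch: the host, port and the gpt-4 alias are assumptions based on the default LocalAI AIO setup shown elsewhere in this diff, and get_current_weather is a made-up tool.

```bash
# Hypothetical example: ask the model to call a (made-up) get_current_weather tool
# through the OpenAI-compatible chat endpoint of a locally running LocalAI instance.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "What is the weather like in Boston?"}],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_current_weather",
        "description": "Get the current weather for a city",
        "parameters": {
          "type": "object",
          "properties": {"city": {"type": "string"}},
          "required": ["city"]
        }
      }
    }],
    "tool_choice": "auto"
  }'
```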

View File

@@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
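The silero-vad config removed above is normally driven through LocalAI's voice-activity-detection endpoint. The request below is only an assumed shape, derived from the VADRequest message that also disappears later in this diff (a flat array of audio samples); the /vad path, port and field names are not confirmed by this page.

```bash
# Assumed request shape for the silero-vad model: a short array of PCM float samples.
curl http://localhost:8080/vad \
  -H "Content-Type: application/json" \
  -d '{
    "model": "silero-vad",
    "audio": [0.0, 0.01, -0.02, 0.03, 0.0]
  }'
```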

View File

@@ -1,50 +1,31 @@
backend: llama-cpp
context_size: 4096 context_size: 4096
f16: true f16: true
backend: llama-cpp
mmap: true mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf name: gpt-4-vision-preview
name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters: parameters:
model: minicpm-v-2_6-Q4_K_M.gguf model: bakllava.gguf
stopwords:
- <|im_end|>
- <dummy32000>
- </s>
- <|endoftext|>
template: template:
chat: | chat: |
{{.Input -}} A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}} {{.Input}}
function: | ASSISTANT:
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files: download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf - filename: bakllava.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1 uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf - filename: bakllava-mmproj.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -129,10 +129,10 @@ detect_gpu
 detect_gpu_size
 PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
-export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
 check_vars
 echo "===> Starting LocalAI[$PROFILE] with the following models: $MODELS"
-exec /entrypoint.sh "$@"
+exec /build/entrypoint.sh "$@"
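Since this entrypoint only provides defaults for PROFILE and MODELS, both can be overridden from the container environment. A minimal sketch, assuming the published AIO image name and that the /aio config paths shown above exist in the image:

```bash
# Force the CPU profile and load only two of the bundled model configs.
docker run -p 8080:8080 \
  -e PROFILE=cpu \
  -e MODELS="/aio/cpu/text-to-text.yaml,/aio/cpu/embeddings.yaml" \
  localai/localai:latest-aio-cpu
```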

View File

@@ -1,8 +1,7 @@
-embeddings: true
 name: text-embedding-ada-002
-backend: llama-cpp
+backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2
 usage: |
   You can test this model with curl like this:
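Both variants of this config are exposed under the text-embedding-ada-002 alias, so the truncated usage snippet above would look roughly like the sketch below (host and port assume a default local instance):

```bash
curl http://localhost:8080/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "text-embedding-ada-002",
    "input": "A long day at the office."
  }'
```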

View File

@@ -1,13 +1,7 @@
 name: jina-reranker-v1-base-en
-reranking: true
-f16: true
+backend: rerankers
 parameters:
-  model: jina-reranker-v1-tiny-en.f16.gguf
-backend: llama-cpp
-download_files:
-- filename: jina-reranker-v1-tiny-en.f16.gguf
-  sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
-  uri: huggingface://mradermacher/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en.f16.gguf
+  model: cross-encoder
 usage: |
   You can test this model with curl like this:
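As with the embeddings config, the usage block is cut off here; a request against the reranker would look roughly like the sketch below (the /v1/rerank path and field names follow the Jina-style rerank API LocalAI exposes, and the host and port are assumptions):

```bash
curl http://localhost:8080/v1/rerank \
  -H "Content-Type: application/json" \
  -d '{
    "model": "jina-reranker-v1-base-en",
    "query": "Organic skincare products for sensitive skin",
    "documents": [
      "Eco-friendly kitchenware made from bamboo",
      "Organic skincare range for sensitive skin"
    ],
    "top_n": 1
  }'
```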

View File

@@ -2,7 +2,7 @@ name: tts-1
download_files: download_files:
- filename: voice-en-us-amy-low.tar.gz - filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
backend: piper
parameters: parameters:
model: en-us-amy-low.onnx model: en-us-amy-low.onnx
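A minimal sketch of calling the piper-backed tts-1 config above. The /tts path and JSON fields are the ones LocalAI's TTS endpoint generally accepts, but the host and port are assumptions:

```bash
# Request speech for a short sentence and save the returned audio to a file.
curl http://localhost:8080/tts \
  -H "Content-Type: application/json" \
  -d '{"model": "tts-1", "input": "Hello from LocalAI"}' \
  -o hello.wav
```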

View File

@@ -1,54 +1,101 @@
context_size: 4096
f16: true
backend: llama-cpp
function:
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
mmap: true
name: gpt-4 name: gpt-4
mmap: true
parameters: parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192
stopwords: stopwords:
- <|im_end|> - "<|im_end|>"
- <dummy32000> - "<dummy32000>"
- </s> - "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template: template:
chat: | chat: |
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
chat_message: | chat_message: |
<|im_start|>{{ .RoleName }} <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{ if .FunctionCall -}} {{- if .FunctionCall }}
Function call: <tool_call>
{{ else if eq .RoleName "tool" -}} {{- else if eq .RoleName "tool" }}
Function response: <tool_response>
{{ end -}} {{- end }}
{{ if .Content -}} {{- if .Content}}
{{.Content }} {{.Content }}
{{ end -}} {{- end }}
{{ if .FunctionCall -}} {{- if .FunctionCall}}
{{toJson .FunctionCall}} {{toJson .FunctionCall}}
{{ end -}}<|im_end|> {{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: | completion: |
{{.Input}} {{.Input}}
function: | function: |-
<|im_start|>system <|im_start|>system
You are an AI assistant that executes function calls, and these are the tools at your disposal: You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}} {{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}} {{end}}
<|im_end|> </tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
download_files:
- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf

View File

@@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View File

@@ -1,50 +1,35 @@
context_size: 4096
backend: llama-cpp backend: llama-cpp
context_size: 4096
f16: true f16: true
mmap: true mmap: true
mmproj: minicpm-v-2_6-mmproj-f16.gguf name: gpt-4-vision-preview
name: gpt-4o
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters: parameters:
model: minicpm-v-2_6-Q4_K_M.gguf model: llava-v1.6-mistral-7b.Q5_K_M.gguf
stopwords: temperature: 0.2
- <|im_end|> top_k: 40
- <dummy32000> top_p: 0.95
- </s> seed: -1
- <|endoftext|>
template: template:
chat: | chat: |
{{.Input -}} A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}} {{.Input}}
function: | ASSISTANT:
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files: download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf - filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1 uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf - filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -1,8 +1,7 @@
-embeddings: true
 name: text-embedding-ada-002
-backend: llama-cpp
+backend: sentencetransformers
 parameters:
-  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+  model: all-MiniLM-L6-v2
 usage: |
   You can test this model with curl like this:

View File

@@ -1,6 +1,6 @@
 name: stablediffusion
 parameters:
-  model: Lykon/dreamshaper-8
+  model: runwayml/stable-diffusion-v1-5
 backend: diffusers
 step: 25
 f16: true
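Both sides of this diff keep the stablediffusion alias on the diffusers backend, so an OpenAI-style image request is the natural smoke test; the host, port and size below are assumptions:

```bash
curl http://localhost:8080/v1/images/generations \
  -H "Content-Type: application/json" \
  -d '{
    "model": "stablediffusion",
    "prompt": "A cute baby sea otter",
    "size": "512x512"
  }'
```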

View File

@@ -1,13 +1,7 @@
 name: jina-reranker-v1-base-en
-reranking: true
-f16: true
+backend: rerankers
 parameters:
-  model: jina-reranker-v1-tiny-en.f16.gguf
-backend: llama-cpp
-download_files:
-- filename: jina-reranker-v1-tiny-en.f16.gguf
-  sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
-  uri: huggingface://mradermacher/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en.f16.gguf
+  model: cross-encoder
 usage: |
   You can test this model with curl like this:

View File

@@ -2,7 +2,7 @@ name: tts-1
download_files: download_files:
- filename: voice-en-us-amy-low.tar.gz - filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
backend: piper
parameters: parameters:
model: en-us-amy-low.onnx model: en-us-amy-low.onnx

View File

@@ -1,54 +1,103 @@
context_size: 4096
f16: true
backend: llama-cpp
function:
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
grammar:
properties_order: name,arguments
json_regex_match:
- (?s)<Output>(.*?)</Output>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
mmap: true
name: gpt-4 name: gpt-4
mmap: false
context_size: 8192
f16: false
parameters: parameters:
model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
stopwords: stopwords:
- <|im_end|> - "<|im_end|>"
- <dummy32000> - "<dummy32000>"
- </s> - "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"
function:
# disable injecting the "answer" tool
disable_no_action: true
grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Suffix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true
return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
template: template:
chat: | chat: |
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
chat_message: | chat_message: |
<|im_start|>{{ .RoleName }} <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{ if .FunctionCall -}} {{- if .FunctionCall }}
Function call: <tool_call>
{{ else if eq .RoleName "tool" -}} {{- else if eq .RoleName "tool" }}
Function response: <tool_response>
{{ end -}} {{- end }}
{{ if .Content -}} {{- if .Content}}
{{.Content }} {{.Content }}
{{ end -}} {{- end }}
{{ if .FunctionCall -}} {{- if .FunctionCall}}
{{toJson .FunctionCall}} {{toJson .FunctionCall}}
{{ end -}}<|im_end|> {{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
completion: | completion: |
{{.Input}} {{.Input}}
function: | function: |-
<|im_start|>system <|im_start|>system
You are an AI assistant that executes function calls, and these are the tools at your disposal: You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}} {{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }} {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}} {{end}}
<|im_end|> </tools>
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}} {{.Input -}}
<|im_start|>assistant <|im_start|>assistant
download_files:
- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf

View File

@@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View File

@@ -1,51 +1,35 @@
context_size: 4096
backend: llama-cpp backend: llama-cpp
f16: true context_size: 4096
mmap: true mmap: false
mmproj: minicpm-v-2_6-mmproj-f16.gguf f16: false
name: gpt-4o name: gpt-4-vision-preview
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters: parameters:
model: minicpm-v-2_6-Q4_K_M.gguf model: llava-v1.6-mistral-7b.Q5_K_M.gguf
stopwords: temperature: 0.2
- <|im_end|> top_k: 40
- <dummy32000> top_p: 0.95
- </s> seed: -1
- <|endoftext|>
template: template:
chat: | chat: |
{{.Input -}} A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
<|im_start|>assistant
chat_message: |
<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>
completion: |
{{.Input}} {{.Input}}
function: | ASSISTANT:
<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant
download_files: download_files:
- filename: minicpm-v-2_6-Q4_K_M.gguf - filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1 uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf - filename: llava-v1.6-7b-mmproj-f16.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4-vision-preview",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

assets.go Normal file (6 lines)
View File

@@ -0,0 +1,6 @@
package main
import "embed"
//go:embed backend-assets/*
var backendAssets embed.FS

View File

@@ -1,142 +0,0 @@
ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS builder
ARG BACKEND=rerankers
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION
ARG SKIP_DRIVERS=false
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETARCH
ARG TARGETVARIANT
ARG GO_VERSION=1.22.6
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
git ccache \
ca-certificates \
make cmake \
curl unzip \
libssl-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt-get update && \
apt-get install -y \
vulkan-sdk && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
rocblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig \
; fi
# Intel oneAPI requirements
RUN <<EOT bash
if [[ "${BUILD_TYPE}" == sycl* ]] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
intel-oneapi-runtime-libs && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin:/usr/local/bin
# Install grpc compilers
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
RUN echo "TARGETARCH: $TARGETARCH"
# We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below,
# but that will also bring in a newer version of absl, which stablediffusion cannot compile with. This version of protoc is only
# here so that we can generate the grpc code for the stablediffusion build.
RUN <<EOT bash
if [ "amd64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
EOT
COPY . /LocalAI
RUN cd /LocalAI && make protogen-go && make -C /LocalAI/backend/go/${BACKEND} build
FROM scratch
ARG BACKEND=rerankers
COPY --from=builder /LocalAI/backend/go/${BACKEND}/package/. ./

View File

@@ -1,207 +0,0 @@
ARG BASE_IMAGE=ubuntu:22.04
ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
# The grpc target does one thing: it builds and installs GRPC. This is in its own layer so that it can be effectively cached by CI.
# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
FROM ${GRPC_BASE_IMAGE} AS grpc
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
ARG GRPC_VERSION=v1.65.0
ARG CMAKE_FROM_SOURCE=false
ARG CMAKE_VERSION=3.26.4
ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
WORKDIR /build
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
build-essential curl libssl-dev \
git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
# and running make install in the target container
RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
mkdir -p /build/grpc/cmake/build && \
cd /build/grpc/cmake/build && \
sed -i "216i\ TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
make && \
make install && \
rm -rf /build
FROM ${BASE_IMAGE} AS builder
ARG BACKEND=rerankers
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION
ARG SKIP_DRIVERS=false
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETARCH
ARG TARGETVARIANT
ARG GO_VERSION=1.22.6
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache git \
ca-certificates \
make \
curl unzip \
libssl-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt-get update && \
apt-get install -y \
vulkan-sdk && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
rocblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig \
; fi
RUN echo "TARGETARCH: $TARGETARCH"
# We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below,
# but that will also bring in a newer version of absl, which stablediffusion cannot compile with. This version of protoc is only
# here so that we can generate the grpc code for the stablediffusion build.
RUN <<EOT bash
if [ "amd64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
EOT
# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
COPY --from=grpc /opt/grpc /usr/local
COPY . /LocalAI
## Otherwise just run the normal build
RUN <<EOT bash
if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-fallback && \
make llama-cpp-grpc && make llama-cpp-rpc-server; \
else \
cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-avx && \
make llama-cpp-avx2 && \
make llama-cpp-avx512 && \
make llama-cpp-fallback && \
make llama-cpp-grpc && \
make llama-cpp-rpc-server; \
fi
EOT
# Copy libraries using a script to handle architecture differences
RUN make -C /LocalAI/backend/cpp/llama-cpp package
FROM scratch
# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./

View File

@@ -1,123 +0,0 @@
ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS builder
ARG BACKEND=rerankers
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION
ARG SKIP_DRIVERS=false
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETARCH
ARG TARGETVARIANT
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates \
espeak-ng \
curl \
libssl-dev \
git \
git-lfs \
unzip \
upx-ucl \
curl python3-pip \
python-is-python3 \
python3-dev llvm \
python3-venv make && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
pip install --upgrade pip
# Cuda
ENV PATH=/usr/local/cuda/bin:${PATH}
# HipBLAS requirements
ENV PATH=/opt/rocm/bin:${PATH}
# Vulkan requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils wget gpg-agent && \
wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
apt-get update && \
apt-get install -y \
vulkan-sdk && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT
# If we are building with clblas support, we need the libraries for the builds
RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
libclblast-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* \
; fi
RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
hipblas-dev \
rocblas-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* && \
# I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
# to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
ldconfig \
; fi
# Install uv as a system package
RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
ENV PATH="/root/.cargo/bin:${PATH}"
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# Install grpcio-tools (the version in 22.04 is too old)
RUN pip install --user grpcio-tools==1.71.0 grpcio==1.71.0
COPY python/${BACKEND} /${BACKEND}
COPY backend.proto /${BACKEND}/backend.proto
COPY python/common/ /${BACKEND}/common
RUN cd /${BACKEND} && make
FROM scratch
ARG BACKEND=rerankers
COPY --from=builder /${BACKEND}/ /
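The deleted Dockerfile above is parameterised entirely through build args. A build for a single Python-based backend would be invoked roughly as below; the Dockerfile path and image tag are placeholders, while the build args are the ones declared in the file:

```bash
# Build one Python-based backend image with CUDA support (tag and -f path are hypothetical).
docker build \
  --build-arg BACKEND=rerankers \
  --build-arg BUILD_TYPE=cublas \
  --build-arg CUDA_MAJOR_VERSION=12 \
  --build-arg CUDA_MINOR_VERSION=4 \
  -t localai-backend-rerankers \
  -f backend/Dockerfile.python .
```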

View File

@@ -14,10 +14,8 @@ service Backend {
rpc PredictStream(PredictOptions) returns (stream Reply) {} rpc PredictStream(PredictOptions) returns (stream Reply) {}
rpc Embedding(PredictOptions) returns (EmbeddingResult) {} rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
rpc GenerateImage(GenerateImageRequest) returns (Result) {} rpc GenerateImage(GenerateImageRequest) returns (Result) {}
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {} rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc TTS(TTSRequest) returns (Result) {} rpc TTS(TTSRequest) returns (Result) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {} rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
rpc Status(HealthMessage) returns (StatusResponse) {} rpc Status(HealthMessage) returns (StatusResponse) {}
@@ -27,21 +25,6 @@ service Backend {
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {} rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
rpc Rerank(RerankRequest) returns (RerankResult) {} rpc Rerank(RerankRequest) returns (RerankResult) {}
rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
rpc VAD(VADRequest) returns (VADResponse) {}
}
// Define the empty request
message MetricsRequest {}
message MetricsResponse {
int32 slot_id = 1;
string prompt_json_for_slot = 2; // Stores the prompt as a JSON string.
float tokens_per_second = 3;
int32 tokens_generated = 4;
int32 prompt_tokens_processed = 5;
} }
message RerankRequest { message RerankRequest {
@@ -150,9 +133,6 @@ message PredictOptions {
repeated string Images = 42; repeated string Images = 42;
bool UseTokenizerTemplate = 43; bool UseTokenizerTemplate = 43;
repeated Message Messages = 44; repeated Message Messages = 44;
repeated string Videos = 45;
repeated string Audios = 46;
string CorrelationId = 47;
} }
// The response message containing the result // The response message containing the result
@@ -160,13 +140,6 @@ message Reply {
bytes message = 1; bytes message = 1;
int32 tokens = 2; int32 tokens = 2;
int32 prompt_tokens = 3; int32 prompt_tokens = 3;
double timing_prompt_processing = 4;
double timing_token_generation = 5;
bytes audio = 6;
}
message GrammarTrigger {
string word = 1;
} }
message ModelOptions { message ModelOptions {
@@ -185,13 +158,18 @@ message ModelOptions {
string MainGPU = 13; string MainGPU = 13;
string TensorSplit = 14; string TensorSplit = 14;
int32 Threads = 15; int32 Threads = 15;
string LibrarySearchPath = 16;
float RopeFreqBase = 17; float RopeFreqBase = 17;
float RopeFreqScale = 18; float RopeFreqScale = 18;
float RMSNormEps = 19; float RMSNormEps = 19;
int32 NGQA = 20; int32 NGQA = 20;
string ModelFile = 21; string ModelFile = 21;
// AutoGPTQ
string Device = 22;
bool UseTriton = 23;
string ModelBaseName = 24;
bool UseFastTokenizer = 25;
// Diffusers // Diffusers
string PipelineType = 26; string PipelineType = 26;
@@ -224,12 +202,6 @@ message ModelOptions {
int32 SwapSpace = 53; int32 SwapSpace = 53;
int32 MaxModelLen = 54; int32 MaxModelLen = 54;
int32 TensorParallelSize = 55; int32 TensorParallelSize = 55;
string LoadFormat = 58;
bool DisableLogStatus = 66;
string DType = 67;
int32 LimitImagePerPrompt = 68;
int32 LimitVideoPerPrompt = 69;
int32 LimitAudioPerPrompt = 70;
string MMProj = 41; string MMProj = 41;
@@ -243,22 +215,6 @@ message ModelOptions {
bool FlashAttention = 56; bool FlashAttention = 56;
bool NoKVOffload = 57; bool NoKVOffload = 57;
string ModelPath = 59;
repeated string LoraAdapters = 60;
repeated float LoraScales = 61;
repeated string Options = 62;
string CacheTypeKey = 63;
string CacheTypeValue = 64;
repeated GrammarTrigger GrammarTriggers = 65;
bool Reranking = 71;
repeated string Overrides = 72;
} }
message Result { message Result {
@@ -306,19 +262,6 @@ message GenerateImageRequest {
int32 CLIPSkip = 11; int32 CLIPSkip = 11;
} }
message GenerateVideoRequest {
string prompt = 1;
string start_image = 2; // Path or base64 encoded image for the start frame
string end_image = 3; // Path or base64 encoded image for the end frame
int32 width = 4;
int32 height = 5;
int32 num_frames = 6; // Number of frames to generate
int32 fps = 7; // Frames per second
int32 seed = 8;
float cfg_scale = 9; // Classifier-free guidance scale
string dst = 10; // Output path for the generated video
}
message TTSRequest { message TTSRequest {
string text = 1; string text = 1;
string model = 2; string model = 2;
@@ -327,30 +270,6 @@ message TTSRequest {
optional string language = 5; optional string language = 5;
} }
message VADRequest {
repeated float audio = 1;
}
message VADSegment {
float start = 1;
float end = 2;
}
message VADResponse {
repeated VADSegment segments = 1;
}
message SoundGenerationRequest {
string text = 1;
string model = 2;
string dst = 3;
optional float duration = 4;
optional float temperature = 5;
optional bool sample = 6;
optional string src = 7;
optional int32 src_divisor = 8;
}
message TokenizationResponse { message TokenizationResponse {
int32 length = 1; int32 length = 1;
repeated int32 tokens = 2; repeated int32 tokens = 2;
@@ -375,4 +294,4 @@ message StatusResponse {
message Message { message Message {
string role = 1; string role = 1;
string content = 2; string content = 2;
} }
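The service trimmed above can still be probed directly with grpcurl once a backend process is running. A sketch under these assumptions: the proto package is named backend, the service is Backend as in the hunk headers, and the backend happens to listen on 127.0.0.1:50051 (LocalAI normally assigns the port dynamically):

```bash
# Ask a running backend for its health/status using the proto definition from this repo.
grpcurl -plaintext \
  -proto backend.proto \
  -d '{}' \
  127.0.0.1:50051 backend.Backend/Status
```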

View File

@@ -46,14 +46,9 @@ endif
 $(INSTALLED_PACKAGES): grpc_build
 $(GRPC_REPO):
-	mkdir -p $(GRPC_REPO)/grpc
-	cd $(GRPC_REPO)/grpc && \
-	git init && \
-	git remote add origin $(GIT_REPO_LIB_GRPC) && \
-	git fetch origin && \
-	git checkout $(TAG_LIB_GRPC) && \
-	git submodule update --init --recursive --depth 1 --single-branch
+	git clone --depth $(GIT_CLONE_DEPTH) -b $(TAG_LIB_GRPC) $(GIT_REPO_LIB_GRPC) $(GRPC_REPO)/grpc
+	cd $(GRPC_REPO)/grpc && git submodule update --jobs 2 --init --recursive --depth $(GIT_CLONE_DEPTH)
 $(GRPC_BUILD): $(GRPC_REPO)
 	mkdir -p $(GRPC_BUILD)
 	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . && cmake --build . --target install

View File

@@ -1,168 +0,0 @@
LLAMA_VERSION?=3f4fc97f1d745f1d5d3c853949503136d419e6de
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=
BUILD_TYPE?=
NATIVE?=false
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
TARGET?=--target grpc-server
JOBS?=$(shell nproc)
# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
ifeq ($(NATIVE),false)
CMAKE_ARGS+=-DGGML_NATIVE=OFF
endif
# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DGGML_CUDA=ON
# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
ROCM_HOME ?= /opt/rocm
ROCM_PATH ?= /opt/rocm
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
# GPU_TARGETS ?= gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102
# AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DGGML_HIP=ON
# CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
else ifeq ($(BUILD_TYPE),vulkan)
CMAKE_ARGS+=-DGGML_VULKAN=1
else ifeq ($(OS),Darwin)
ifeq ($(BUILD_TYPE),)
BUILD_TYPE=metal
endif
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
CMAKE_ARGS+=-DGGML_OPENMP=OFF
endif
TARGET+=--target ggml-metal
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl" \
-DGGML_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DGGML_SYCL=ON \
-DCMAKE_C_COMPILER=icx \
-DCMAKE_CXX_COMPILER=icpx \
-DCMAKE_CXX_FLAGS="-fsycl"
endif
INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
-DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
-Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
-DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
-DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
build-llama-cpp-grpc-server:
# Conditionally build grpc for the llama backend to use if needed
ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
$(MAKE) -C ../../grpc build
_PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \
_GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
LLAMA_VERSION=$(LLAMA_VERSION) \
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
else
echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
LLAMA_VERSION=$(LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
endif
llama-cpp-avx2: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
llama-cpp-avx512: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
llama-cpp-avx: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
$(info ${GREEN}I llama-cpp build info:avx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
llama-cpp-fallback: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
llama-cpp-grpc: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
llama-cpp-rpc-server: llama-cpp-grpc
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
llama.cpp:
mkdir -p llama.cpp
cd llama.cpp && \
git init && \
git remote add origin $(LLAMA_REPO) && \
git fetch origin && \
git checkout -b build $(LLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh
rebuild:
bash prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server
package:
bash package.sh
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/tools/grpc-server
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/tools/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)"
else
+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)
endif
cp llama.cpp/build/bin/grpc-server .
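The Makefile above builds one grpc-server per CPU feature level. For local debugging the individual targets can be driven directly; the path relative to the repository root follows the `make -C /LocalAI/backend/cpp/llama-cpp ...` invocations in the Dockerfile earlier in this diff:

```bash
# Build only the AVX2 variant and the RPC server helper.
make -C backend/cpp/llama-cpp llama-cpp-avx2
make -C backend/cpp/llama-cpp llama-cpp-rpc-server
```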

View File

File diff suppressed because it is too large.

View File

@@ -1,42 +0,0 @@
#!/bin/bash
# Script to copy the appropriate libraries based on architecture
# This script is used in the final stage of the Dockerfile
set -e
CURDIR=$(dirname "$(realpath $0)")
# Create lib directory
mkdir -p $CURDIR/package/lib
cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
cp -rfv $CURDIR/run.sh $CURDIR/package/
# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
# x86_64 architecture
echo "Detected x86_64 architecture, copying x86_64 libraries..."
cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
# ARM64 architecture
echo "Detected ARM64 architecture, copying ARM64 libraries..."
cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
else
echo "Error: Could not detect architecture"
exit 1
fi
echo "Packaging completed successfully"
ls -liah $CURDIR/package/
ls -liah $CURDIR/package/lib/

View File

@@ -1,13 +0,0 @@
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 3cd0d2fa..6c5e811a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);

View File

@@ -1,52 +0,0 @@
#!/bin/bash
## Patches
## Apply patches from the `patches` directory
for patch in $(ls patches); do
echo "Applying patch $patch"
patch -d llama.cpp/ -p1 < patches/$patch
done
set -e
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
set +e
if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
echo "grpc-server already added"
else
echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
fi
set -e
# Now to keep maximum compatibility with the original server.cpp, we need to remove the index.html.gz.hpp and loading.html.hpp includes
# and remove the main function
# TODO: upstream this to the original server.cpp by extracting the upstream main function to a separate file
awk '
/int[ \t]+main[ \t]*\(/ { # If the line starts the main function
in_main=1; # Set a flag
open_braces=0; # Track number of open braces
}
in_main {
open_braces += gsub(/\{/, "{"); # Count opening braces
open_braces -= gsub(/\}/, "}"); # Count closing braces
if (open_braces == 0) { # If all braces are closed
in_main=0; # End skipping
}
next; # Skip lines inside main
}
!in_main # Print lines not inside main
' "llama.cpp/tools/server/server.cpp" > llama.cpp/tools/grpc-server/server.cpp
# remove index.html.gz.hpp and loading.html.hpp includes
if [[ "$OSTYPE" == "darwin"* ]]; then
# macOS
sed -i '' '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
else
# Linux and others
sed -i '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
fi

View File

@@ -1,61 +0,0 @@
#!/bin/bash
set -ex
# Get the absolute current dir where the script is located
CURDIR=$(dirname "$(realpath $0)")
cd /
echo "CPU info:"
grep -e "model\sname" /proc/cpuinfo | head -1
grep -e "flags" /proc/cpuinfo | head -1
BINARY=llama-cpp-fallback
if grep -q -e "\savx\s" /proc/cpuinfo ; then
echo "CPU: AVX found OK"
if [ -e $CURDIR/llama-cpp-avx ]; then
BINARY=llama-cpp-avx
fi
fi
if grep -q -e "\savx2\s" /proc/cpuinfo ; then
echo "CPU: AVX2 found OK"
if [ -e $CURDIR/llama-cpp-avx2 ]; then
BINARY=llama-cpp-avx2
fi
fi
# Check avx 512
if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
echo "CPU: AVX512F found OK"
if [ -e $CURDIR/llama-cpp-avx512 ]; then
BINARY=llama-cpp-avx512
fi
fi
if [ -n "$LLAMACPP_GRPC_SERVERS" ]; then
if [ -e $CURDIR/llama-cpp-grpc ]; then
BINARY=llama-cpp-grpc
fi
fi
# Extend ld library path with the dir where this script is located/lib
if [ "$(uname)" == "Darwin" ]; then
DYLD_FALLBACK_LIBRARY_PATH=$CURDIR/lib:$DYLD_FALLBACK_LIBRARY_PATH
else
export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
fi
# If there is a lib/ld.so, use it
if [ -f $CURDIR/lib/ld.so ]; then
echo "Using lib/ld.so"
echo "Using binary: $BINARY"
exec $CURDIR/lib/ld.so $CURDIR/$BINARY "$@"
fi
echo "Using binary: $BINARY"
exec $CURDIR/$BINARY "$@"
# In case we fail execing, just run fallback
exec $CURDIR/llama-cpp-fallback "$@"
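One non-obvious branch in the launcher above: if LLAMACPP_GRPC_SERVERS is set, it prefers the llama-cpp-grpc build so the workload can be spread over remote llama.cpp rpc-server workers. A sketch, with placeholder worker addresses; any remaining arguments are forwarded to the selected binary unchanged:

```bash
# Prefer the RPC-enabled build and point it at two remote llama.cpp rpc-server workers.
LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052" ./run.sh
```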

View File

@@ -1,3 +1,20 @@
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
set(TARGET myclip)
add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
install(TARGETS ${TARGET} LIBRARY)
target_include_directories(myclip PUBLIC .)
target_include_directories(myclip PUBLIC ../..)
target_include_directories(myclip PUBLIC ../../common)
target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
endif()
# END CLIP hack
set(TARGET grpc-server)
set(CMAKE_CXX_STANDARD 17)
cmake_minimum_required(VERSION 3.15)
@@ -57,12 +74,8 @@ add_library(hw_grpc_proto
${hw_proto_srcs}
${hw_proto_hdrs} )
-add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)
-target_include_directories(${TARGET} PRIVATE ../llava)
-target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
-target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
+target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
absl::flags_parse
gRPC::${_REFLECTION}
gRPC::${_GRPC_GRPCPP}

View File

@@ -0,0 +1,69 @@
LLAMA_VERSION?=
CMAKE_ARGS?=
BUILD_TYPE?=
ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
# If the build type is cublas, we automatically append -DLLAMA_CUBLAS=ON to CMAKE_ARGS
ifeq ($(BUILD_TYPE),cublas)
CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
# If the build type is openblas, we append -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
# to CMAKE_ARGS automatically
else ifeq ($(BUILD_TYPE),openblas)
CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
# If the build type is clblas (OpenCL), we append -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
else ifeq ($(BUILD_TYPE),clblas)
CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
# For hipblas we also have to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
else ifeq ($(BUILD_TYPE),hipblas)
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
# On macOS, do NOT embed the Metal library (-DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation).
# If this is macOS without the metal build type, disable Metal here.
else ifeq ($(OS),darwin)
ifneq ($(BUILD_TYPE),metal)
CMAKE_ARGS+=-DLLAMA_METAL=OFF
endif
endif
ifeq ($(BUILD_TYPE),sycl_f16)
CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
endif
ifeq ($(BUILD_TYPE),sycl_f32)
CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
endif
llama.cpp:
git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
if [ -z "$(LLAMA_VERSION)" ]; then \
exit 1; \
fi
cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
llama.cpp/examples/grpc-server: llama.cpp
mkdir -p llama.cpp/examples/grpc-server
bash prepare.sh
rebuild:
bash prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server
purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/examples/grpc-server
rm -rf grpc-server
clean: purge
rm -rf llama.cpp
grpc-server: llama.cpp llama.cpp/examples/grpc-server
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
bash -c "source $(ONEAPI_VARS); \
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)"
else
cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)
endif
cp llama.cpp/build/bin/grpc-server .
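
A sketch of how this Makefile is typically driven; PINNED_LLAMA_CPP_VERSION is a placeholder for whatever llama.cpp tag or commit you pin:

```
# Plain CPU build: clones llama.cpp, runs prepare.sh, builds and copies ./grpc-server
LLAMA_VERSION="$PINNED_LLAMA_CPP_VERSION" make grpc-server

# Accelerator variants just switch BUILD_TYPE; sycl builds additionally
# source $(ONEAPI_VARS) before invoking cmake.
BUILD_TYPE=cublas   LLAMA_VERSION="$PINNED_LLAMA_CPP_VERSION" make grpc-server
BUILD_TYPE=sycl_f16 LLAMA_VERSION="$PINNED_LLAMA_CPP_VERSION" make grpc-server

# After editing grpc-server.cpp or CMakeLists.txt, re-copy the sources and rebuild:
make rebuild

# Drop build artifacts but keep the llama.cpp checkout, or wipe everything:
make purge
make clean
```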

View File

File diff suppressed because it is too large

backend/cpp/llama/json.hpp (new file, 24596 lines)
View File

File diff suppressed because it is too large

View File

@@ -0,0 +1,20 @@
#!/bin/bash
cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -rfv json.hpp llama.cpp/examples/grpc-server/
cp -rfv utils.hpp llama.cpp/examples/grpc-server/
if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
echo "grpc-server already added"
else
echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
fi
## XXX: In some versions of CMake clip wasn't being built before llama.
## This is a hack for now, but it should be fixed in the future.
cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp

backend/cpp/llama/utils.hpp (new file, 510 lines)
View File

@@ -0,0 +1,510 @@
// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
#pragma once
#include <string>
#include <vector>
#include <set>
#include <mutex>
#include <condition_variable>
#include <unordered_map>
#include "json.hpp"
#include "../llava/clip.h"
using json = nlohmann::json;
extern bool server_verbose;
#ifndef SERVER_VERBOSE
#define SERVER_VERBOSE 1
#endif
#if SERVER_VERBOSE != 1
#define LOG_VERBOSE(MSG, ...)
#else
#define LOG_VERBOSE(MSG, ...) \
do \
{ \
if (server_verbose) \
{ \
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
} \
} while (0)
#endif
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
//
// parallel
//
enum server_state {
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
SERVER_STATE_READY, // Server is ready and model is loaded
SERVER_STATE_ERROR // An error occurred, load_model failed
};
enum task_type {
TASK_TYPE_COMPLETION,
TASK_TYPE_CANCEL,
TASK_TYPE_NEXT_RESPONSE
};
struct task_server {
int id = -1; // to be filled by llama_server_queue
int target_id;
task_type type;
json data;
bool infill_mode = false;
bool embedding_mode = false;
int multitask_id = -1;
};
struct task_result {
int id;
int multitask_id = -1;
bool stop;
bool error;
json result_json;
};
struct task_multi {
int id;
std::set<int> subtasks_remaining{};
std::vector<task_result> results{};
};
// TODO: can become bool if we can't find use of more states
enum slot_state
{
IDLE,
PROCESSING,
};
enum slot_command
{
NONE,
LOAD_PROMPT,
RELEASE,
};
struct slot_params
{
bool stream = true;
    bool cache_prompt = false; // remember the prompt to avoid reprocessing the whole prompt
uint32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_predict = -1; // new tokens to predict
std::vector<std::string> antiprompt;
json input_prefix;
json input_suffix;
};
struct slot_image
{
int32_t id;
bool request_encode_image = false;
float * image_embedding = nullptr;
int32_t image_tokens = 0;
clip_image_u8 * img_data;
    std::string prefix_prompt; // prompt that comes before this image
};
// completion token output with probabilities
struct completion_token_output
{
struct token_prob
{
llama_token tok;
float prob;
};
std::vector<token_prob> probs;
llama_token tok;
std::string text_to_send;
};
static inline void server_log(const char *level, const char *function, int line,
const char *message, const nlohmann::ordered_json &extra)
{
nlohmann::ordered_json log
{
{"timestamp", time(nullptr)},
{"level", level},
{"function", function},
{"line", line},
{"message", message},
};
if (!extra.empty())
{
log.merge_patch(extra);
}
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
printf("%.*s\n", (int)str.size(), str.data());
fflush(stdout);
}
//
// server utils
//
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value)
{
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
inline std::string format_chatml(std::vector<json> messages)
{
std::ostringstream chatml_msgs;
for (auto it = messages.begin(); it != messages.end(); ++it) {
chatml_msgs << "<|im_start|>"
<< json_value(*it, "role", std::string("user")) << '\n';
chatml_msgs << json_value(*it, "content", std::string(""))
<< "<|im_end|>\n";
}
chatml_msgs << "<|im_start|>assistant" << '\n';
return chatml_msgs.str();
}
//
// work queue utils
//
struct llama_server_queue {
int id = 0;
std::mutex mutex_tasks;
// queues
std::vector<task_server> queue_tasks;
std::vector<task_server> queue_tasks_deferred;
std::vector<task_multi> queue_multitasks;
std::condition_variable condition_tasks;
// callback functions
std::function<void(task_server&)> callback_new_task;
std::function<void(task_multi&)> callback_finish_multitask;
std::function<void(void)> callback_all_task_finished;
// Add a new task to the end of the queue
int post(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
}
queue_tasks.push_back(std::move(task));
condition_tasks.notify_one();
return task.id;
}
// Add a new task, but defer until one slot is available
void defer(task_server task) {
std::unique_lock<std::mutex> lock(mutex_tasks);
queue_tasks_deferred.push_back(std::move(task));
}
    // Get the next id for creating a new task
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
return id++;
}
// Register function to process a new task
void on_new_task(std::function<void(task_server&)> callback) {
callback_new_task = callback;
}
// Register function to process a multitask
void on_finish_multitask(std::function<void(task_multi&)> callback) {
callback_finish_multitask = callback;
}
// Register the function to be called when the batch of tasks is finished
void on_all_tasks_finished(std::function<void(void)> callback) {
callback_all_task_finished = callback;
}
// Call when the state of one slot is changed
void notify_slot_changed() {
// move deferred tasks back to main loop
std::unique_lock<std::mutex> lock(mutex_tasks);
for (auto & task : queue_tasks_deferred) {
queue_tasks.push_back(std::move(task));
}
queue_tasks_deferred.clear();
}
// Start the main loop. This call is blocking
[[noreturn]]
void start_loop() {
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
{
while (true)
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
lock.unlock();
break;
}
task_server task = queue_tasks.front();
queue_tasks.erase(queue_tasks.begin());
lock.unlock();
LOG_VERBOSE("callback_new_task", {});
callback_new_task(task);
}
LOG_VERBOSE("callback_all_task_finished", {});
// process and update all the multitasks
auto queue_iterator = queue_multitasks.begin();
while (queue_iterator != queue_multitasks.end())
{
if (queue_iterator->subtasks_remaining.empty())
{
// all subtasks done == multitask is done
task_multi current_multitask = *queue_iterator;
callback_finish_multitask(current_multitask);
// remove this multitask
queue_iterator = queue_multitasks.erase(queue_iterator);
}
else
{
++queue_iterator;
}
}
                // all tasks in the current loop are finished
callback_all_task_finished();
}
LOG_VERBOSE("wait for new task", {});
// wait for new task
{
std::unique_lock<std::mutex> lock(mutex_tasks);
if (queue_tasks.empty()) {
condition_tasks.wait(lock, [&]{
return !queue_tasks.empty();
});
}
}
}
}
//
// functions to manage multitasks
//
    // add a multitask by specifying the ids of all its subtasks (a subtask is a task_server)
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
task_multi multi;
multi.id = multitask_id;
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
queue_multitasks.push_back(multi);
}
    // update the remaining subtasks, appending results to the multitask
void update_multitask(int multitask_id, int subtask_id, task_result& result)
{
std::lock_guard<std::mutex> lock(mutex_tasks);
for (auto& multitask : queue_multitasks)
{
if (multitask.id == multitask_id)
{
multitask.subtasks_remaining.erase(subtask_id);
multitask.results.push_back(result);
}
}
}
};
struct llama_server_response {
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
callback_multitask_t callback_update_multitask;
// for keeping track of all tasks waiting for the result
std::set<int> waiting_task_ids;
// the main result queue
std::vector<task_result> queue_results;
std::mutex mutex_results;
std::condition_variable condition_results;
void add_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
void remove_waiting_task_id(int task_id) {
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(task_id);
}
// This function blocks the thread until there is a response for this task_id
task_result recv(int task_id) {
while (true)
{
std::unique_lock<std::mutex> lock(mutex_results);
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
LOG_VERBOSE("condition_results unblock", {});
for (int i = 0; i < (int) queue_results.size(); i++)
{
if (queue_results[i].id == task_id)
{
assert(queue_results[i].multitask_id == -1);
task_result res = queue_results[i];
queue_results.erase(queue_results.begin() + i);
return res;
}
}
}
// should never reach here
}
// Register the function to update multitask
void on_multitask_update(callback_multitask_t callback) {
callback_update_multitask = callback;
}
// Send a new result to a waiting task_id
void send(task_result result) {
std::unique_lock<std::mutex> lock(mutex_results);
LOG_VERBOSE("send new result", {});
for (auto& task_id : waiting_task_ids) {
// LOG_TEE("waiting task id %i \n", task_id);
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (result.multitask_id == task_id)
{
LOG_VERBOSE("callback_update_multitask", {});
callback_update_multitask(task_id, result.id, result);
continue;
}
if (result.id == task_id)
{
LOG_VERBOSE("queue_results.push_back", {});
queue_results.push_back(result);
condition_results.notify_one();
return;
}
}
}
};
//
// base64 utils (TODO: move to common in the future)
//
static const std::string base64_chars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789+/";
static inline bool is_base64(uint8_t c)
{
return (isalnum(c) || (c == '+') || (c == '/'));
}
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
{
int i = 0;
int j = 0;
int in_ = 0;
int in_len = encoded_string.size();
uint8_t char_array_4[4];
uint8_t char_array_3[3];
std::vector<uint8_t> ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
{
char_array_4[i++] = encoded_string[in_]; in_++;
if (i == 4)
{
for (i = 0; i <4; i++)
{
char_array_4[i] = base64_chars.find(char_array_4[i]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++)
{
ret.push_back(char_array_3[i]);
}
i = 0;
}
}
if (i)
{
for (j = i; j <4; j++)
{
char_array_4[j] = 0;
}
for (j = 0; j <4; j++)
{
char_array_4[j] = base64_chars.find(char_array_4[j]);
}
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++)
{
ret.push_back(char_array_3[j]);
}
}
return ret;
}
//
// random string / id
//
static std::string random_string()
{
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
std::random_device rd;
std::mt19937 generator(rd());
std::string result(32, ' ');
for (int i = 0; i < 32; ++i) {
result[i] = str[generator() % str.size()];
}
return result;
}
static std::string gen_chatcmplid()
{
std::stringstream chatcmplid;
chatcmplid << "chatcmpl-" << random_string();
return chatcmplid.str();
}
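
To make the prompt layout produced by format_chatml() above concrete, the heredoc below reproduces by hand what the function would return for a short, invented three-message conversation (note that each content is immediately followed by <|im_end|>, and the string always ends with an opening assistant tag):

```
cat <<'EOF'
<|im_start|>user
Hello!<|im_end|>
<|im_start|>assistant
Hi there.<|im_end|>
<|im_start|>user
What can you do?<|im_end|>
<|im_start|>assistant
EOF
```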

View File

@@ -1,51 +0,0 @@
INCLUDE_PATH := $(abspath ./)
LIBRARY_PATH := $(abspath ./)
AR?=ar
CMAKE_ARGS?=-DGGML_NATIVE=OFF
BUILD_TYPE?=
GOCMD=go
# keep standard at C11 and C++11
CXXFLAGS = -I. -I$(INCLUDE_PATH)/sources/bark.cpp/examples -I$(INCLUDE_PATH)/sources/bark.cpp/encodec.cpp/ggml/include -I$(INCLUDE_PATH)/sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/sources/bark.cpp/build/examples -lbark -lstdc++ -lm
# bark.cpp
BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=5d5be84f089ab9ea53b7a793f088d3fbf7247495
# warnings
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
## bark.cpp
sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
cd sources/bark.cpp && \
git checkout $(BARKCPP_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/bark.cpp/build/libbark.a: sources/bark.cpp
cd sources/bark.cpp && \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) .. && \
cmake --build . --config Release
gobark.o:
$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
libbark.a: sources/bark.cpp/build/libbark.a gobark.o
cp $(INCLUDE_PATH)/sources/bark.cpp/build/libbark.a ./
$(AR) rcs libbark.a gobark.o
bark-cpp: libbark.a
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH="$(CURDIR)" LIBRARY_PATH=$(CURDIR) \
$(GOCMD) build -v -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o bark-cpp ./
package:
bash package.sh
build: bark-cpp package
clean:
rm -f gobark.o libbark.a
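
A rough sketch of driving this Makefile (the targets are the ones defined above; nothing else is assumed):

```
# One shot: fetch bark.cpp, build the static library, compile the cgo bridge into
# ./bark-cpp, then bundle it with run.sh and its shared libraries via package.sh.
make build

# Or step by step:
make libbark.a    # builds bark.cpp at BARKCPP_VERSION, compiles gobark.o, archives libbark.a
make bark-cpp     # go build of the backend binary against libbark.a
make package      # runs package.sh
make clean        # removes gobark.o and libbark.a
```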

View File

@@ -1,85 +0,0 @@
#include <iostream>
#include <tuple>
#include "bark.h"
#include "gobark.h"
#include "common.h"
#include "ggml.h"
struct bark_context *c;
void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) {
if (step == bark_encoding_step::SEMANTIC) {
printf("\rGenerating semantic tokens... %d%%", progress);
} else if (step == bark_encoding_step::COARSE) {
printf("\rGenerating coarse tokens... %d%%", progress);
} else if (step == bark_encoding_step::FINE) {
printf("\rGenerating fine tokens... %d%%", progress);
}
fflush(stdout);
}
int load_model(char *model) {
// initialize bark context
struct bark_context_params ctx_params = bark_context_default_params();
bark_params params;
params.model_path = model;
// ctx_params.verbosity = verbosity;
ctx_params.progress_callback = bark_print_progress_callback;
ctx_params.progress_callback_user_data = nullptr;
struct bark_context *bctx = bark_load_model(params.model_path.c_str(), ctx_params, params.seed);
if (!bctx) {
fprintf(stderr, "%s: Could not load model\n", __func__);
return 1;
}
c = bctx;
return 0;
}
int tts(char *text,int threads, char *dst ) {
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
// generate audio
if (!bark_generate_audio(c, text, threads)) {
fprintf(stderr, "%s: An error occurred. If the problem persists, feel free to open an issue to report it.\n", __func__);
return 1;
}
const float *audio_data = bark_get_audio_data(c);
if (audio_data == NULL) {
fprintf(stderr, "%s: Could not get audio data\n", __func__);
return 1;
}
const int audio_arr_size = bark_get_audio_data_size(c);
std::vector<float> audio_arr(audio_data, audio_data + audio_arr_size);
write_wav_on_disk(audio_arr, dst);
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
const int64_t t_load_us = bark_get_load_time(c);
const int64_t t_eval_us = bark_get_eval_time(c);
printf("\n\n");
printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us / 1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
}
return 0;
}
int unload() {
    bark_free(c);
    return 0;
}

View File

@@ -1,52 +0,0 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/sources/bark.cpp/ -I${SRCDIR}/sources/bark.cpp/encodec.cpp -I${SRCDIR}/sources/bark.cpp/encodec.cpp/ggml/include -I${SRCDIR}/sources/bark.cpp/examples -I${SRCDIR}/sources/bark.cpp/spm-headers
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/sources/bark.cpp/build/examples -L${SRCDIR}/sources/bark.cpp/build/encodec.cpp/ggml/src/ -L${SRCDIR}/sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon -lggml -lgomp
// #include <gobark.h>
// #include <stdlib.h>
import "C"
import (
"fmt"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type Bark struct {
base.SingleThread
threads int
}
func (sd *Bark) Load(opts *pb.ModelOptions) error {
sd.threads = int(opts.Threads)
modelFile := C.CString(opts.ModelFile)
defer C.free(unsafe.Pointer(modelFile))
ret := C.load_model(modelFile)
if ret != 0 {
return fmt.Errorf("could not load model")
}
return nil
}
func (sd *Bark) TTS(opts *pb.TTSRequest) error {
t := C.CString(opts.Text)
defer C.free(unsafe.Pointer(t))
dst := C.CString(opts.Dst)
defer C.free(unsafe.Pointer(dst))
threads := C.int(sd.threads)
ret := C.tts(t, threads, dst)
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}

View File

@@ -1,8 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
int load_model(char *model);
int tts(char *text,int threads, char *dst );
#ifdef __cplusplus
}
#endif

View File

@@ -1,41 +0,0 @@
#!/bin/bash
# Script to copy the appropriate libraries based on architecture
# This script is used in the final stage of the Dockerfile
set -e
CURDIR=$(dirname "$(realpath $0)")
# Create lib directory
mkdir -p $CURDIR/package/lib
cp -avrf $CURDIR/bark-cpp $CURDIR/package/
cp -rfv $CURDIR/run.sh $CURDIR/package/
# Detect architecture and copy appropriate libraries
if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
# x86_64 architecture
echo "Detected x86_64 architecture, copying x86_64 libraries..."
cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
# ARM64 architecture
echo "Detected ARM64 architecture, copying ARM64 libraries..."
cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
else
echo "Error: Could not detect architecture"
exit 1
fi
echo "Packaging completed successfully"
ls -liah $CURDIR/package/
ls -liah $CURDIR/package/lib/
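
For reference, a sketch of the layout the script above produces (the file names come straight from the cp commands; the listing itself is illustrative):

```
# Expected contents of ./package after running package.sh on an x86_64 host:
#   package/bark-cpp      # the backend binary
#   package/run.sh        # relocatable launcher (prepends lib/ to the library path)
#   package/lib/ld.so
#   package/lib/libc.so.6
#   package/lib/libgcc_s.so.1
#   package/lib/libstdc++.so.6
#   package/lib/libm.so.6
#   package/lib/libgomp.so.1
ls -R package
```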

View File

@@ -1,13 +0,0 @@
#!/bin/bash
set -ex
CURDIR=$(dirname "$(realpath $0)")
export LD_LIBRARY_PATH=$CURDIR/lib:$LD_LIBRARY_PATH
# If there is a lib/ld.so, use it
if [ -f $CURDIR/lib/ld.so ]; then
echo "Using lib/ld.so"
exec $CURDIR/lib/ld.so $CURDIR/bark-cpp "$@"
fi
exec $CURDIR/bark-cpp "$@"

View File

@@ -1,9 +0,0 @@
GOCMD=go
huggingface:
CGO_ENABLED=0 $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o huggingface ./
package:
bash package.sh
build: huggingface package

View File

@@ -1,12 +0,0 @@
#!/bin/bash
# Script to copy the appropriate libraries based on architecture
# This script is used in the final stage of the Dockerfile
set -e
CURDIR=$(dirname "$(realpath $0)")
mkdir -p $CURDIR/package
cp -avrf $CURDIR/huggingface $CURDIR/package/
cp -rfv $CURDIR/run.sh $CURDIR/package/

View File

@@ -1,6 +0,0 @@
#!/bin/bash
set -ex
CURDIR=$(dirname "$(realpath $0)")
exec $CURDIR/huggingface "$@"

View File

@@ -1,6 +1,7 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
	"flag"
@@ -14,7 +15,7 @@ var (
func main() {
	flag.Parse()
-	if err := grpc.StartServer(*addr, &SDGGML{}); err != nil {
+	if err := grpc.StartServer(*addr, &Image{}); err != nil {
		panic(err)
	}
}

Some files were not shown because too many files have changed in this diff