chore(model gallery): add websailor-7b (#6300 )

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
chore(model gallery): add websailor-32b (#6299 )
2026-02-03 03:02:38 -05:00 · 2025-09-17 09:49:58 +02:00 · 2025-09-17 09:48:16 +02:00 · 2025-09-17 09:42:54 +02:00 · 2025-09-17 09:36:25 +02:00 · 2025-09-17 09:22:19 +02:00
1082 changed files with 348635 additions and 57375 deletions
--- a/.devcontainer-scripts/postcreate.sh
+++ b/.devcontainer-scripts/postcreate.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+cd /workspace
+
+# Get the files into the volume without a bind mount
+if [ ! -d ".git" ]; then
+    git clone https://github.com/mudler/LocalAI.git .
+else
+    git fetch
+fi
+
+echo "Standard Post-Create script completed."
+
+if [ -f "/devcontainer-customization/postcreate.sh" ]; then
+    echo "Launching customization postcreate.sh"
+    bash "/devcontainer-customization/postcreate.sh"
+fi
--- a/.devcontainer-scripts/poststart.sh
+++ b/.devcontainer-scripts/poststart.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+cd /workspace
+
+# Ensures generated source files are present upon load
+make prepare
+
+echo "Standard Post-Start script completed."
+
+if [ -f "/devcontainer-customization/poststart.sh" ]; then
+    echo "Launching customization poststart.sh"
+    bash "/devcontainer-customization/poststart.sh"
+fi
--- a/.devcontainer-scripts/utils.sh
+++ b/.devcontainer-scripts/utils.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# This file contains some really simple functions that are useful when building up customization scripts.
+
+
+# Checks if the git config has a user registered - and sets it up if not.
+#
+# Param 1: name
+# Param 2: email
+#
+config_user() {
+    echo "Configuring git for $1 <$2>"
+    local gcn=$(git config --global user.name)
+    if [ -z "${gcn}" ]; then
+        echo "Setting up git user / remote"
+        git config --global user.name "$1"
+        git config --global user.email "$2"
+        
+    fi
+}
+
+# Checks if the git remote is configured - and sets it up if not. Fetches either way.
+#
+# Param 1: remote name
+# Param 2: remote url
+#
+config_remote() {
+    echo "Adding git remote and fetching $2 as $1"
+    local gr=$(git remote -v | grep $1)
+    if [ -z "${gr}" ]; then
+        git remote add $1 $2
+    fi
+    git fetch $1
+}
+
+# Setup special .ssh files
+# Prints out lines of text to make things pretty
+# Param 1: bash array, filenames relative to the customization directory that should be copied to ~/.ssh
+setup_ssh() {
+    echo "starting ~/.ssh directory setup..."
+    mkdir -p "${HOME}.ssh"
+    chmod 0700 "${HOME}/.ssh"
+    echo "-----"
+    local files=("$@")
+    for file in "${files[@]}" ; do
+        local cfile="/devcontainer-customization/${file}"
+        local hfile="${HOME}/.ssh/${file}"
+        if [ ! -f "${hfile}" ]; then
+            echo "copying \"${file}\""
+            cp "${cfile}" "${hfile}"
+            chmod 600 "${hfile}"
+        fi
+    done
+    echo "~/.ssh directory setup complete!"
+}
--- a/.devcontainer/customization/README.md
+++ b/.devcontainer/customization/README.md
@@ -0,0 +1,25 @@
+Place any additional resources your environment requires in this directory
+
+Script hooks are currently called for:
+`postcreate.sh` and `poststart.sh`
+
+If files with those names exist here, they will be called at the end of the normal script.
+
+This is a good place to set things like `git config --global user.name` are set - and to handle any other files that are mounted via this directory.
+
+To assist in doing so, `source /.devcontainer-scripts/utils.sh` will provide utility functions that may be useful - for example:
+
+```
+#!/bin/bash
+
+source "/.devcontainer-scripts/utils.sh"
+
+sshfiles=("config", "key.pub")
+
+setup_ssh "${sshfiles[@]}"
+
+config_user "YOUR NAME" "YOUR EMAIL"
+
+config_remote "REMOTE NAME" "REMOTE URL"
+
+```
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,24 @@
+{
+    "$schema": "https://raw.githubusercontent.com/devcontainers/spec/main/schemas/devContainer.schema.json",
+    "name": "LocalAI",
+    "workspaceFolder": "/workspace",
+    "dockerComposeFile": [ "./docker-compose-devcontainer.yml" ],
+    "service": "api",
+    "shutdownAction": "stopCompose",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "golang.go",
+                "ms-vscode.makefile-tools",
+                "ms-azuretools.vscode-docker",
+                "ms-python.python",
+                "ms-python.debugpy",
+                "wayou.vscode-todo-highlight",
+                "waderyan.gitblame"
+            ]
+        }
+    },
+    "forwardPorts": [8080, 3000],
+    "postCreateCommand": "bash /.devcontainer-scripts/postcreate.sh",
+    "postStartCommand": "bash /.devcontainer-scripts/poststart.sh"
+}
--- a/.devcontainer/docker-compose-devcontainer.yml
+++ b/.devcontainer/docker-compose-devcontainer.yml
@@ -0,0 +1,44 @@
+services:
+  api:
+    build:
+      context: ..
+      dockerfile: Dockerfile
+      target: devcontainer
+    env_file:
+      - ../.env
+    ports:
+      - 8080:8080
+    volumes:
+      - localai_workspace:/workspace
+      - ../models:/host-models
+      - ./customization:/devcontainer-customization
+    command: /bin/sh -c "while sleep 1000; do :; done"
+    cap_add:
+      - SYS_PTRACE
+    security_opt:
+      - seccomp:unconfined
+  prometheus:
+    image: prom/prometheus
+    container_name: prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+    ports:
+      - 9090:9090
+    restart: unless-stopped
+    volumes:
+      - ./prometheus:/etc/prometheus
+      - prom_data:/prometheus
+  grafana:
+    image: grafana/grafana
+    container_name: grafana
+    ports:
+      - 3000:3000
+    restart: unless-stopped
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_SECURITY_ADMIN_PASSWORD=grafana
+    volumes:
+      - ./grafana:/etc/grafana/provisioning/datasources
+volumes:
+  prom_data:
+  localai_workspace:
--- a/.devcontainer/grafana/datasource.yml
+++ b/.devcontainer/grafana/datasource.yml
@@ -0,0 +1,10 @@
+
+apiVersion: 1
+
+datasources:
+- name: Prometheus
+  type: prometheus
+  url: http://prometheus:9090 
+  isDefault: true
+  access: proxy
+  editable: true
--- a/.devcontainer/prometheus/prometheus.yml
+++ b/.devcontainer/prometheus/prometheus.yml
@@ -0,0 +1,21 @@
+global:
+  scrape_interval: 15s
+  scrape_timeout: 10s
+  evaluation_interval: 15s
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets: []
+    scheme: http
+    timeout: 10s
+    api_version: v1
+scrape_configs:
+- job_name: prometheus
+  honor_timestamps: true
+  scrape_interval: 15s
+  scrape_timeout: 10s
+  metrics_path: /metrics
+  scheme: http
+  static_configs:
+  - targets:
+    - localhost:9090
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,23 @@
 .idea
+.github
+.vscode
+.devcontainer
 models
+backends
 examples/chatbot-ui/models
+backend/go/image/stablediffusion-ggml/build/
+backend/go/*/build
+backend/go/*/.cache
+backend/go/*/sources
+backend/go/*/package
 examples/rwkv/models
 examples/**/models
-Dockerfile
+Dockerfile*
+__pycache__
+
+# SonarQube
+.scannerwork
+
+# backend virtual environments
+**/venv
+backend/python/**/source
--- a/.editorconfig
+++ b/.editorconfig
@@ -0,0 +1,31 @@
+
+root = true
+
+[*]
+indent_style = space
+indent_size = 2
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.go]
+indent_style = tab
+
+[Makefile]
+indent_style = tab
+
+[*.proto]
+indent_size = 2
+
+[*.py]
+indent_size = 4
+
+[*.js]
+indent_size = 2
+
+[*.yaml]
+indent_size = 2
+
+[*.md]
+trim_trailing_whitespace = false
--- a/.env
+++ b/.env
@@ -1,33 +1,36 @@
 ## Set number of threads.
 ## Note: prefer the number of physical cores. Overbooking the CPU degrades performance notably.
-# THREADS=14
+# LOCALAI_THREADS=14

 ## Specify a different bind address (defaults to ":8080")
-# ADDRESS=127.0.0.1:8080
+# LOCALAI_ADDRESS=127.0.0.1:8080

 ## Default models context size
-# CONTEXT_SIZE=512
+# LOCALAI_CONTEXT_SIZE=512
 #
 ## Define galleries.
 ## models will to install will be visible in `/models/available`
-# GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]
+# LOCALAI_GALLERIES=[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]

 ## CORS settings
-# CORS=true
-# CORS_ALLOW_ORIGINS=*
+# LOCALAI_CORS=true
+# LOCALAI_CORS_ALLOW_ORIGINS=*

 ## Default path for models
 #
-# MODELS_PATH=/models
+# LOCALAI_MODELS_PATH=/models

 ## Enable debug mode
-# DEBUG=true
+# LOCALAI_LOG_LEVEL=debug

 ## Disables COMPEL (Diffusers)
 # COMPEL=0

 ## Enable/Disable single backend (useful if only one GPU is available)
-# SINGLE_ACTIVE_BACKEND=true
+# LOCALAI_SINGLE_ACTIVE_BACKEND=true
+
+# Forces shutdown of the backends if busy (only if LOCALAI_SINGLE_ACTIVE_BACKEND is set)
+# LOCALAI_FORCE_BACKEND_SHUTDOWN=true

 ## Specify a build type. Available: cublas, openblas, clblas.
 ## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
@@ -38,21 +41,14 @@
 ## Uncomment and set to true to enable rebuilding from source
 # REBUILD=true

-## Enable go tags, available: stablediffusion, tts
-## stablediffusion: image generation with stablediffusion
-## tts: enables text-to-speech with go-piper 
-## (requires REBUILD=true)
-#
-# GO_TAGS=stablediffusion
-
 ## Path where to store generated images
-# IMAGE_PATH=/tmp
+# LOCALAI_IMAGE_PATH=/tmp/generated/images

 ## Specify a default upload limit in MB (whisper)
-# UPLOAD_LIMIT
+# LOCALAI_UPLOAD_LIMIT=15

 ## List of external GRPC backends (note on the container image this variable is already set to use extra backends available in extra/)
-# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+# LOCALAI_EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py

 ### Advanced settings ###
 ### Those are not really used by LocalAI, but from components in the stack ###
@@ -71,19 +67,36 @@
 ### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
 # LLAMACPP_PARALLEL=1

+### Define a list of GRPC Servers for llama-cpp workers to distribute the load
+# https://github.com/ggerganov/llama.cpp/pull/6829
+# https://github.com/ggerganov/llama.cpp/blob/master/tools/rpc/README.md
+# LLAMACPP_GRPC_SERVERS=""
+
 ### Enable to run parallel requests
-# PARALLEL_REQUESTS=true
+# LOCALAI_PARALLEL_REQUESTS=true
+
+# Enable to allow p2p mode
+# LOCALAI_P2P=true
+
+# Enable to use federated mode
+# LOCALAI_FEDERATED=true
+
+# Enable to start federation server
+# FEDERATED_SERVER=true
+
+# Define to use federation token
+# TOKEN=""

 ### Watchdog settings
 ###
 # Enables watchdog to kill backends that are inactive for too much time
-# WATCHDOG_IDLE=true
-#
-# Enables watchdog to kill backends that are busy for too much time
-# WATCHDOG_BUSY=true
+# LOCALAI_WATCHDOG_IDLE=true
 #
 # Time in duration format (e.g. 1h30m) after which a backend is considered idle
-# WATCHDOG_IDLE_TIMEOUT=5m
+# LOCALAI_WATCHDOG_IDLE_TIMEOUT=5m
+#
+# Enables watchdog to kill backends that are busy for too much time
+# LOCALAI_WATCHDOG_BUSY=true
 #
 # Time in duration format (e.g. 1h30m) after which a backend is considered busy
-# WATCHDOG_BUSY_TIMEOUT=5m
+# LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m
--- a/.gitattributes
+++ b/.gitattributes
@@ -1 +1,2 @@
 *.sh text eol=lf
+backend/cpp/llama/*.hpp linguist-vendored
--- a/.github/bump_deps.sh
+++ b/.github/bump_deps.sh
@@ -3,7 +3,25 @@ set -xe
 REPO=$1
 BRANCH=$2
 VAR=$3
+FILE=$4
+
+if [ -z "$FILE" ]; then
+    FILE="Makefile"
+fi

 LAST_COMMIT=$(curl -s -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")

-sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
+# Read $VAR from Makefile (only first match)
+set +e
+CURRENT_COMMIT="$(grep -m1 "^$VAR?=" $FILE | cut -d'=' -f2)"
+set -e
+
+sed -i $FILE -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
+
+if [ -z "$CURRENT_COMMIT" ]; then
+    echo "Could not find $VAR in Makefile."
+    exit 0
+fi
+
+echo "Changes: https://github.com/$REPO/compare/${CURRENT_COMMIT}..${LAST_COMMIT}" >> "${VAR}_message.txt"
+echo "${LAST_COMMIT}" >> "${VAR}_commit.txt"
--- a/.github/bump_docs.sh
+++ b/.github/bump_docs.sh
@@ -2,6 +2,6 @@
 set -xe
 REPO=$1

-LATEST_TAG=$(curl -s "https://api.github.com/repos/$REPO/releases/latest" | jq -r '.name')
+LATEST_TAG=$(curl -s "https://api.github.com/repos/$REPO/releases/latest" | jq -r '.tag_name')

 cat <<< $(jq ".version = \"$LATEST_TAG\"" docs/data/version.json) > docs/data/version.json
--- a/.github/check_and_update.py
+++ b/.github/check_and_update.py
@@ -0,0 +1,85 @@
+import hashlib
+from huggingface_hub import hf_hub_download, get_paths_info
+import requests
+import sys
+import os
+
+uri = sys.argv[1]
+file_name = uri.split('/')[-1]
+
+# Function to parse the URI and determine download method
+def parse_uri(uri):
+    if uri.startswith('huggingface://'):
+        repo_id = uri.split('://')[1]
+        return 'huggingface', repo_id.rsplit('/', 1)[0]
+    elif 'huggingface.co' in uri:
+        parts = uri.split('/resolve/')
+        if len(parts) > 1:
+            repo_path = parts[0].split('https://huggingface.co/')[-1]
+            return 'huggingface', repo_path
+    return 'direct', uri
+
+def calculate_sha256(file_path):
+    sha256_hash = hashlib.sha256()
+    with open(file_path, 'rb') as f:
+        for byte_block in iter(lambda: f.read(4096), b''):
+            sha256_hash.update(byte_block)
+    return sha256_hash.hexdigest()
+
+def manual_safety_check_hf(repo_id):
+    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
+    scan = scanResponse.json()
+    # Check if 'hasUnsafeFile' exists in the response
+    if 'hasUnsafeFile' in scan:
+        if scan['hasUnsafeFile']:
+            return scan
+        else:
+            return None
+    else:
+        return None
+
+download_type, repo_id_or_url = parse_uri(uri)
+
+new_checksum =  None
+file_path = None
+
+# Decide download method based on URI type
+if download_type == 'huggingface':
+    # Check if the repo is flagged as dangerous by HF
+    hazard = manual_safety_check_hf(repo_id_or_url)
+    if hazard != None:
+        print(f'Error: HuggingFace has detected security problems for {repo_id_or_url}: {str(hazard)}', filename=file_name)
+        sys.exit(5)
+    # Use HF API to pull sha
+    for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
+        try:
+            new_checksum = file.lfs.sha256
+            break
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+    if new_checksum is None:
+        try:
+            file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+else:
+    response = requests.get(repo_id_or_url)
+    if response.status_code == 200:
+        with open(file_name, 'wb') as f:
+            f.write(response.content)
+        file_path = file_name
+    elif response.status_code == 404:
+        print(f'File not found: {response.status_code}', file=sys.stderr)
+        sys.exit(2)
+    else:
+        print(f'Error downloading file: {response.status_code}', file=sys.stderr)
+        sys.exit(1)
+
+if new_checksum is None:
+    new_checksum = calculate_sha256(file_path)
+    print(new_checksum)
+    os.remove(file_path)
+else:
+    print(new_checksum)
--- a/.github/checksum_checker.sh
+++ b/.github/checksum_checker.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# This scripts needs yq and huggingface_hub to be installed
+# to install hugingface_hub run pip install huggingface_hub
+
+# Path to the input YAML file
+input_yaml=$1
+
+# Function to download file and check checksum using Python
+function check_and_update_checksum() {
+    model_name="$1"
+    file_name="$2"
+    uri="$3"
+    old_checksum="$4"
+    idx="$5"
+
+    # Download the file and calculate new checksum using Python
+    new_checksum=$(python3 ./.github/check_and_update.py $uri)
+    result=$?
+
+    if [[ $result -eq 5 ]]; then
+        echo "Contaminated entry detected, deleting entry for $model_name..."
+        yq eval -i "del([$idx])" "$input_yaml"
+        return
+    fi
+
+    if [[ "$new_checksum" == "" ]]; then
+        echo "Error calculating checksum for $file_name. Skipping..."
+        return
+    fi
+
+    echo "Checksum for $file_name: $new_checksum"
+
+    # Compare and update the YAML file if checksums do not match
+    
+    if [[ $result -eq 2 ]]; then
+        echo "File not found, deleting entry for $file_name..."
+        # yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\"))" "$input_yaml"
+    elif [[ "$old_checksum" != "$new_checksum" ]]; then
+        echo "Checksum mismatch for $file_name. Updating..."
+        yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\").sha256)" "$input_yaml"
+        yq eval -i "(.[$idx].files[] | select(.filename == \"$file_name\")).sha256 = \"$new_checksum\"" "$input_yaml"
+    elif [[ $result -ne 0 ]]; then
+        echo "Error downloading file $file_name. Skipping..."
+    else
+        echo "Checksum match for $file_name. No update needed."
+    fi
+}
+
+# Read the YAML and process each file
+len=$(yq eval '. | length' "$input_yaml")
+for ((i=0; i<$len; i++))
+do
+    name=$(yq eval ".[$i].name" "$input_yaml")
+    files_len=$(yq eval ".[$i].files | length" "$input_yaml")
+    for ((j=0; j<$files_len; j++))
+    do
+        filename=$(yq eval ".[$i].files[$j].filename" "$input_yaml")
+        uri=$(yq eval ".[$i].files[$j].uri" "$input_yaml")
+        checksum=$(yq eval ".[$i].files[$j].sha256" "$input_yaml")
+        echo "Checking model $name, file $filename. URI = $uri, Checksum = $checksum"
+        check_and_update_checksum "$name" "$filename" "$uri" "$checksum" "$i"
+    done
+done
--- a/.github/ci/modelslist.go
+++ b/.github/ci/modelslist.go
@@ -0,0 +1,304 @@
+package main
+
+import (
+	"fmt"
+	"html/template"
+	"io/ioutil"
+	"os"
+
+	"github.com/microcosm-cc/bluemonday"
+	"gopkg.in/yaml.v3"
+)
+
+var modelPageTemplate string = `
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>LocalAI models</title>
+    <link href="https://cdnjs.cloudflare.com/ajax/libs/flowbite/2.3.0/flowbite.min.css" rel="stylesheet" />
+    <script src="https://cdn.jsdelivr.net/npm/vanilla-lazyload@19.1.3/dist/lazyload.min.js"></script>
+
+    <link
+    rel="stylesheet"
+    href="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.8.0/build/styles/default.min.css"
+  />
+    <script
+    defer
+    src="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.8.0/build/highlight.min.js"
+  ></script>
+    <script
+    defer
+    src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"
+  ></script>
+  <script
+    defer
+    src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"
+  ></script>
+  <script
+    defer
+    src="https://cdn.jsdelivr.net/npm/dompurify@3.0.6/dist/purify.min.js"
+  ></script>
+
+  <link href="/static/general.css" rel="stylesheet" />
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Roboto:wght@400;500&display=swap" rel="stylesheet">
+    <link
+    href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700,900&display=swap"
+    rel="stylesheet" />
+  <link
+    rel="stylesheet"
+    href="https://cdn.jsdelivr.net/npm/tw-elements/css/tw-elements.min.css" />
+  <script src="https://cdn.tailwindcss.com/3.3.0"></script>
+  <script>
+    tailwind.config = {
+      darkMode: "class",
+      theme: {
+        fontFamily: {
+          sans: ["Roboto", "sans-serif"],
+          body: ["Roboto", "sans-serif"],
+          mono: ["ui-monospace", "monospace"],
+        },
+      },
+      corePlugins: {
+        preflight: false,
+      },
+    };
+  </script>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.1.1/css/all.min.css">
+    <script src="https://unpkg.com/htmx.org@1.9.12" integrity="sha384-ujb1lZYygJmzgSwoxRggbCHcjc0rB2XoQrxeTUQyRjrOnlCoYta87iKBWq3EsdM2" crossorigin="anonymous"></script>
+</head>
+
+<body class="bg-gray-900 text-gray-200">
+<div class="flex flex-col min-h-screen">
+
+<nav class="bg-gray-800 shadow-lg">
+    <div class="container mx-auto px-4 py-4">
+        <div class="flex items-center justify-between">
+            <div class="flex items-center">
+                <a href="/" class="text-white text-xl font-bold"><img src="https://github.com/mudler/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
+                <a href="/" class="text-white text-xl font-bold">LocalAI</a>
+            </div>
+            <!-- Menu button for small screens -->
+            <div class="lg:hidden">
+                <button id="menu-toggle" class="text-gray-400 hover:text-white focus:outline-none">
+                    <i class="fas fa-bars fa-lg"></i>
+                </button>
+            </div>
+            <!-- Navigation links -->
+            <div class="hidden lg:flex lg:items-center lg:justify-end lg:flex-1 lg:w-0">
+                <a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
+            </div>
+        </div>
+        <!-- Collapsible menu for small screens -->
+        <div class="hidden lg:hidden" id="mobile-menu">
+            <div class="pt-4 pb-3 border-t border-gray-700">
+
+                <a href="https://localai.io" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
+
+            </div>
+        </div>
+    </div>
+</nav>
+
+<style>
+  .is-hidden {
+	display: none;
+	  }
+</style>
+
+<div class="container mx-auto px-4 flex-grow">
+
+<div class="models mt-12">
+	<h2 class="text-center text-3xl font-semibold text-gray-100">
+	LocalAI model gallery list </h2><br>
+
+	<h2 class="text-center text-3xl font-semibold text-gray-100">
+
+	 🖼️ Available {{.AvailableModels}} models</i> <a href="https://localai.io/models/" target="_blank" >
+			<i class="fas fa-circle-info pr-2"></i>
+		</a></h2>
+
+	<h3>
+	Refer to the Model gallery <a href="https://localai.io/models/" target="_blank" ><i class="fas fa-circle-info pr-2"></i></a> for more information on how to use the models with LocalAI.<br>
+
+	You can install models with the CLI command <code>local-ai models install <model-name></code>. or by using the WebUI.
+	</h3>
+
+	<input class="form-control appearance-none block w-full mt-5 px-3 py-2 text-base font-normal text-gray-300 pb-2 mb-5 bg-gray-800 bg-clip-padding border border-solid border-gray-600 rounded transition ease-in-out m-0 focus:text-gray-300 focus:bg-gray-900 focus:border-blue-500 focus:outline-none" type="search"
+	id="searchbox" placeholder="Live search keyword..">
+	  <div class="dark grid grid-cols-1 grid-rows-1 md:grid-cols-3 block rounded-lg shadow-secondary-1 dark:bg-surface-dark">
+		{{ range $_, $model := .Models }}
+		<div class="box me-4 mb-2 block rounded-lg bg-white shadow-secondary-1  dark:bg-gray-800 dark:bg-surface-dark dark:text-white text-surface pb-2">
+		<div>
+		    {{ $icon := "https://upload.wikimedia.org/wikipedia/commons/6/65/No-Image-Placeholder.svg" }}
+			{{ if $model.Icon }}
+	  		{{ $icon = $model.Icon }}
+	  		{{ end }}
+			<div class="flex justify-center items-center">
+				<img data-src="{{ $icon }}" alt="{{$model.Name}}" class="rounded-t-lg max-h-48 max-w-96 object-cover mt-3 lazy">
+			</div>
+	  		<div class="p-6 text-surface dark:text-white">
+				<h5 class="mb-2 text-xl font-medium leading-tight">{{$model.Name}}</h5>
+
+
+				<p class="mb-4 text-base truncate">{{ $model.Description }}</p>
+
+			</div>
+			<div class="px-6 pt-4 pb-2">
+
+      <!-- Modal toggle -->
+      <button data-modal-target="{{ $model.Name}}-modal" data-modal-toggle="{{ $model.Name }}-modal" class="block text-white bg-blue-700 hover:bg-blue-800 focus:ring-4 focus:outline-none focus:ring-blue-300 font-medium rounded-lg text-sm px-5 py-2.5 text-center dark:bg-blue-600 dark:hover:bg-blue-700 dark:focus:ring-blue-800" type="button">
+        More info
+      </button>
+
+    <!-- Main modal -->
+    <div id="{{ $model.Name}}-modal" tabindex="-1" aria-hidden="true" class="hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full">
+        <div class="relative p-4 w-full max-w-2xl max-h-full">
+            <!-- Modal content -->
+            <div class="relative bg-white rounded-lg shadow dark:bg-gray-700">
+                <!-- Modal header -->
+                <div class="flex items-center justify-between p-4 md:p-5 border-b rounded-t dark:border-gray-600">
+                    <h3 class="text-xl font-semibold text-gray-900 dark:text-white">
+                        {{ $model.Name}}
+                    </h3>
+                    <button type="button" class="text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white" data-modal-hide="{{$model.Name}}-modal">
+                        <svg class="w-3 h-3" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 14 14">
+                            <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="m1 1 6 6m0 0 6 6M7 7l6-6M7 7l-6 6"/>
+                        </svg>
+                        <span class="sr-only">Close modal</span>
+                    </button>
+                </div>
+                <!-- Modal body -->
+                <div class="p-4 md:p-5 space-y-4">
+                    <div class="flex justify-center items-center">
+                    <img data-src="{{ $icon }}" alt="{{$model.Name}}" class="lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3">
+                  </div>
+
+                    <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
+                    {{ $model.Description }}
+
+                    </p>
+
+                    <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
+                    To install the model with the CLI, run: <br>
+                    <code> local-ai models install {{$model.Name}} </code> <br>
+
+                    <hr>
+                    See also <a href="https://localai.io/models/" target="_blank" >
+                    Installation <i class="fas fa-circle-info pr-2"></i>
+                    </a> to see how to install models with the REST API.
+                    </p>
+
+                    <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
+                    <ul>
+                    {{ range $_, $u := $model.URLs }}
+                    <li><a href="{{ $u }}" target=_blank><i class="fa-solid fa-link"></i> {{ $u }}</a></li>
+                    {{ end }}
+                    </ul>
+                    </p>
+                </div>
+                <!-- Modal footer -->
+                <div class="flex items-center p-4 md:p-5 border-t border-gray-200 rounded-b dark:border-gray-600">
+                    <button data-modal-hide="{{ $model.Name}}-modal" type="button" class="py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700">Close</button>
+                </div>
+            </div>
+        </div>
+    </div>
+
+
+			</div>
+		</div>
+		</div>
+		{{ end }}
+
+		</div>
+  </div>
+</div>
+
+<script>
+var lazyLoadInstance = new LazyLoad({
+  // Your custom settings go here
+});
+
+let cards = document.querySelectorAll('.box')
+
+function liveSearch() {
+    let search_query = document.getElementById("searchbox").value;
+
+    //Use innerText if all contents are visible
+    //Use textContent for including hidden elements
+    for (var i = 0; i < cards.length; i++) {
+        if(cards[i].textContent.toLowerCase()
+                .includes(search_query.toLowerCase())) {
+            cards[i].classList.remove("is-hidden");
+        } else {
+            cards[i].classList.add("is-hidden");
+        }
+    }
+}
+
+//A little delay
+let typingTimer;
+let typeInterval = 500;
+let searchInput = document.getElementById('searchbox');
+
+searchInput.addEventListener('keyup', () => {
+    clearTimeout(typingTimer);
+    typingTimer = setTimeout(liveSearch, typeInterval);
+});
+</script>
+
+</div>
+
+<script src="https://cdnjs.cloudflare.com/ajax/libs/flowbite/2.3.0/flowbite.min.js"></script>
+</body>
+</html>
+`
+
+type GalleryModel struct {
+	Name        string   `json:"name" yaml:"name"`
+	URLs        []string `json:"urls" yaml:"urls"`
+	Icon        string   `json:"icon" yaml:"icon"`
+	Description string   `json:"description" yaml:"description"`
+}
+
+func main() {
+	// read the YAML file which contains the models
+
+	f, err := ioutil.ReadFile(os.Args[1])
+	if err != nil {
+		fmt.Println("Error reading file:", err)
+		return
+	}
+
+	models := []*GalleryModel{}
+	err = yaml.Unmarshal(f, &models)
+	if err != nil {
+		// write to stderr
+		os.Stderr.WriteString("Error unmarshaling YAML: " + err.Error() + "\n")
+		return
+	}
+
+	// Ensure that all arbitrary text content is sanitized before display
+	for i, m := range models {
+		models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
+		models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
+	}
+
+	// render the template
+	data := struct {
+		Models          []*GalleryModel
+		AvailableModels int
+	}{
+		Models:          models,
+		AvailableModels: len(models),
+	}
+	tmpl := template.Must(template.New("modelPage").Parse(modelPageTemplate))
+
+	err = tmpl.Execute(os.Stdout, data)
+	if err != nil {
+		fmt.Println("Error executing template:", err)
+		return
+	}
+}
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,119 @@
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+version: 2
+updates:
+  - package-ecosystem: "gitsubmodule"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "gomod"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    ignore:
+    - dependency-name: "github.com/mudler/LocalAI/pkg/grpc/proto"
+  - package-ecosystem: "github-actions"
+    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
+    directory: "/"
+    schedule:
+      # Check for updates to GitHub Actions every weekday
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
+    directory: "/"
+    schedule:
+      # Check for updates to GitHub Actions every weekday
+      interval: "weekly"
+  - package-ecosystem: "docker"
+    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
+    directory: "/"
+    schedule:
+      # Check for updates to GitHub Actions every weekday
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/bark"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/common/template"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/coqui"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/diffusers"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/exllama"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/exllama2"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/mamba"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/openvoice"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/rerankers"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/sentencetransformers"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/transformers"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/backend/python/vllm"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/examples/chainlit"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/examples/functions"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/examples/langchain/langchainpy-localai-example"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/examples/langchain-chroma"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/examples/streamlit-bot"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "docker"
+    directory: "/examples/k8sgpt"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "docker"
+    directory: "/examples/kubernetes"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "docker"
+    directory: "/examples/langchain"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "gomod"
+    directory: "/examples/semantic-todo"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "docker"
+    directory: "/examples/telegram-bot"
+    schedule:
+      interval: "weekly"
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -0,0 +1,33 @@
+enhancement:
+ - head-branch: ['^feature', 'feature']
+
+dependencies:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: 'Makefile'
+  - changed-files:
+    - any-glob-to-any-file: '*.mod'
+  - changed-files:
+    - any-glob-to-any-file: '*.sum'
+
+kind/documentation:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: 'docs/*'
+  - changed-files:
+    - any-glob-to-any-file: '*.md'
+
+area/ai-model:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: 'gallery/*'
+
+examples:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: 'examples/*'
+
+ci:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: '.github/*'
--- a/.github/release.yml
+++ b/.github/release.yml
@@ -12,13 +12,26 @@ changelog:
    - title: "Bug fixes :bug:"
      labels:
        - bug
+        - regression
+    - title: "🖧 P2P area"
+      labels:
+         - area/p2p
    - title: Exciting New Features 🎉
      labels:
        - Semver-Minor
        - enhancement
+        - ux
+        - roadmap
+    - title: 🧠 Models
+      labels:
+        - area/ai-model
+    - title: 📖 Documentation and examples
+      labels:
+        - kind/documentation
+        - examples
    - title: 👒 Dependencies
      labels:
        - dependencies
    - title: Other Changes
      labels:
-        - "*"
+        - "*"
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
--- a/.github/workflows/backend_build.yml
+++ b/.github/workflows/backend_build.yml
@@ -0,0 +1,243 @@
+---
+name: 'build python backend container images (reusable)'
+
+on:
+  workflow_call:
+    inputs:
+      base-image:
+        description: 'Base image'
+        required: true
+        type: string
+      build-type:
+        description: 'Build type'
+        default: ''
+        type: string
+      cuda-major-version:
+        description: 'CUDA major version'
+        default: "12"
+        type: string
+      cuda-minor-version:
+        description: 'CUDA minor version'
+        default: "1"
+        type: string
+      platforms:
+        description: 'Platforms'
+        default: ''
+        type: string
+      tag-latest:
+        description: 'Tag latest'
+        default: ''
+        type: string
+      tag-suffix:
+        description: 'Tag suffix'
+        default: ''
+        type: string
+      runs-on:
+        description: 'Runs on'
+        required: true
+        default: ''
+        type: string
+      backend:
+        description: 'Backend to build'
+        required: true
+        type: string
+      context:
+        description: 'Build context'
+        required: true
+        type: string
+      dockerfile:
+        description: 'Build Dockerfile'
+        required: true
+        type: string
+      skip-drivers:
+        description: 'Skip drivers'
+        default: 'false'
+        type: string
+    secrets:
+      dockerUsername:
+        required: false
+      dockerPassword:
+        required: false
+      quayUsername:
+        required: true
+      quayPassword:
+        required: true
+
+jobs:
+  backend-build:
+    runs-on: ${{ inputs.runs-on }}
+    env:
+        quay_username: ${{ secrets.quayUsername }}
+    steps:
+
+
+      - name: Free Disk Space (Ubuntu)
+        if: inputs.runs-on == 'ubuntu-latest'
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
+
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
+
+      - name: Checkout
+        uses: actions/checkout@v5
+
+      - name: Release space from worker
+        if: inputs.runs-on == 'ubuntu-latest'
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools snapd || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools snapd || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get remove -y microsoft-edge-stable || true
+          sudo apt-get remove -y firefox || true
+          sudo apt-get remove -y powershell || true
+          sudo apt-get remove -y r-base-core || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf "/usr/local/share/boost" || true
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
+          df -h
+
+      - name: Docker meta
+        id: meta
+        if: github.event_name != 'pull_request'
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/go-skynet/local-ai-backends
+            localai/localai-backends
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.tag-suffix }},onlatest=true
+
+      - name: Docker meta for PR
+        id: meta_pull_request
+        if: github.event_name == 'pull_request'
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/go-skynet/ci-tests
+          tags: |
+            type=ref,event=branch,suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
+            type=semver,pattern={{raw}},suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
+            type=sha,suffix=${{ github.event.number }}-${{ inputs.backend }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.tag-suffix }},onlatest=true
+## End testing image
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Login to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.dockerUsername }}
+          password: ${{ secrets.dockerPassword }}
+
+      - name: Login to Quay.io
+        if: ${{ env.quay_username != '' }}
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.quayUsername }}
+          password: ${{ secrets.quayPassword }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        if: github.event_name != 'pull_request'
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BUILD_TYPE=${{ inputs.build-type }}
+            SKIP_DRIVERS=${{ inputs.skip-drivers }}
+            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
+            BASE_IMAGE=${{ inputs.base-image }}
+            BACKEND=${{ inputs.backend }}
+          context: ${{ inputs.context }}
+          file: ${{ inputs.dockerfile }}
+          cache-from: type=gha
+          platforms: ${{ inputs.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+
+      - name: Build and push (PR)
+        uses: docker/build-push-action@v6
+        if: github.event_name == 'pull_request'
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BUILD_TYPE=${{ inputs.build-type }}
+            SKIP_DRIVERS=${{ inputs.skip-drivers }}
+            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
+            BASE_IMAGE=${{ inputs.base-image }}
+            BACKEND=${{ inputs.backend }}
+          context: ${{ inputs.context }}
+          file: ${{ inputs.dockerfile }}
+          cache-from: type=gha
+          platforms: ${{ inputs.platforms }}
+          push: ${{ env.quay_username != '' }}
+          tags: ${{ steps.meta_pull_request.outputs.tags }}
+          labels: ${{ steps.meta_pull_request.outputs.labels }}
+
+
+
+      - name: job summary
+        run: |
+          echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/backend_build_darwin.yml
+++ b/.github/workflows/backend_build_darwin.yml
@@ -0,0 +1,144 @@
+---
+name: 'build darwin python backend container images (reusable)'
+
+on:
+  workflow_call:
+    inputs:
+      backend:
+        description: 'Backend to build'
+        required: true
+        type: string
+      build-type:
+        description: 'Build type (e.g., mps)'
+        default: ''
+        type: string
+      use-pip:
+        description: 'Use pip to install dependencies'
+        default: false
+        type: boolean
+      lang:
+        description: 'Programming language (e.g. go)'
+        default: 'python'
+        type: string
+      go-version:
+        description: 'Go version to use'
+        default: '1.24.x'
+        type: string
+      tag-suffix:
+        description: 'Tag suffix for the built image'
+        required: true
+        type: string
+      runs-on:
+        description: 'Runner to use'
+        default: 'macOS-14'
+        type: string
+    secrets:
+      dockerUsername:
+        required: false
+      dockerPassword:
+        required: false
+      quayUsername:
+        required: true
+      quayPassword:
+        required: true
+
+jobs:
+  darwin-backend-build:
+    runs-on: ${{ inputs.runs-on }}
+    strategy:
+      matrix:
+        go-version: ['${{ inputs.go-version }}']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v5
+        with:
+          submodules: true
+
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+
+      - name: Dependencies
+        run: |
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
+
+      - name: Build ${{ inputs.backend }}-darwin
+        run: |
+          make protogen-go
+          BACKEND=${{ inputs.backend }} BUILD_TYPE=${{ inputs.build-type }} USE_PIP=${{ inputs.use-pip }} make build-darwin-${{ inputs.lang }}-backend
+
+      - name: Upload ${{ inputs.backend }}.tar
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ inputs.backend }}-tar
+          path: backend-images/${{ inputs.backend }}.tar
+
+  darwin-backend-publish:
+    needs: darwin-backend-build
+    if: github.event_name != 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download ${{ inputs.backend }}.tar
+        uses: actions/download-artifact@v5
+        with:
+          name: ${{ inputs.backend }}-tar
+          path: .
+
+      - name: Install crane
+        run: |
+          curl -L https://github.com/google/go-containerregistry/releases/latest/download/go-containerregistry_Linux_x86_64.tar.gz | tar -xz
+          sudo mv crane /usr/local/bin/
+
+      - name: Log in to DockerHub
+        run: |
+          echo "${{ secrets.dockerPassword }}" | crane auth login docker.io -u "${{ secrets.dockerUsername }}" --password-stdin
+
+      - name: Log in to quay.io
+        run: |
+          echo "${{ secrets.quayPassword }}" | crane auth login quay.io -u "${{ secrets.quayUsername }}" --password-stdin
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            localai/localai-backends
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=auto
+            suffix=${{ inputs.tag-suffix }},onlatest=true
+
+      - name: Docker meta
+        id: quaymeta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/go-skynet/local-ai-backends
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=auto
+            suffix=${{ inputs.tag-suffix }},onlatest=true
+
+      - name: Push Docker image (DockerHub)
+        run: |
+          for tag in $(echo "${{ steps.meta.outputs.tags }}" | tr ',' '\n'); do
+            crane push ${{ inputs.backend }}.tar $tag
+          done
+
+      - name: Push Docker image (Quay)
+        run: |
+          for tag in $(echo "${{ steps.quaymeta.outputs.tags }}" | tr ',' '\n'); do
+            crane push ${{ inputs.backend }}.tar $tag
+          done
--- a/.github/workflows/backend_pr.yml
+++ b/.github/workflows/backend_pr.yml
@@ -0,0 +1,78 @@
+name: 'build backend container images (PR-filtered)'
+
+on:
+  pull_request:
+
+concurrency:
+  group: ci-backends-pr-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  generate-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      matrix-darwin: ${{ steps.set-matrix.outputs.matrix-darwin }}
+      has-backends: ${{ steps.set-matrix.outputs.has-backends }}
+      has-backends-darwin: ${{ steps.set-matrix.outputs.has-backends-darwin }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+
+      - name: Install dependencies
+        run: |
+          bun add js-yaml
+          bun add @octokit/core
+
+      # filters the matrix in backend.yml
+      - name: Filter matrix for changed backends
+        id: set-matrix
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_EVENT_PATH: ${{ github.event_path }}
+        run: bun run scripts/changed-backends.js
+
+  backend-jobs:
+    needs: generate-matrix
+    uses: ./.github/workflows/backend_build.yml
+    if: needs.generate-matrix.outputs.has-backends == 'true'
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+      backend: ${{ matrix.backend }}
+      dockerfile: ${{ matrix.dockerfile }}
+      skip-drivers: ${{ matrix.skip-drivers }}
+      context: ${{ matrix.context }}
+    secrets:
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      fail-fast: true
+      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
+  backend-jobs-darwin:
+    needs: generate-matrix
+    uses: ./.github/workflows/backend_build_darwin.yml
+    if: needs.generate-matrix.outputs.has-backends-darwin == 'true'
+    with:
+      backend: ${{ matrix.backend }}
+      build-type: ${{ matrix.build-type }}
+      go-version: "1.24.x"
+      tag-suffix: ${{ matrix.tag-suffix }}
+      lang: ${{ matrix.lang || 'python' }}
+      use-pip: ${{ matrix.backend == 'diffusers' }}
+      runs-on: "macOS-14"
+    secrets:
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      fail-fast: true
+      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix-darwin) }}
--- a/.github/workflows/build-test.yaml
+++ b/.github/workflows/build-test.yaml
@@ -0,0 +1,67 @@
+name: Build test
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+
+jobs:
+  build-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: 1.23
+      - name: Run GoReleaser
+        run: |
+          make dev-dist
+  launcher-build-darwin:
+    runs-on: macos-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: 1.23
+      - name: Build launcher for macOS ARM64
+        run: |
+          make build-launcher-darwin
+          ls -liah dist
+      - name: Upload macOS launcher artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: launcher-macos
+          path: dist/
+          retention-days: 30
+      
+  launcher-build-linux:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: 1.23
+      - name: Build launcher for Linux
+        run: |
+          sudo apt-get update
+          sudo apt-get install golang gcc libgl1-mesa-dev xorg-dev libxkbcommon-dev
+          make build-launcher-linux
+      - name: Upload Linux launcher artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: launcher-linux
+          path: local-ai-launcher-linux.tar.xz
+          retention-days: 30
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -9,54 +9,54 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - repository: "go-skynet/go-llama.cpp"
-            variable: "GOLLAMA_VERSION"
+          - repository: "ggml-org/llama.cpp"
+            variable: "LLAMA_VERSION"
            branch: "master"
-          - repository: "ggerganov/llama.cpp"
-            variable: "CPPLLAMA_VERSION"
-            branch: "master"
-          - repository: "go-skynet/go-ggml-transformers.cpp"
-            variable: "GOGGMLTRANSFORMERS_VERSION"
-            branch: "master"
-          - repository: "donomii/go-rwkv.cpp"
-            variable: "RWKV_VERSION"
-            branch: "main"
-          - repository: "ggerganov/whisper.cpp"
+            file: "backend/cpp/llama-cpp/Makefile"
+          - repository: "ggml-org/whisper.cpp"
            variable: "WHISPER_CPP_VERSION"
            branch: "master"
-          - repository: "go-skynet/go-bert.cpp"
-            variable: "BERT_VERSION"
-            branch: "master"
-          - repository: "go-skynet/bloomz.cpp"
-            variable: "BLOOMZ_VERSION"
+            file: "backend/go/whisper/Makefile"
+          - repository: "PABannier/bark.cpp"
+            variable: "BARKCPP_VERSION"
            branch: "main"
-          - repository: "nomic-ai/gpt4all"
-            variable: "GPT4ALL_VERSION"
-            branch: "main"
-          - repository: "mudler/go-ggllm.cpp"
-            variable: "GOGGLLM_VERSION"
-            branch: "master"
-          - repository: "mudler/go-stable-diffusion"
-            variable: "STABLEDIFFUSION_VERSION"
+            file: "Makefile"
+          - repository: "leejet/stable-diffusion.cpp"
+            variable: "STABLEDIFFUSION_GGML_VERSION"
            branch: "master"
+            file: "backend/go/stablediffusion-ggml/Makefile"
          - repository: "mudler/go-piper"
            variable: "PIPER_VERSION"
            branch: "master"
+            file: "backend/go/piper/Makefile"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
      - name: Bump dependencies 🔧
+        id: bump
        run: |
-          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
+          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }} ${{ matrix.file }}
+          {
+            echo 'message<<EOF'
+            cat "${{ matrix.variable }}_message.txt"
+            echo EOF
+          } >> "$GITHUB_OUTPUT"
+          {
+            echo 'commit<<EOF'
+            cat "${{ matrix.variable }}_commit.txt"
+            echo EOF
+          } >> "$GITHUB_OUTPUT"
+          rm -rfv ${{ matrix.variable }}_message.txt
+          rm -rfv ${{ matrix.variable }}_commit.txt
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v7
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Update ${{ matrix.repository }}'
-          title: ':arrow_up: Update ${{ matrix.repository }}'
+          title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
          branch: "update/${{ matrix.variable }}"
-          body: Bump of ${{ matrix.repository }} version
+          body: ${{ steps.bump.outputs.message }}
          signoff: true


--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -12,17 +12,17 @@ jobs:
          - repository: "mudler/LocalAI"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_docs.sh ${{ matrix.repository }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v7
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
          commit-message: ':arrow_up: Update docs version ${{ matrix.repository }}'
-          title: ':arrow_up: Update docs version ${{ matrix.repository }}'
+          title: 'docs: :arrow_up: update docs version ${{ matrix.repository }}'
          branch: "update/docs"
          body: Bump of ${{ matrix.repository }} version inside docs
          signoff: true
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -0,0 +1,46 @@
+name: Check if checksums are up-to-date
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  checksum_check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
+      - uses: actions/checkout@v5
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y pip wget
+          pip install huggingface_hub
+      - name: 'Setup yq'
+        uses: dcarbone/install-yq-action@v1.3.1
+        with:
+          version: 'v4.44.2'
+          download-compressed: true
+          force: true
+
+      - name: Checksum checker 🔧
+        run: |
+          export HF_HOME=/hf_cache
+          sudo mkdir /hf_cache
+          sudo chmod 777 /hf_cache
+          bash .github/checksum_checker.sh gallery/index.yaml
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v7
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: ':arrow_up: Checksum updates in gallery/index.yaml'
+          title: 'chore(model-gallery): :arrow_up: update checksum'
+          branch: "update/checksum"
+          body: Updating checksums in gallery/index.yaml
+          signoff: true
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -0,0 +1,43 @@
+name: Dependabot auto-merge
+on:
+- pull_request_target
+
+permissions:
+  contents: write
+  pull-requests: write
+  packages: read
+
+jobs:
+  dependabot:
+    runs-on: ubuntu-latest
+    if: ${{ github.actor == 'dependabot[bot]' }}
+    steps:
+      - name: Dependabot metadata
+        id: metadata
+        uses: dependabot/fetch-metadata@v2.4.0
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          skip-commit-verification: true
+
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Approve a PR if not already approved
+        run: |
+          gh pr checkout "$PR_URL"
+            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
+          then
+            gh pr review --approve "$PR_URL"
+          else
+            echo "PR already approved.";
+          fi
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
+
+      - name: Enable auto-merge for Dependabot PRs
+        if: ${{ contains(github.event.pull_request.title, 'bump')}}
+        run: gh pr merge --auto --squash "$PR_URL"
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/deploy-explorer.yaml
+++ b/.github/workflows/deploy-explorer.yaml
@@ -0,0 +1,64 @@
+name: Explorer deployment
+
+on:
+  push:
+    branches:
+      - master
+    tags:
+      - 'v*'
+
+concurrency:
+  group: ci-deploy-${{ github.head_ref || github.ref }}-${{ github.repository }}
+
+jobs:
+  build-linux:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v5
+        with:
+          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          make protogen-go
+      - name: Build api
+        run: |
+          CGO_ENABLED=0 make build
+      - name: rm
+        uses: appleboy/ssh-action@v1.2.2
+        with:
+            host: ${{ secrets.EXPLORER_SSH_HOST }}
+            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
+            key: ${{ secrets.EXPLORER_SSH_KEY }}
+            port: ${{ secrets.EXPLORER_SSH_PORT }}
+            script: |
+                sudo rm -rf local-ai/ || true
+      - name: copy file via ssh
+        uses: appleboy/scp-action@v1.0.0
+        with:
+            host: ${{ secrets.EXPLORER_SSH_HOST }}
+            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
+            key: ${{ secrets.EXPLORER_SSH_KEY }}
+            port: ${{ secrets.EXPLORER_SSH_PORT }}
+            source: "local-ai"
+            overwrite: true
+            rm: true
+            target: ./local-ai
+      - name: restarting
+        uses: appleboy/ssh-action@v1.2.2
+        with:
+            host: ${{ secrets.EXPLORER_SSH_HOST }}
+            username: ${{ secrets.EXPLORER_SSH_USERNAME }}
+            key: ${{ secrets.EXPLORER_SSH_KEY }}
+            port: ${{ secrets.EXPLORER_SSH_PORT }}
+            script: |
+                sudo cp -rfv local-ai/local-ai /usr/bin/local-ai
+                sudo systemctl restart local-ai
--- a/.github/workflows/disabled/comment-pr.yaml
+++ b/.github/workflows/disabled/comment-pr.yaml
@@ -0,0 +1,83 @@
+name: Comment PRs
+on:
+  pull_request_target:
+
+jobs:
+  comment-pr:
+    env:
+        MODEL_NAME: hermes-2-theta-llama-3-8b
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+      with:
+        ref: "${{ github.event.pull_request.merge_commit_sha }}"
+        fetch-depth: 0 # needed to checkout all branches for this Action to work
+    - uses: mudler/localai-github-action@v1
+      with:
+        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+      # Check the PR diff using the current branch and the base branch of the PR
+    - uses: GrantBirki/git-diff-action@v2.7.0
+      id: git-diff-action
+      with:
+            json_diff_file_output: diff.json
+            raw_diff_file_output: diff.txt
+            file_output_only: "true"
+            base_branch: ${{ github.event.pull_request.base.sha }}
+    - name: Show diff
+      env:
+        DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
+      run: |
+            cat $DIFF
+    - name: Summarize
+      env:
+        DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
+      id: summarize
+      run: |
+            input="$(cat $DIFF)"
+
+            # Define the LocalAI API endpoint
+            API_URL="http://localhost:8080/chat/completions"
+
+            # Create a JSON payload using jq to handle special characters
+            json_payload=$(jq -n --arg input "$input" '{
+            model: "'$MODEL_NAME'",
+            messages: [
+                {
+                role: "system",
+                content: "You are LocalAI-bot in Github that helps understanding PRs and assess complexity. Explain what has changed in this PR diff and why"
+                },
+                {
+                role: "user",
+                content: $input
+                }
+            ]
+            }')
+
+            # Send the request to LocalAI
+            response=$(curl -s -X POST $API_URL \
+            -H "Content-Type: application/json" \
+            -d "$json_payload")
+
+            # Extract the summary from the response
+            summary="$(echo $response | jq -r '.choices[0].message.content')"
+
+            # Print the summary
+            #  -H "Authorization: Bearer $API_KEY" \
+            echo "Summary:"
+            echo "$summary"
+            echo "payload sent"
+            echo "$json_payload"
+            {
+                echo 'message<<EOF'
+                echo "$summary"
+                echo EOF
+              } >> "$GITHUB_OUTPUT"
+            docker logs --tail 10 local-ai
+    - uses: mshick/add-pr-comment@v2
+      if: always()
+      with:
+          repo-token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          message: ${{ steps.summarize.outputs.message }}
+          message-failure: |
+            Uh oh! Could not analyze this PR, maybe it's too big?
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -0,0 +1,95 @@
+name: 'generate and publish GRPC docker caches'
+
+on:
+  workflow_dispatch:
+
+  schedule:
+    # daily at midnight
+    - cron: '0 0 * * *'
+
+concurrency:
+  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  generate_caches:
+    strategy:
+      matrix:
+        include:
+          - grpc-base-image: ubuntu:22.04
+            runs-on: 'ubuntu-latest'
+            platforms: 'linux/amd64,linux/arm64'
+    runs-on: ${{matrix.runs-on}}
+    steps:
+      - name: Release space from worker
+        if: matrix.runs-on == 'ubuntu-latest'
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get remove -y microsoft-edge-stable || true
+          sudo apt-get remove -y firefox || true
+          sudo apt-get remove -y powershell || true
+          sudo apt-get remove -y r-base-core || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf "/usr/local/share/boost" || true
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
+          df -h
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Checkout
+        uses: actions/checkout@v5
+
+      - name: Cache GRPC
+        uses: docker/build-push-action@v6
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          build-args: |
+            GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.65.0
+          context: .
+          file: ./Dockerfile
+          cache-to: type=gha,ignore-error=true
+          cache-from: type=gha
+          target: grpc
+          platforms: ${{ matrix.platforms }}
+          push: false
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -0,0 +1,59 @@
+name: 'generate and publish intel docker caches'
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+
+concurrency:
+  group: intel-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  generate_caches:
+    strategy:
+      matrix:
+        include:
+          - base-image: intel/oneapi-basekit:2025.2.0-0-devel-ubuntu22.04
+            runs-on: 'ubuntu-latest'
+            platforms: 'linux/amd64'
+    runs-on: ${{matrix.runs-on}}
+    steps:
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+      - name: Login to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+
+      - name: Login to quay
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Checkout
+        uses: actions/checkout@v5
+
+      - name: Cache Intel images
+        uses: docker/build-push-action@v6
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BASE_IMAGE=${{ matrix.base-image }}
+          context: .
+          file: ./Dockerfile
+          tags: quay.io/go-skynet/intel-oneapi-base:latest
+          push: true
+          target: intel
+          platforms: ${{ matrix.platforms }}
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -9,19 +9,19 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  extras-image-build:
+  image-build:
    uses: ./.github/workflows/image_build.yml
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      makeflags: ${{ matrix.makeflags }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -30,87 +30,39 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
+      fail-fast: false
      matrix:
        include:
-          - build-type: ''
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-            base-image: "ubuntu:22.04"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "1"
+            cuda-minor-version: "8"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
+            tag-suffix: '-gpu-nvidia-cuda-12'
+            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
-            runs-on: 'arc-runner-set'
-          - build-type: 'sycl_f16'
+            base-image: "rocm/dev-ubuntu-22.04:6.4.3"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'ubuntu-latest'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
-            tag-suffix: 'sycl-f16-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-  core-image-build:
-    uses: ./.github/workflows/image_build.yml
-    with:
-      tag-latest: ${{ matrix.tag-latest }}
-      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
-      build-type: ${{ matrix.build-type }}
-      cuda-major-version: ${{ matrix.cuda-major-version }}
-      cuda-minor-version: ${{ matrix.cuda-minor-version }}
-      platforms: ${{ matrix.platforms }}
-      runs-on: ${{ matrix.runs-on }}
-      base-image: ${{ matrix.base-image }}
-    secrets:
-      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
-      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
-      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-    strategy:
-      matrix:
-        include:
-          - build-type: ''
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: 'sycl'
+            runs-on: 'ubuntu-latest'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'vulkan'
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
+            tag-suffix: '-vulkan-core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-          - build-type: 'sycl_f16'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
-            tag-suffix: 'sycl-f16-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'arc-runner-set'
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
-            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -13,192 +13,127 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  self-hosted-jobs:
+  hipblas-jobs:
    uses: ./.github/workflows/image_build.yml
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      aio: ${{ matrix.aio }}
+      makeflags: ${{ matrix.makeflags }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
    strategy:
-      # Pushing with all jobs in parallel
-      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
      matrix:
        include:
-          # Extra images
-          - build-type: ''
-            #platforms: 'linux/amd64,linux/arm64'
+          - build-type: 'hipblas'
            platforms: 'linux/amd64'
            tag-latest: 'auto'
-            tag-suffix: ''
-            ffmpeg: ''
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-            base-image: "ubuntu:22.04"
-          - build-type: ''
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-            base-image: "ubuntu:22.04"
-          - build-type: 'cublas'
-            cuda-major-version: "11"
-            cuda-minor-version: "7"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11'
-            ffmpeg: ''
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-            base-image: "ubuntu:22.04"
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12'
-            ffmpeg: ''
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-            base-image: "ubuntu:22.04"
-          - build-type: 'cublas'
-            cuda-major-version: "11"
-            cuda-minor-version: "7"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-            base-image: "ubuntu:22.04"
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-            base-image: "ubuntu:22.04"
-          - build-type: ''
-            #platforms: 'linux/amd64,linux/arm64'
-            platforms: 'linux/amd64'
-            tag-latest: 'auto'
-            tag-suffix: ''
-            ffmpeg: ''
-            image-type: 'extras'
-            base-image: "ubuntu:22.04"
-            runs-on: 'arc-runner-set'
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
-            runs-on: 'arc-runner-set'
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas'
-            ffmpeg: 'false'
-            image-type: 'extras'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
-            runs-on: 'arc-runner-set'
-          - build-type: 'sycl_f16'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
-            tag-suffix: '-sycl-f16-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-          - build-type: 'sycl_f32'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
-            tag-suffix: '-sycl-f32-ffmpeg'
-            ffmpeg: 'true'
-            image-type: 'extras'
-            runs-on: 'arc-runner-set'
-          # Core images
-          - build-type: 'sycl_f16'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
-            tag-suffix: '-sycl-f16-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            runs-on: 'arc-runner-set'
-          - build-type: 'sycl_f32'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
-            tag-suffix: '-sycl-f32-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            runs-on: 'arc-runner-set'
-          - build-type: 'sycl_f16'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
-            tag-suffix: '-sycl-f16-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'arc-runner-set'
-          - build-type: 'sycl_f32'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04"
-            tag-suffix: '-sycl-f32-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'arc-runner-set'
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
-            runs-on: 'arc-runner-set'
-          - build-type: 'hipblas'
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-hipblas-core'
-            ffmpeg: 'false'
-            image-type: 'core'
-            base-image: "rocm/dev-ubuntu-22.04:6.0-complete"
-            runs-on: 'arc-runner-set'
-  
+            tag-suffix: '-gpu-hipblas'
+            base-image: "rocm/dev-ubuntu-22.04:6.4.3"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'ubuntu-latest'
+            makeflags: "--jobs=3 --output-sync=target"
+            aio: "-aio-gpu-hipblas"
+
  core-image-build:
    uses: ./.github/workflows/image_build.yml
    with:
      tag-latest: ${{ matrix.tag-latest }}
      tag-suffix: ${{ matrix.tag-suffix }}
-      ffmpeg: ${{ matrix.ffmpeg }}
-      image-type: ${{ matrix.image-type }}
      build-type: ${{ matrix.build-type }}
      cuda-major-version: ${{ matrix.cuda-major-version }}
      cuda-minor-version: ${{ matrix.cuda-minor-version }}
      platforms: ${{ matrix.platforms }}
      runs-on: ${{ matrix.runs-on }}
+      aio: ${{ matrix.aio }}
      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      makeflags: ${{ matrix.makeflags }}
+      skip-drivers: ${{ matrix.skip-drivers }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      #max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64,linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: ''
+            base-image: "ubuntu:22.04"
+            runs-on: 'ubuntu-latest'
+            aio: "-aio-cpu"
+            makeflags: "--jobs=4 --output-sync=target"
+            skip-drivers: 'false'
+          - build-type: 'cublas'
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-11'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
+            skip-drivers: 'false'
+            aio: "-aio-gpu-nvidia-cuda-11"
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "8"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-nvidia-cuda-12'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            makeflags: "--jobs=4 --output-sync=target"
+            aio: "-aio-gpu-nvidia-cuda-12"
+          - build-type: 'vulkan'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-gpu-vulkan'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            skip-drivers: 'false'
+            makeflags: "--jobs=4 --output-sync=target"
+            aio: "-aio-gpu-vulkan"
+          - build-type: 'intel'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: '-gpu-intel'
+            runs-on: 'ubuntu-latest'
+            makeflags: "--jobs=3 --output-sync=target"
+            aio: "-aio-gpu-intel"
+
+  gh-runner:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      aio: ${{ matrix.aio }}
+      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      makeflags: ${{ matrix.makeflags }}
+      skip-drivers: ${{ matrix.skip-drivers }}
    secrets:
      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
@@ -207,51 +142,13 @@ jobs:
    strategy:
      matrix:
        include:
-          - build-type: ''
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            base-image: "ubuntu:22.04"
-            runs-on: 'ubuntu-latest'
-          - build-type: 'cublas'
-            cuda-major-version: "11"
-            cuda-minor-version: "7"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11-core'
-            ffmpeg: ''
-            image-type: 'core'
-            base-image: "ubuntu:22.04"
-            runs-on: 'ubuntu-latest'
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "1"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-core'
-            ffmpeg: ''
-            image-type: 'core'
-            base-image: "ubuntu:22.04"
-            runs-on: 'ubuntu-latest'
-          - build-type: 'cublas'
-            cuda-major-version: "11"
-            cuda-minor-version: "7"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
-            base-image: "ubuntu:22.04"
-          - build-type: 'cublas'
-            cuda-major-version: "12"
-            cuda-minor-version: "1"
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-ffmpeg-core'
-            ffmpeg: 'true'
-            image-type: 'core'
-            runs-on: 'ubuntu-latest'
-            base-image: "ubuntu:22.04"
+            cuda-minor-version: "8"
+            platforms: 'linux/arm64'
+            tag-latest: 'auto'
+            tag-suffix: '-nvidia-l4t-arm64'
+            base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+            runs-on: 'ubuntu-24.04-arm'
+            makeflags: "--jobs=4 --output-sync=target"
+            skip-drivers: 'true'
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -6,6 +6,10 @@ on:
    inputs:
      base-image:
        description: 'Base image'
+        required: true
+        type: string
+      grpc-base-image:
+        description: 'GRPC Base image, must be a compatible image with base-image'
        required: false
        default: ''
        type: string
@@ -15,11 +19,11 @@ on:
        type: string
      cuda-major-version:
        description: 'CUDA major version'
-        default: "11"
+        default: "12"
        type: string
      cuda-minor-version:
        description: 'CUDA minor version'
-        default: "7"
+        default: "4"
        type: string
      platforms:
        description: 'Platforms'
@@ -33,19 +37,25 @@ on:
        description: 'Tag suffix'
        default: ''
        type: string
-      ffmpeg:
-        description: 'FFMPEG'
-        default: ''
-        type: string
-      image-type:
-        description: 'Image type'
-        default: ''
+      skip-drivers:
+        description: 'Skip drivers by default'
+        default: 'false'
        type: string
      runs-on:
        description: 'Runs on'
        required: true
        default: ''
        type: string
+      makeflags:
+        description: 'Make Flags'
+        required: false
+        default: '--jobs=4 --output-sync=target'
+        type: string
+      aio:
+        description: 'AIO Image Name'
+        required: false
+        default: ''
+        type: string
    secrets:
      dockerUsername:
        required: true
@@ -59,6 +69,22 @@ jobs:
  reusable_image-build:
    runs-on: ${{ inputs.runs-on }}
    steps:
+
+      - name: Free Disk Space (Ubuntu)
+        if: inputs.runs-on == 'ubuntu-latest'
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
      - name: Force Install GIT latest
        run: |
          sudo apt-get update \
@@ -68,7 +94,8 @@ jobs:
          && sudo apt-get update \
          && sudo apt-get install -y git
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
+
      - name: Release space from worker
        if: inputs.runs-on == 'ubuntu-latest'
        run: |
@@ -79,8 +106,8 @@ jobs:
          df -h
          echo
          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
-          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
-          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools snapd || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools snapd || true
          sudo rm -rf /usr/local/lib/android
          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
          sudo rm -rf /usr/share/dotnet
@@ -110,8 +137,10 @@ jobs:
          sudo rm -rf "/usr/local/share/boost" || true
          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
          df -h
+
      - name: Docker meta
        id: meta
+        if: github.event_name != 'pull_request'
        uses: docker/metadata-action@v5
        with:
          images: |
@@ -121,9 +150,50 @@ jobs:
            type=ref,event=branch
            type=semver,pattern={{raw}}
            type=sha
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.tag-suffix }},onlatest=true
+      - name: Docker meta for PR
+        id: meta_pull_request
+        if: github.event_name == 'pull_request'
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/go-skynet/ci-tests
+          tags: |
+            type=ref,event=branch,suffix=localai${{ github.event.number }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
+            type=semver,pattern={{raw}},suffix=localai${{ github.event.number }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
+            type=sha,suffix=localai${{ github.event.number }}-${{ inputs.build-type }}-${{ inputs.cuda-major-version }}-${{ inputs.cuda-minor-version }}
          flavor: |
            latest=${{ inputs.tag-latest }}
            suffix=${{ inputs.tag-suffix }}
+      - name: Docker meta AIO (quay.io)
+        if: inputs.aio != ''
+        id: meta_aio
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/go-skynet/local-ai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.aio }},onlatest=true
+
+      - name: Docker meta AIO (dockerhub)
+        if: inputs.aio != ''
+        id: meta_aio_dockerhub
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            localai/localai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.aio }},onlatest=true

      - name: Set up QEMU
        uses: docker/setup-qemu-action@master
@@ -150,22 +220,94 @@ jobs:
          password: ${{ secrets.quayPassword }}

      - name: Build and push
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
+        if: github.event_name != 'pull_request'
        with:
          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          # This is why some build args like GRPC_VERSION and MAKEFLAGS are hardcoded
          build-args: |
            BUILD_TYPE=${{ inputs.build-type }}
            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
-            FFMPEG=${{ inputs.ffmpeg }}
-            IMAGE_TYPE=${{ inputs.image-type }}
            BASE_IMAGE=${{ inputs.base-image }}
+            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.65.0
+            MAKEFLAGS=${{ inputs.makeflags }}
+            SKIP_DRIVERS=${{ inputs.skip-drivers }}
          context: .
          file: ./Dockerfile
+          cache-from: type=gha
          platforms: ${{ inputs.platforms }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
+### Start testing image
+      - name: Build and push
+        uses: docker/build-push-action@v6
+        if: github.event_name == 'pull_request'
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          # This is why some build args like GRPC_VERSION and MAKEFLAGS are hardcoded
+          build-args: |
+            BUILD_TYPE=${{ inputs.build-type }}
+            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
+            BASE_IMAGE=${{ inputs.base-image }}
+            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.65.0
+            MAKEFLAGS=${{ inputs.makeflags }}
+            SKIP_DRIVERS=${{ inputs.skip-drivers }}
+          context: .
+          file: ./Dockerfile
+          cache-from: type=gha
+          platforms: ${{ inputs.platforms }}
+          #push: true
+          tags: ${{ steps.meta_pull_request.outputs.tags }}
+          labels: ${{ steps.meta_pull_request.outputs.labels }}
+## End testing image
+      - name: Build and push AIO image
+        if: inputs.aio != ''
+        uses: docker/build-push-action@v6
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BASE_IMAGE=quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
+            MAKEFLAGS=${{ inputs.makeflags }}
+          context: .
+          file: ./Dockerfile.aio
+          platforms: ${{ inputs.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta_aio.outputs.tags }}
+          labels: ${{ steps.meta_aio.outputs.labels }}
+
+      - name: Build and push AIO image (dockerhub)
+        if: inputs.aio != ''
+        uses: docker/build-push-action@v6
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BASE_IMAGE=localai/localai:${{ steps.meta.outputs.version }}
+            MAKEFLAGS=${{ inputs.makeflags }}
+          context: .
+          file: ./Dockerfile.aio
+          platforms: ${{ inputs.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
+          labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
+
      - name: job summary
        run: |
          echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
+
+      - name: job summary(AIO)
+        if: inputs.aio != ''
+        run: |
+          echo "Built image: ${{ steps.meta_aio.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,12 @@
+name: "Pull Request Labeler"
+on:
+- pull_request_target
+
+jobs:
+  labeler:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/labeler@v6
--- a/.github/workflows/localaibot_automerge.yml
+++ b/.github/workflows/localaibot_automerge.yml
@@ -0,0 +1,35 @@
+name: LocalAI-bot auto-merge
+on:
+- pull_request_target
+
+permissions:
+  contents: write
+  pull-requests: write
+  packages: read
+
+jobs:
+  dependabot:
+    runs-on: ubuntu-latest
+    if: ${{ github.actor == 'localai-bot' }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Approve a PR if not already approved
+        run: |
+          gh pr checkout "$PR_URL"
+            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
+          then
+            gh pr review --approve "$PR_URL"
+          else
+            echo "PR already approved.";
+          fi
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
+
+      - name: Enable auto-merge for LocalAIBot PRs
+        run: gh pr merge --auto --squash "$PR_URL"
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -0,0 +1,168 @@
+name: Notifications for new models
+on:
+  pull_request:
+     types:
+       - closed
+
+jobs:
+  notify-discord:
+    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
+    env:
+        MODEL_NAME: gemma-3-12b-it
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v5
+      with:
+        fetch-depth: 0 # needed to checkout all branches for this Action to work
+    - uses: mudler/localai-github-action@v1
+      with:
+        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+        # Check the PR diff using the current branch and the base branch of the PR
+    - uses: GrantBirki/git-diff-action@v2.8.1
+      id: git-diff-action
+      with:
+            json_diff_file_output: diff.json
+            raw_diff_file_output: diff.txt
+            file_output_only: "true"
+    - name: Summarize
+      env:
+        DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
+      id: summarize
+      run: |
+            input="$(cat $DIFF)"
+
+            # Define the LocalAI API endpoint
+            API_URL="http://localhost:8080/chat/completions"
+
+            # Create a JSON payload using jq to handle special characters
+            json_payload=$(jq -n --arg input "$input" '{
+            model: "'$MODEL_NAME'",
+            messages: [
+                {
+                role: "system",
+                content: "You are LocalAI-bot. Write a discord message to notify everyone about the new model from the git diff. Make it informal. An example can include: the URL of the model, the name, and a brief description of the model if exists. Also add an hint on how to install it in LocalAI and that can be browsed over https://models.localai.io. For example: local-ai run model_name_here"
+                },
+                {
+                role: "user",
+                content: $input
+                }
+            ]
+            }')
+
+            # Send the request to LocalAI
+            response=$(curl -s -X POST $API_URL \
+            -H "Content-Type: application/json" \
+            -d "$json_payload")
+
+            # Extract the summary from the response
+            summary="$(echo $response | jq -r '.choices[0].message.content')"
+
+            # Print the summary
+            #  -H "Authorization: Bearer $API_KEY" \
+            echo "Summary:"
+            echo "$summary"
+            echo "payload sent"
+            echo "$json_payload"
+            {
+                echo 'message<<EOF'
+                echo "$summary"
+                echo EOF
+              } >> "$GITHUB_OUTPUT"
+            docker logs --tail 10 local-ai
+    - name: Discord notification
+      env:
+        DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK_URL }}
+        DISCORD_USERNAME: "LocalAI-Bot"
+        DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
+      uses: Ilshidur/action-discord@master
+      with:
+        args: ${{ steps.summarize.outputs.message }}
+    - name: Setup tmate session if fails
+      if: ${{ failure() }}
+      uses: mxschmitt/action-tmate@v3.22
+      with:
+        detached: true
+        connect-timeout-seconds: 180
+        limit-access-to-actor: true
+  notify-twitter:
+    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
+    env:
+        MODEL_NAME: gemma-3-12b-it
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v5
+      with:
+        fetch-depth: 0 # needed to checkout all branches for this Action to work
+    - name: Start LocalAI
+      run: |
+        echo "Starting LocalAI..."
+        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master run --debug $MODEL_NAME
+        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
+      # Check the PR diff using the current branch and the base branch of the PR
+    - uses: GrantBirki/git-diff-action@v2.8.1
+      id: git-diff-action
+      with:
+            json_diff_file_output: diff.json
+            raw_diff_file_output: diff.txt
+            file_output_only: "true"
+    - name: Summarize
+      env:
+        DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
+      id: summarize
+      run: |
+            input="$(cat $DIFF)"
+
+            # Define the LocalAI API endpoint
+            API_URL="http://localhost:8080/chat/completions"
+
+            # Create a JSON payload using jq to handle special characters
+            json_payload=$(jq -n --arg input "$input" '{
+            model: "'$MODEL_NAME'",
+            messages: [
+                {
+                role: "system",
+                content: "You are LocalAI-bot. Write a twitter message to notify everyone about the new model from the git diff. Make it informal and really short. An example can include: the name, and a brief description of the model if exists. Also add an hint on how to install it in LocalAI. For example: local-ai run model_name_here"
+                },
+                {
+                role: "user",
+                content: $input
+                }
+            ]
+            }')
+
+            # Send the request to LocalAI
+            response=$(curl -s -X POST $API_URL \
+            -H "Content-Type: application/json" \
+            -d "$json_payload")
+
+            # Extract the summary from the response
+            summary="$(echo $response | jq -r '.choices[0].message.content')"
+
+            # Print the summary
+            #  -H "Authorization: Bearer $API_KEY" \
+            echo "Summary:"
+            echo "$summary"
+            echo "payload sent"
+            echo "$json_payload"
+            {
+                echo 'message<<EOF'
+                echo "$summary"
+                echo EOF
+              } >> "$GITHUB_OUTPUT"
+            docker logs --tail 10 local-ai
+    - uses: Eomm/why-don-t-you-tweet@v2
+      with:
+        tweet-message: ${{ steps.summarize.outputs.message }}
+      env:
+        # Get your tokens from https://developer.twitter.com/apps
+        TWITTER_CONSUMER_API_KEY: ${{ secrets.TWITTER_APP_KEY }}
+        TWITTER_CONSUMER_API_SECRET: ${{ secrets.TWITTER_APP_SECRET }}
+        TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }}
+        TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
+    - name: Setup tmate session if fails
+      if: ${{ failure() }}
+      uses: mxschmitt/action-tmate@v3.22
+      with:
+        detached: true
+        connect-timeout-seconds: 180
+        limit-access-to-actor: true
--- a/.github/workflows/notify-releases.yaml
+++ b/.github/workflows/notify-releases.yaml
@@ -0,0 +1,63 @@
+name: Release notifications
+on:
+  release:
+    types:
+      - published
+
+jobs:
+  notify-discord:
+    runs-on: ubuntu-latest
+    env:
+        RELEASE_BODY: ${{ github.event.release.body }}
+        RELEASE_TITLE: ${{ github.event.release.name }}
+        RELEASE_TAG_NAME: ${{ github.event.release.tag_name }}
+    steps:
+    - uses: mudler/localai-github-action@v1
+      with:
+        model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
+    - name: Summarize
+      id: summarize
+      run: |
+            input="$RELEASE_TITLE\b$RELEASE_BODY"
+
+            # Define the LocalAI API endpoint
+            API_URL="http://localhost:8080/chat/completions"
+
+            # Create a JSON payload using jq to handle special characters
+            json_payload=$(jq -n --arg input "$input" '{
+            model: "'$MODEL_NAME'",
+            messages: [
+                {
+                role: "system",
+                content: "Write a discord message with a bullet point summary of the release notes."
+                },
+                {
+                role: "user",
+                content: $input
+                }
+            ]
+            }')
+
+            # Send the request to LocalAI API
+            response=$(curl -s -X POST $API_URL \
+            -H "Content-Type: application/json" \
+            -d "$json_payload")
+
+            # Extract the summary from the response
+            summary=$(echo $response | jq -r '.choices[0].message.content')
+
+            # Print the summary
+            #  -H "Authorization: Bearer $API_KEY" \
+            {
+                echo 'message<<EOF'
+                echo "$summary"
+                echo EOF
+              } >> "$GITHUB_OUTPUT"
+    - name: Discord notification
+      env:
+        DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK_URL_RELEASE }}
+        DISCORD_USERNAME: "LocalAI-Bot"
+        DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
+      uses: Ilshidur/action-discord@master
+      with:
+        args: ${{ steps.summarize.outputs.message }}
--- a/.github/workflows/prlint.yaml
+++ b/.github/workflows/prlint.yaml
@@ -0,0 +1,28 @@
+name: Check PR style
+
+on:
+  pull_request_target:
+    types:
+      - opened
+      - reopened
+      - edited
+      - synchronize
+
+jobs:
+  title-lint:
+    runs-on: ubuntu-latest
+    permissions:
+      statuses: write
+    steps:
+      - uses: aslafy-z/conventional-pr-title-action@v3
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+#  check-pr-description:
+#    runs-on: ubuntu-latest
+#    steps:
+#      - uses: actions/checkout@v2
+#      - uses: jadrol/pr-description-checker-action@v1.0.0
+#        id: description-checker
+#        with:
+#          repo-token: ${{ secrets.GITHUB_TOKEN }}
+#          exempt-labels: no qa
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -1,161 +1,64 @@
-name: Build and Release
+name: goreleaser

-on: push
-
-permissions:
-  contents: write
-
-concurrency:
-  group: ci-releases-${{ github.head_ref || github.ref }}-${{ github.repository }}
-  cancel-in-progress: true
+on:
+  push:
+    tags:
+      - 'v*'

 jobs:
-  build-linux:
-    strategy:
-      matrix:
-        include:
-          - build: 'avx2'
-            defines: ''
-          - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
-          - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON'
-          - build: 'cuda12'
-            defines: ''
-          - build: 'cuda11'
-            defines: ''
+  goreleaser:
    runs-on: ubuntu-latest
    steps:
-      - name: Clone
-        uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v5
        with:
-          submodules: true
-      - uses: actions/setup-go@v4
+          fetch-depth: 0
+      - name: Set up Go
+        uses: actions/setup-go@v5
        with:
-          go-version: '>=1.21.0'
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-      - name: Install CUDA Dependencies
-        if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }}
-        run: |
-          if [ "${{ matrix.build }}" == "cuda12" ]; then
-            export CUDA_VERSION=12-3
-          else
-            export CUDA_VERSION=11-7
-          fi
-          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-          sudo dpkg -i cuda-keyring_1.1-1_all.deb
-          sudo apt-get update
-          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
-      - name: Cache grpc
-        id: cache-grpc
-        uses: actions/cache@v3
+          go-version: 1.23
+      - name: Run GoReleaser
+        uses: goreleaser/goreleaser-action@v6
        with:
-          path: grpc
-          key: ${{ runner.os }}-grpc
-      - name: Build grpc
-        if: steps.cache-grpc.outputs.cache-hit != 'true'
-        run: |
-          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-            -DgRPC_BUILD_TESTS=OFF \
-            ../.. && sudo make -j12
-      - name: Install gRPC
-        run: |
-          cd grpc && cd cmake/build && sudo make -j12 install
-      - name: Build
-        id: build
+          version: v2.11.0
+          args: release --clean
        env:
-          CMAKE_ARGS: "${{ matrix.defines }}"
-          BUILD_ID: "${{ matrix.build }}"
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  launcher-build-darwin:
+    runs-on: macos-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: 1.23
+      - name: Build launcher for macOS ARM64
        run: |
-          if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then
-            export BUILD_TYPE=cublas
-            export PATH=/usr/local/cuda/bin:$PATH
-            make dist
-          else
-            STATIC=true make dist
-          fi
-      - uses: actions/upload-artifact@v3
+          make build-launcher-darwin
+      - name: Upload DMG to Release
+        uses: softprops/action-gh-release@v2
        with:
-          name: ${{ matrix.build }}
-          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v1
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*
-
-  build-stablediffusion:
+          files: ./dist/LocalAI.dmg
+  launcher-build-linux:
    runs-on: ubuntu-latest
    steps:
-      - name: Clone
-        uses: actions/checkout@v4
+      - name: Checkout
+        uses: actions/checkout@v5
        with:
-          submodules: true
-      - uses: actions/setup-go@v4
+          fetch-depth: 0
+      - name: Set up Go
+        uses: actions/setup-go@v5
        with:
-          go-version: '>=1.21.0'
-      - name: Dependencies
+          go-version: 1.23
+      - name: Build launcher for Linux
        run: |
-          sudo apt-get install -y --no-install-recommends libopencv-dev
-          sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-      - name: Build stablediffusion
-        run: |
-          make backend-assets/grpc/stablediffusion
-          mkdir -p release && cp backend-assets/grpc/stablediffusion release
-      - uses: actions/upload-artifact@v3
+          sudo apt-get update
+          sudo apt-get install golang gcc libgl1-mesa-dev xorg-dev libxkbcommon-dev
+          make build-launcher-linux
+      - name: Upload Linux launcher artifacts
+        uses: softprops/action-gh-release@v2
        with:
-          name: stablediffusion
-          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v1
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*
-
-  build-macOS:
-    strategy:
-      matrix:
-        include:
-          - build: 'avx2'
-            defines: ''
-          - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
-          - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON'
-    runs-on: macOS-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with:
-          submodules: true
-      - uses: actions/setup-go@v4
-        with:
-          go-version: '>=1.21.0'
-      - name: Dependencies
-        run: |
-          brew install protobuf grpc
-      - name: Build
-        id: build
-        env:
-          CMAKE_ARGS: "${{ matrix.defines }}"
-          BUILD_ID: "${{ matrix.build }}"
-        run: |
-          export C_INCLUDE_PATH=/usr/local/include
-          export CPLUS_INCLUDE_PATH=/usr/local/include
-          make dist
-      - uses: actions/upload-artifact@v3
-        with:
-          name: ${{ matrix.build }}
-          path: release/
-      - name: Release
-        uses: softprops/action-gh-release@v1
-        if: startsWith(github.ref, 'refs/tags/')
-        with:
-          files: |
-            release/*
+          files: ./local-ai-launcher-linux.tar.xz
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -0,0 +1,30 @@
+name: "Security Scan"
+
+# Run workflow each time code is pushed to your repository and on a schedule.
+# The scheduled workflow runs every at 00:00 on Sunday UTC time.
+on:
+  push:
+  schedule:
+  - cron: '0 0 * * 0'
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    env:
+      GO111MODULE: on
+    steps:
+      - name: Checkout Source
+        uses: actions/checkout@v5
+        if: ${{ github.actor != 'dependabot[bot]' }}
+      - name: Run Gosec Security Scanner
+        if: ${{ github.actor != 'dependabot[bot]' }}
+        uses: securego/gosec@v2.22.8
+        with:
+          # we let the report trigger content trigger a failure using the GitHub Security features.
+          args: '-no-fail -fmt sarif -out results.sarif ./...'
+      - name: Upload SARIF file
+        if: ${{ github.actor != 'dependabot[bot]' }}
+        uses: github/codeql-action/upload-sarif@v3
+        with:
+          # Path to SARIF file relative to the root of the repository
+          sarif_file: results.sarif
--- a/.github/workflows/stalebot.yml
+++ b/.github/workflows/stalebot.yml
@@ -0,0 +1,24 @@
+name: 'Close stale issues and PRs'
+permissions:
+  issues: write
+  pull-requests: write
+on:
+  schedule:
+    - cron: '30 1 * * *'
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v9
+        with:
+          stale-issue-message: 'This issue is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
+          stale-pr-message: 'This PR is stale because it has been open 90 days with no activity. Remove stale label or comment or this will be closed in 10 days.'
+          close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.'
+          close-pr-message: 'This PR was closed because it has been stalled for 10 days with no activity.'
+          days-before-issue-stale: 90
+          days-before-pr-stale: 90
+          days-before-issue-close: 5
+          days-before-pr-close: 10
+          exempt-issue-labels: 'roadmap'
+          exempt-pr-labels: 'roadmap'
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -14,155 +14,133 @@ concurrency:
  cancel-in-progress: true

 jobs:
+  # Requires CUDA
+  # tests-chatterbox-tts:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v5
+  #       with:
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install build-essential ffmpeg
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user --no-cache-dir grpcio-tools==1.64.1
+
+  #     - name: Test chatterbox-tts
+  #       run: |
+  #          make --jobs=5 --output-sync=target -C backend/python/chatterbox
+  #          make --jobs=5 --output-sync=target -C backend/python/chatterbox test
  tests-transformers:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v4
-        with: 
+        uses: actions/checkout@v5
+        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user --no-cache-dir grpcio-tools==1.64.1

      - name: Test transformers
        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/transformers
-           make -C backend/python/transformers test
-
-  tests-sentencetransformers:
+           make --jobs=5 --output-sync=target -C backend/python/transformers
+           make --jobs=5 --output-sync=target -C backend/python/transformers test
+  tests-rerankers:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v4
-        with: 
+        uses: actions/checkout@v5
+        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user --no-cache-dir grpcio-tools==1.64.1

-      - name: Test sentencetransformers
+      - name: Test rerankers
        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/sentencetransformers
-           make -C backend/python/sentencetransformers test
+           make --jobs=5 --output-sync=target -C backend/python/rerankers
+           make --jobs=5 --output-sync=target -C backend/python/rerankers test

  tests-diffusers:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v4
-        with: 
+        uses: actions/checkout@v5
+        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-
+          sudo apt-get install -y build-essential ffmpeg
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test diffusers
        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/diffusers
-           make -C backend/python/diffusers test
+          make --jobs=5 --output-sync=target -C backend/python/diffusers
+          make --jobs=5 --output-sync=target -C backend/python/diffusers test

+  #tests-vllm:
+  #  runs-on: ubuntu-latest
+  #  steps:
+  #    - name: Clone
+  #      uses: actions/checkout@v5
+  #      with:
+  #        submodules: true
+  #    - name: Dependencies
+  #      run: |
+  #        sudo apt-get update
+  #        sudo apt-get install -y build-essential ffmpeg
+  #        sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #        sudo apt-get install -y libopencv-dev
+  #        # Install UV
+  #        curl -LsSf https://astral.sh/uv/install.sh | sh
+  #        pip install --user --no-cache-dir grpcio-tools==1.64.1
+  #    - name: Test vllm backend
+  #      run: |
+  #        make --jobs=5 --output-sync=target -C backend/python/vllm
+  #        make --jobs=5 --output-sync=target -C backend/python/vllm test
+  # tests-transformers-musicgen:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v5
+  #       with:
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install build-essential ffmpeg
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user --no-cache-dir grpcio-tools==1.64.1

-  tests-transformers-musicgen:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-
-      - name: Test transformers-musicgen
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/transformers-musicgen
-           make -C backend/python/transformers-musicgen test
-
-
-
-  tests-petals:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-
-      - name: Test petals
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/petals
-           make -C backend/python/petals test
-
-           
+  #     - name: Test transformers-musicgen
+  #       run: |
+  #          make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
+  #          make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test

  # tests-bark:
  #   runs-on: ubuntu-latest
@@ -208,110 +186,64 @@ jobs:
  #           sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
  #           df -h
  #     - name: Clone
-  #       uses: actions/checkout@v4
-  #       with: 
+  #       uses: actions/checkout@v5
+  #       with:
  #         submodules: true
  #     - name: Dependencies
  #       run: |
  #         sudo apt-get update
  #         sudo apt-get install build-essential ffmpeg
-  #         curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-  #            sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-  #             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-  #            sudo apt-get update && \
-  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
-  #         sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-  #         sudo rm -rfv /usr/bin/conda || true
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user --no-cache-dir grpcio-tools==1.64.1

  #     - name: Test bark
  #       run: |
-  #          export PATH=$PATH:/opt/conda/bin
-  #          make -C backend/python/bark
-  #          make -C backend/python/bark test
+  #          make --jobs=5 --output-sync=target -C backend/python/bark
+  #          make --jobs=5 --output-sync=target -C backend/python/bark test
+

-           
  # Below tests needs GPU. Commented out for now
  # TODO: Re-enable as soon as we have GPU nodes
  # tests-vllm:
  #   runs-on: ubuntu-latest
  #   steps:
  #     - name: Clone
-  #       uses: actions/checkout@v4
-  #       with: 
+  #       uses: actions/checkout@v5
+  #       with:
  #         submodules: true
  #     - name: Dependencies
  #       run: |
  #         sudo apt-get update
  #         sudo apt-get install build-essential ffmpeg
-  #         curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-  #            sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-  #             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-  #            sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-  #            sudo apt-get update && \
-  #            sudo apt-get install -y conda
-  #         sudo apt-get install -y ca-certificates cmake curl patch
-  #         sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-  #         sudo rm -rfv /usr/bin/conda || true
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user --no-cache-dir grpcio-tools==1.64.1
  #     - name: Test vllm
  #       run: |
-  #          export PATH=$PATH:/opt/conda/bin
-  #          make -C backend/python/vllm
-  #          make -C backend/python/vllm test
-  tests-vallex:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2    
-          sudo rm -rfv /usr/bin/conda || true
-      - name: Test vall-e-x
-        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/vall-e-x
-           make -C backend/python/vall-e-x test
+  #          make --jobs=5 --output-sync=target -C backend/python/vllm
+  #          make --jobs=5 --output-sync=target -C backend/python/vllm test

  tests-coqui:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v4
-        with: 
+        uses: actions/checkout@v5
+        with:
          submodules: true
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng          
-          sudo rm -rfv /usr/bin/conda || true
-
+          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          pip install --user --no-cache-dir grpcio-tools==1.64.1
      - name: Test coqui
        run: |
-           export PATH=$PATH:/opt/conda/bin
-           make -C backend/python/coqui
-           make -C backend/python/coqui test
+          make --jobs=5 --output-sync=target -C backend/python/coqui
+          make --jobs=5 --output-sync=target -C backend/python/coqui test
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -9,6 +9,9 @@ on:
    tags:
      - '*'

+env:
+  GRPC_VERSION: v1.65.0
+
 concurrency:
  group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
  cancel-in-progress: true
@@ -19,6 +22,116 @@ jobs:
    strategy:
      matrix:
        go-version: ['1.21.x']
+    steps:
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
+      - name: Clone
+        uses: actions/checkout@v5
+        with:
+          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Proto Dependencies
+        run: |
+          # Install protoc
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          PATH="$PATH:$HOME/go/bin" make protogen-go
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
+          sudo apt-get install -y libgmock-dev clang
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
+          sudo apt-get install -y libopencv-dev
+
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+
+          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
+          export CUDACXX=/usr/local/cuda/bin/nvcc
+
+
+          # The python3-grpc-tools package in 22.04 is too old
+          pip install --user grpcio-tools==1.71.0 grpcio==1.71.0
+
+          make -C backend/python/transformers
+
+          make backends/huggingface backends/llama-cpp backends/local-store backends/silero-vad backends/piper backends/whisper backends/stablediffusion-ggml
+        env:
+          CUDA_VERSION: 12-4
+      - name: Test
+        run: |
+          PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.22
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
+
+  tests-aio-container:
+    runs-on: ubuntu-latest
    steps:
      - name: Release space from worker
        run: |
@@ -53,81 +166,68 @@ jobs:
          sudo rm -rfv build || true
          df -h
      - name: Clone
-        uses: actions/checkout@v4
-        with: 
-          submodules: true
-      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v4
+        uses: actions/checkout@v5
        with:
-          go-version: ${{ matrix.go-version }}
-      # You can test your matrix by printing the current Go version
-      - name: Display Go version
-        run: go version
+          submodules: true
      - name: Dependencies
        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
-          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
-             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
-             sudo apt-get update && \
-             sudo apt-get install -y conda
-          sudo apt-get install -y ca-certificates cmake curl patch
-          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
-          
-          sudo rm -rfv /usr/bin/conda || true
-          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
-
-          # Pre-build piper before we start tests in order to have shared libraries in place
-          make sources/go-piper && \
-          GO_TAGS="tts" make -C sources/go-piper piper.o && \
-          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
-          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
-          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
-      - name: Cache grpc
-        id: cache-grpc
-        uses: actions/cache@v3
-        with:
-          path: grpc
-          key: ${{ runner.os }}-grpc
-      - name: Build grpc
-        if: steps.cache-grpc.outputs.cache-hit != 'true'
-        run: |
-          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-            -DgRPC_BUILD_TESTS=OFF \
-            ../.. && sudo make -j12
-      - name: Install gRPC
-        run: |
-          cd grpc && cd cmake/build && sudo make -j12 install
+          # Install protoc
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          PATH="$PATH:$HOME/go/bin" make protogen-go
      - name: Test
        run: |
-          GO_TAGS="stablediffusion tts" make test
+            PATH="$PATH:$HOME/go/bin" make backends/local-store backends/silero-vad backends/llama-cpp backends/whisper backends/piper backends/stablediffusion-ggml docker-build-aio e2e-aio
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.22
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true

  tests-apple:
-    runs-on: macOS-latest
+    runs-on: macOS-14
    strategy:
      matrix:
        go-version: ['1.21.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v4
-        with: 
+        uses: actions/checkout@v5
+        with:
          submodules: true
      - name: Setup Go ${{ matrix.go-version }}
-        uses: actions/setup-go@v4
+        uses: actions/setup-go@v5
        with:
          go-version: ${{ matrix.go-version }}
+          cache: false
      # You can test your matrix by printing the current Go version
      - name: Display Go version
        run: go version
      - name: Dependencies
        run: |
-          brew install protobuf grpc
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc libomp llvm
+          pip install --user --no-cache-dir grpcio-tools==1.71.0 grpcio==1.71.0
+      - name: Build llama-cpp-darwin
+        run: |
+          make protogen-go
+          make backends/llama-cpp-darwin
      - name: Test
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
-          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
+          export CC=/opt/homebrew/opt/llvm/bin/clang
+          # Used to run the newer GNUMake version from brew that supports --output-sync
+          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
+          PATH="$PATH:$HOME/go/bin" make protogen-go
+          PATH="$PATH:$HOME/go/bin" BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.22
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -0,0 +1,37 @@
+name: Update swagger
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  swagger:
+    strategy:
+      fail-fast: false
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - uses: actions/setup-go@v5
+        with:
+          go-version: 'stable'
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install protobuf-compiler
+      - run: |
+          go install github.com/swaggo/swag/cmd/swag@latest
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
+      - name: Bump swagger 🔧
+        run: |
+          make protogen-go swagger
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v7
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: 'feat(swagger): update swagger'
+          title: 'feat(swagger): update swagger'
+          branch: "update/swagger"
+          body:  Update swagger
+          signoff: true
+
--- a/.github/workflows/yaml-check.yml
+++ b/.github/workflows/yaml-check.yml
@@ -0,0 +1,26 @@
+name: 'Yamllint GitHub Actions'
+on:
+  - pull_request
+jobs:
+  yamllint:
+    name: 'Yamllint'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@master
+      - name: 'Yamllint model gallery'
+        uses: karancode/yamllint-github-action@master
+        with:
+          yamllint_file_or_dir: 'gallery'
+          yamllint_strict: false
+          yamllint_comment: true
+        env:
+          GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: 'Yamllint Backend gallery'
+        uses: karancode/yamllint-github-action@master
+        with:
+          yamllint_file_or_dir: 'backend'
+          yamllint_strict: false
+          yamllint_comment: true
+        env:
+          GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -2,21 +2,29 @@
 /sources/
 __pycache__/
 *.a
+*.o
 get-sources
 prepare-sources
-/backend/cpp/llama/grpc-server
-/backend/cpp/llama/llama.cpp
+/backend/cpp/llama-cpp/grpc-server
+/backend/cpp/llama-cpp/llama.cpp
+/backend/cpp/llama-*
+!backend/cpp/llama-cpp
+/backends
+/backend-images
+/result.yaml
+protoc
+
+*.log

 go-ggml-transformers
 go-gpt2
-go-rwkv
 whisper.cpp
 /bloomz
 go-bert

 # LocalAI build binary
 LocalAI
-local-ai
+/local-ai
 # prevent above rules from omitting the helm chart
 !charts/*
 # prevent above rules from omitting the api/localai folder
@@ -39,3 +47,18 @@ backend-assets/*
 !backend-assets/.keep
 prepare
 /ggml-metal.metal
+docs/static/gallery.html
+
+# Protobuf generated files
+*.pb.go
+*pb2.py
+*pb2_grpc.py
+
+# SonarQube
+.scannerwork
+
+# backend virtual environments
+**/venv
+
+# per-developer customization files for the development container
+.devcontainer/customization/*
--- a/.goreleaser.yaml
+++ b/.goreleaser.yaml
@@ -0,0 +1,33 @@
+version: 2
+before:
+  hooks:
+    - make protogen-go
+    - go mod tidy
+dist: release
+source:
+  enabled: true
+  name_template: '{{ .ProjectName }}-{{ .Tag }}-source'
+builds:
+  - main: ./cmd/local-ai
+    env:
+      - CGO_ENABLED=0
+    ldflags:
+      - -s -w
+      - -X "github.com/mudler/LocalAI/internal.Version={{ .Tag }}"
+      - -X "github.com/mudler/LocalAI/internal.Commit={{ .FullCommit }}"
+    goos:
+      - linux
+      - darwin
+      #- windows
+    goarch:
+      - amd64
+      - arm64
+archives:
+  - formats: [ 'binary' ] # this removes the tar of the archives, leaving the binaries alone
+    name_template: local-ai-{{ .Tag }}-{{ .Os }}-{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}
+checksum:
+  name_template: '{{ .ProjectName }}-{{ .Tag }}-checksums.txt'
+snapshot:
+  version_template: "{{ .Tag }}-next"
+changelog:
+  use: github-native
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@@ -0,0 +1,5 @@
+{
+    "recommendations": [
+        "golang.go"
+    ]
+}
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -3,12 +3,12 @@
    "configurations": [
        {
            "name": "Python: Current File",
-            "type": "python",
+            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "justMyCode": false,
-            "cwd": "${workspaceFolder}/examples/langchain-chroma",
+            "cwd": "${fileDirname}",
            "env": {
                "OPENAI_API_BASE": "http://localhost:8080/v1",
                "OPENAI_API_KEY": "abc"
@@ -19,15 +19,16 @@
            "type": "go",
            "request": "launch",
            "mode": "debug",
-            "program": "${workspaceFolder}/main.go",
-            "args": [
-                "api"
-            ],
+            "program": "${workspaceRoot}",
+            "args": [],
            "env": {
-                "C_INCLUDE_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
-                "LIBRARY_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
-                "DEBUG": "true"
-            }
+                "LOCALAI_LOG_LEVEL": "debug",
+                "LOCALAI_P2P": "true",
+                "LOCALAI_FEDERATED": "true"
+            },
+            "buildFlags": ["-tags", "", "-v"],
+            "envFile": "${workspaceFolder}/.env",
+            "cwd": "${workspaceRoot}"
        }
    ]
 }
--- a/.yamllint
+++ b/.yamllint
@@ -0,0 +1,4 @@
+extends: default
+
+rules:
+    line-length: disable
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-# Contributing to localAI
+# Contributing to LocalAI

 Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.

@@ -15,8 +15,6 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
 - [Documentation](#documentation)
 - [Community and Communication](#community-and-communication)

-
-
 ## Getting Started

 ### Prerequisites
@@ -29,8 +27,9 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time

 1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
 2. Navigate to the project directory: `cd LocalAI`
-3. Install the required dependencies: `make prepare`
-4. Run LocalAI: `make run`
+3. Install the required dependencies ( see https://localai.io/basics/build/#build-localai-locally )
+4. Build LocalAI: `make build`
+5. Run LocalAI: `./local-ai`

 ## Contributing

@@ -53,20 +52,33 @@ If you find a bug, have a feature request, or encounter any issues, please check

 ## Coding Guidelines

- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.

 ## Testing

 `make test` cannot handle all the model now. Please be sure to add a test case for the new features or the part was changed.

+### Running AIO tests
+
+All-In-One images has a set of tests that automatically verifies that most of the endpoints works correctly, a flow can be :
+
+```bash
+# Build the LocalAI docker image
+make DOCKER_IMAGE=local-ai docker
+
+# Build the corresponding AIO image
+BASE_IMAGE=local-ai DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
+
+# Run the AIO e2e tests
+LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio make run-e2e-aio
+```
+
 ## Documentation

- We are welcome the contribution of the documents, please open new PR in the official document repo [localai-website](https://github.com/go-skynet/localai-website)
-
+We are welcome the contribution of the documents, please open new PR or create a new issue. The documentation is available under `docs/` https://github.com/mudler/LocalAI/tree/master/docs
+ 
 ## Community and Communication

 - You can reach out via the Github issue tracker.
 - Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
 - Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
-
---
--- a/433
+++ b/433
@@ -1,236 +1,327 @@
-ARG IMAGE_TYPE=extras
 ARG BASE_IMAGE=ubuntu:22.04
+ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
+ARG INTEL_BASE_IMAGE=${BASE_IMAGE}

-# extras or core
-FROM ${BASE_IMAGE} as requirements-core
+FROM ${BASE_IMAGE} AS requirements

-USER root
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates curl wget espeak-ng libgomp1 \
+        ffmpeg libopenblas-base libopenblas-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# The requirements-drivers target is for BUILD_TYPE specific items.  If you need to install something specific to CUDA, or specific to ROCM, it goes here.
+FROM requirements AS requirements-drivers

-ARG GO_VERSION=1.21.7
 ARG BUILD_TYPE
-ARG CUDA_MAJOR_VERSION=11
-ARG CUDA_MINOR_VERSION=7
+ARG CUDA_MAJOR_VERSION=12
+ARG CUDA_MINOR_VERSION=8
+ARG SKIP_DRIVERS=false
+ARG TARGETARCH
+ARG TARGETVARIANT
+ENV BUILD_TYPE=${BUILD_TYPE}
+
+RUN mkdir -p /run/localai
+RUN echo "default" > /run/localai/capability
+
+# Vulkan requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        echo "vulkan" > /run/localai/capability
+    fi
+EOT
+
+# CuBLAS requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils
+        if [ "amd64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        fi
+        if [ "arm64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
+        fi
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+        rm -f cuda-keyring_1.1-1_all.deb && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        echo "nvidia" > /run/localai/capability
+    fi
+EOT
+
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "arm64" ]; then
+        echo "nvidia-l4t" > /run/localai/capability
+    fi
+EOT
+
+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            libclblast-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
+    ; fi
+
+RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            hipblas-dev \
+            rocblas-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        echo "amd" > /run/localai/capability && \
+        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
+        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
+        ldconfig \
+    ; fi
+
+RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
+    ln -s /opt/rocm-**/lib/llvm/lib/libomp.so /usr/lib/libomp.so \
+    ; fi
+
+RUN expr "${BUILD_TYPE}" = intel && echo "intel" > /run/localai/capability || echo "not intel"
+
+# Cuda
+ENV PATH=/usr/local/cuda/bin:${PATH}
+
+# HipBLAS requirements
+ENV PATH=/opt/rocm/bin:${PATH}
+
+###################################
+###################################
+
+# The requirements-core target is common to all images.  It should not be placed in requirements-core unless every single build will use it.
+FROM requirements-drivers AS build-requirements
+
+ARG GO_VERSION=1.22.6
+ARG CMAKE_VERSION=3.26.4
+ARG CMAKE_FROM_SOURCE=false
 ARG TARGETARCH
 ARG TARGETVARIANT

-ENV BUILD_TYPE=${BUILD_TYPE}
-ENV DEBIAN_FRONTEND=noninteractive
-ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh"
-
-ARG GO_TAGS="stablediffusion tinydream tts"

 RUN apt-get update && \
-    apt-get install -y ca-certificates curl patch pip cmake git && apt-get clean
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ccache \
+        ca-certificates espeak-ng \
+        curl libssl-dev \
+        git \
+        git-lfs \
+        unzip upx-ucl python3 python-is-python3 && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install CMake (the version in 22.04 is too old)
+RUN <<EOT bash
+    if [ "${CMAKE_FROM_SOURCE}" = "true" ]; then
+        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
+    else
+        apt-get update && \
+        apt-get install -y \
+            cmake && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT

 # Install Go
-RUN curl -L -s https://go.dev/dl/go$GO_VERSION.linux-$TARGETARCH.tar.gz | tar -C /usr/local -xz
-ENV PATH $PATH:/usr/local/go/bin
+RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
+ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin
+
+# Install grpc compilers
+RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates

+
+# OpenBLAS requirements and stable diffusion
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libopenblas-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN test -n "$TARGETARCH" \
+    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
+
 # Use the variables in subsequent instructions
 RUN echo "Target Architecture: $TARGETARCH"
 RUN echo "Target Variant: $TARGETVARIANT"

-# CuBLAS requirements
-RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
-    apt-get install -y software-properties-common && \
-    curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
-    dpkg -i cuda-keyring_1.1-1_all.deb && \
-    rm -f cuda-keyring_1.1-1_all.deb && \
-    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
-    ; fi

-# Cuda
-ENV PATH /usr/local/cuda/bin:${PATH}

-# HipBLAS requirements
-ENV PATH /opt/rocm/bin:${PATH}
-
-# OpenBLAS requirements and stable diffusion
-RUN apt-get install -y \
-    libopenblas-dev \
-    libopencv-dev \ 
-    && apt-get clean
-
-# Set up OpenCV
-RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

 WORKDIR /build

-RUN test -n "$TARGETARCH" \
-    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
-
-# Extras requirements
-FROM requirements-core as requirements-extras
-
-RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
-    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
-    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
-    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
-    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
-    apt-get update && \
-    apt-get install -y conda && apt-get clean
-
-ENV PATH="/root/.cargo/bin:${PATH}"
-RUN apt-get install -y python3-pip && apt-get clean
-RUN pip install --upgrade pip
-
-RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-RUN apt-get install -y espeak-ng espeak && apt-get clean
-
-RUN if [ ! -e /usr/bin/python ]; then \
-	  ln -s /usr/bin/python3 /usr/bin/python \
-    ; fi

 ###################################
 ###################################

-FROM requirements-${IMAGE_TYPE} as builder
+# Temporary workaround for Intel's repository to work correctly
+# https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/APT-Repository-not-working-signatures-invalid/m-p/1599436/highlight/true#M36143
+# This is a temporary workaround until Intel fixes their repository
+FROM ${INTEL_BASE_IMAGE} AS intel
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
+gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
+RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" > /etc/apt/sources.list.d/intel-graphics.list
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        intel-oneapi-runtime-libs && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*

-ARG GO_TAGS="stablediffusion tts"
+###################################
+###################################
+
+# The builder-base target has the arguments, variables, and copies shared between full builder images and the uncompiled devcontainer
+
+FROM build-requirements AS builder-base
+
+ARG GO_TAGS=""
 ARG GRPC_BACKENDS
-ARG BUILD_GRPC=true
+ARG MAKEFLAGS
+ARG LD_FLAGS="-s -w"
+ARG TARGETARCH
+ARG TARGETVARIANT
 ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
+ENV MAKEFLAGS=${MAKEFLAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
+ENV LD_FLAGS=${LD_FLAGS}
+
+RUN echo "GO_TAGS: $GO_TAGS" && echo "TARGETARCH: $TARGETARCH"
+
+WORKDIR /build
+
+
+# We need protoc installed, and the version in 22.04 is too old.
+RUN <<EOT bash
+    if [ "amd64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+    if [ "arm64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+EOT
+
+###################################
+###################################
+
+# Compile backends first in a separate stage
+FROM builder-base AS builder-backends
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+WORKDIR /build
+
+COPY ./Makefile .
+COPY ./backend ./backend
+COPY ./go.mod .
+COPY ./go.sum .
+COPY ./.git ./.git
+
+# Some of the Go backends use libs from the main src, we could further optimize the caching by building the CPP backends before here
+COPY ./pkg/grpc ./pkg/grpc
+COPY ./pkg/utils ./pkg/utils
+COPY ./pkg/langchain ./pkg/langchain
+
+RUN ls -l ./
+RUN make protogen-go
+
+# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
+# Adjustments to the build process should likely be made here.
+FROM builder-backends AS builder

 WORKDIR /build

 COPY . .
-COPY .git .
-RUN make prepare

-# If we are building with clblas support, we need the libraries for the builds
-RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
-    apt-get update && \
-    apt-get install -y libclblast-dev && \
-    apt-get clean \
-    ; fi
-
-# stablediffusion does not tolerate a newer version of abseil, build it first
-RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
-
-RUN if [ "${BUILD_GRPC}" = "true" ]; then \
-    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
-    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
-      -DgRPC_BUILD_TESTS=OFF \
-       ../.. && make -j12 install \
-    ; fi
-
-# Rebuild with defaults backends
+## Build the binary
+## If we're on arm64 AND using cublas/hipblas, skip some of the llama-compat backends to save space
+## Otherwise just run the normal build
 RUN make build

-RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
-    mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
-    touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
-    ; fi
+###################################
+###################################
+
+# The devcontainer target is not used on CI. It is a target for developers to use locally -
+# rather than copying files it mounts them locally and leaves building to the developer
+
+FROM builder-base AS devcontainer
+
+COPY .devcontainer-scripts /.devcontainer-scripts
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ssh less
+# For the devcontainer, leave apt functional in case additional devtools are needed at runtime.
+
+RUN go install github.com/go-delve/delve/cmd/dlv@latest
+
+RUN go install github.com/mikefarah/yq/v4@latest

 ###################################
 ###################################

-FROM requirements-${IMAGE_TYPE}
+# This is the final target. The result of this target will be the image uploaded to the registry.
+# If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
+FROM requirements-drivers

-ARG FFMPEG
-ARG BUILD_TYPE
-ARG TARGETARCH
-ARG IMAGE_TYPE=extras
-
-ENV BUILD_TYPE=${BUILD_TYPE}
-ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz

-ARG CUDA_MAJOR_VERSION=11
+ARG CUDA_MAJOR_VERSION=12
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
-ENV PIP_CACHE_PURGE=true

-# Add FFmpeg
-RUN if [ "${FFMPEG}" = "true" ]; then \
-    apt-get install -y ffmpeg && apt-get clean \
-    ; fi
+WORKDIR /

-# Add OpenCL
-RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
-    apt-get update && \
-    apt-get install -y libclblast1 && \
-    apt-get clean \
-    ; fi
-
-WORKDIR /build
-
-# we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
-# so when `entrypoint.sh` runs `make build` again (which it does by default), the build would fail
-# see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
-# https://github.com/go-skynet/LocalAI/pull/434
-COPY . .
-
-COPY --from=builder /build/sources ./sources/
-COPY --from=builder /build/grpc ./grpc/
-
-RUN make prepare-sources && cd /build/grpc/cmake/build && make install && rm -rf grpc
+COPY ./entrypoint.sh .

 # Copy the binary
 COPY --from=builder /build/local-ai ./

-# Copy shared libraries for piper
-COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
-
-# do not let stablediffusion rebuild (requires an older version of absl)
-COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
-
-## Duplicated from Makefile to avoid having a big layer that's hard to push
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/autogptq \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/bark \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/diffusers \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/vllm \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/mamba \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/sentencetransformers \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/transformers \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/vall-e-x \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/exllama \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-     make -C backend/python/exllama2 \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/petals \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/transformers-musicgen \
-    ; fi
-RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
-	 make -C backend/python/coqui \
-    ; fi
-
 # Make sure the models directory exists
-RUN mkdir -p /build/models
+RUN mkdir -p /models /backends

 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
-  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
+  CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1

+VOLUME /models /backends
 EXPOSE 8080
-ENTRYPOINT [ "/build/entrypoint.sh" ]
+ENTRYPOINT [ "/entrypoint.sh" ]
--- a/Dockerfile.aio
+++ b/Dockerfile.aio
@@ -0,0 +1,8 @@
+ARG BASE_IMAGE=ubuntu:22.04
+
+FROM ${BASE_IMAGE} 
+
+RUN apt-get update && apt-get install -y pciutils && apt-get clean
+
+COPY aio/ /aio
+ENTRYPOINT [ "/aio/entrypoint.sh" ]
--- a/5
+++ b/5
@@ -1,5 +0,0 @@
-VERSION 0.7
-
-build:
-    FROM DOCKERFILE -f Dockerfile .
-    SAVE ARTIFACT /usr/bin/local-ai AS LOCAL local-ai
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
+Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/805
+++ b/805
@@ -2,45 +2,15 @@ GOCMD=go
 GOTEST=$(GOCMD) test
 GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
+LAUNCHER_BINARY_NAME=local-ai-launcher

-# llama.cpp versions
-GOLLAMA_VERSION?=6a8041ef6b46d4712afc3ae791d1c2d73da0ad1c
-
-GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
-
-CPPLLAMA_VERSION?=4755afd1cbd40d93c017e5b98c39796f52345314
-
-# gpt4all version
-GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
-GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
-
-# go-rwkv version
-RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
-RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6
-
-# whisper.cpp version
-WHISPER_CPP_VERSION?=37a709f6558c6d9783199e2b8cbb136e1c41d346
-
-# bert.cpp version
-BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
-
-# go-piper version
-PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
-
-# stablediffusion version
-STABLEDIFFUSION_VERSION?=362df9da29f882dbf09ade61972d16a1f53c3485
-
-# tinydream version
-TINYDREAM_VERSION?=772a9c0d9aaf768290e63cca3c904fe69faf677a
+GORELEASER?=

 export BUILD_TYPE?=
-export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
-export CMAKE_ARGS?=

-CGO_LDFLAGS?=
-CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
-BUILD_ID?=git
+BUILD_ID?=
+NATIVE?=false

 TEST_DIR=/tmp/test

@@ -50,13 +20,13 @@ RANDOM := $(shell bash -c 'echo $$RANDOM')

 VERSION?=$(shell git describe --always --tags || echo "dev" )
 # go tool nm ./local-ai | grep Commit
-LD_FLAGS?=
-override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Version=$(VERSION)"
-override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"
+LD_FLAGS?=-s -w
+override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Version=$(VERSION)"
+override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"

 OPTIONAL_TARGETS?=

-OS := $(shell uname -s)
+export OS := $(shell uname -s)
 ARCH := $(shell uname -m)
 GREEN  := $(shell tput -Txterm setaf 2)
 YELLOW := $(shell tput -Txterm setaf 3)
@@ -72,285 +42,135 @@ UNAME_S := $(shell uname -s)
 endif

 ifeq ($(OS),Darwin)
-	CGO_LDFLAGS += -lcblas -framework Accelerate
 	ifeq ($(OSX_SIGNING_IDENTITY),)
 		OSX_SIGNING_IDENTITY := $(shell security find-identity -v -p codesigning | grep '"' | head -n 1 | sed -E 's/.*"(.*)"/\1/')
 	endif
-
-	# on OSX, if BUILD_TYPE is blank, we should default to use Metal
-	ifeq ($(BUILD_TYPE),)
-		BUILD_TYPE=metal
-	# disable metal if on Darwin and any other value is explicitly passed.
-	else ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DLLAMA_METAL=OFF
-	endif
 endif

-ifeq ($(BUILD_TYPE),openblas)
-	CGO_LDFLAGS+=-lopenblas
-	export WHISPER_OPENBLAS=1
+# check if goreleaser exists
+ifeq (, $(shell which goreleaser))
+	GORELEASER=curl -sfL https://goreleaser.com/static/run | bash -s --
+else
+	GORELEASER=$(shell which goreleaser)
 endif

-ifeq ($(BUILD_TYPE),cublas)
-	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
-	export LLAMA_CUBLAS=1
-	export WHISPER_CUBLAS=1
-endif
-
-ifeq ($(BUILD_TYPE),hipblas)
-	ROCM_HOME ?= /opt/rocm
-	ROCM_PATH ?= /opt/rocm
-	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
-	export CXX=$(ROCM_HOME)/llvm/bin/clang++
-	export CC=$(ROCM_HOME)/llvm/bin/clang
-	# llama-ggml has no hipblas support, so override it here.
-	export STABLE_BUILD_TYPE=
-	export WHISPER_HIPBLAS=1
-	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
-	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
-	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
-endif
-
-ifeq ($(BUILD_TYPE),metal)
-	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-	export LLAMA_METAL=1
-	export WHISPER_METAL=1
-endif
-
-ifeq ($(BUILD_TYPE),clblas)
-	CGO_LDFLAGS+=-lOpenCL -lclblast
-	export WHISPER_CLBLAST=1
-endif
-
-# glibc-static or glibc-devel-static required
-ifeq ($(STATIC),true)
-	LD_FLAGS=-linkmode external -extldflags -static
-endif
-
-ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
-#	OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
-	OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
-endif
-
-ifeq ($(findstring tinydream,$(GO_TAGS)),tinydream)
-#	OPTIONAL_TARGETS+=go-tiny-dream/libtinydream.a
-	OPTIONAL_GRPC+=backend-assets/grpc/tinydream
-endif
-
-ifeq ($(findstring tts,$(GO_TAGS)),tts)
-#	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
-#	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
-	PIPER_CGO_CXXFLAGS+=-I$(CURDIR)/sources/go-piper/piper/src/cpp -I$(CURDIR)/sources/go-piper/piper/build/fi/include -I$(CURDIR)/sources/go-piper/piper/build/pi/include -I$(CURDIR)/sources/go-piper/piper/build/si/include
-	PIPER_CGO_LDFLAGS+=-L$(CURDIR)/sources/go-piper/piper/build/fi/lib -L$(CURDIR)/sources/go-piper/piper/build/pi/lib -L$(CURDIR)/sources/go-piper/piper/build/si/lib -lfmt -lspdlog -lucd
-	OPTIONAL_GRPC+=backend-assets/grpc/piper
-endif
-
-ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface
-ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp
-ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
-ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
-ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
-ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
-ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
-
-GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
 TEST_PATHS?=./api/... ./pkg/... ./core/...

-# If empty, then we build all
-ifeq ($(GRPC_BACKENDS),)
-	GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
-endif
-
-ifeq ($(BUILD_API_ONLY),true)
-	GRPC_BACKENDS=
-endif

 .PHONY: all test build vendor

 all: help

-## GPT4ALL
-sources/gpt4all:
-	git clone --recurse-submodules $(GPT4ALL_REPO) sources/gpt4all
-	cd sources/gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1
-
-## go-piper
-sources/go-piper:
-	git clone --recurse-submodules https://github.com/mudler/go-piper sources/go-piper
-	cd sources/go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1
-
-## BERT embeddings
-sources/go-bert:
-	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp sources/go-bert
-	cd sources/go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1
-
-## stable diffusion
-sources/go-stable-diffusion:
-	git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion sources/go-stable-diffusion
-	cd sources/go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-stable-diffusion/libstablediffusion.a:
-	$(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
-
-## tiny-dream
-sources/go-tiny-dream:
-	git clone --recurse-submodules https://github.com/M0Rf30/go-tiny-dream sources/go-tiny-dream
-	cd sources/go-tiny-dream && git checkout -b build $(TINYDREAM_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-tiny-dream/libtinydream.a:
-	$(MAKE) -C sources/go-tiny-dream libtinydream.a
-
-## RWKV
-sources/go-rwkv:
-	git clone --recurse-submodules $(RWKV_REPO) sources/go-rwkv
-	cd sources/go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-rwkv/librwkv.a: sources/go-rwkv
-	cd sources/go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
-
-sources/go-bert/libgobert.a: sources/go-bert
-	$(MAKE) -C sources/go-bert libgobert.a
-
-backend-assets/gpt4all: sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
-	mkdir -p backend-assets/gpt4all
-	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
-	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
-	@cp sources/gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
-
-backend-assets/espeak-ng-data: sources/go-piper
-	mkdir -p backend-assets/espeak-ng-data
-	$(MAKE) -C sources/go-piper piper.o
-	@cp -rf sources/go-piper/piper-phonemize/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
-
-sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a: sources/gpt4all
-	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ libgpt4all.a
-
-sources/whisper.cpp:
-	git clone https://github.com/ggerganov/whisper.cpp.git sources/whisper.cpp
-	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
-	cd sources/whisper.cpp && make libwhisper.a
-
-sources/go-llama:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama
-	cd sources/go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-llama-ggml:
-	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp sources/go-llama-ggml
-	cd sources/go-llama-ggml && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1
-
-sources/go-llama/libbinding.a: sources/go-llama
-	$(MAKE) -C sources/go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
-
-sources/go-llama-ggml/libbinding.a: sources/go-llama-ggml
-	$(MAKE) -C sources/go-llama-ggml BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
-
-sources/go-piper/libpiper_binding.a: sources/go-piper
-	$(MAKE) -C sources/go-piper libpiper_binding.a example/main
-
-backend/cpp/llama/llama.cpp:
-	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
-
-get-sources: backend/cpp/llama/llama.cpp sources/go-llama sources/go-llama-ggml sources/gpt4all sources/go-piper sources/go-rwkv sources/whisper.cpp sources/go-bert sources/go-stable-diffusion sources/go-tiny-dream
-	touch $@
-
-replace:
-	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang
-	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(CURDIR)/sources/go-rwkv
-	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
-	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert
-	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
-	$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
-	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
-
-prepare-sources: get-sources replace
-	$(GOCMD) mod download
-	touch $@
-
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
-	$(MAKE) -C sources/go-llama clean
-	$(MAKE) -C sources/go-llama-ggml clean
-	$(MAKE) -C sources/gpt4all/gpt4all-bindings/golang/ clean
-	$(MAKE) -C sources/go-rwkv clean
-	$(MAKE) -C sources/whisper.cpp clean
-	$(MAKE) -C sources/go-stable-diffusion clean
-	$(MAKE) -C sources/go-bert clean
-	$(MAKE) -C sources/go-piper clean
-	$(MAKE) -C sources/go-tiny-dream clean
 	$(MAKE) build

-prepare: prepare-sources $(OPTIONAL_TARGETS)
-	touch $@
-
 clean: ## Remove build related file
 	$(GOCMD) clean -cache
 	rm -f prepare
-	rm -rf ./sources
 	rm -rf $(BINARY_NAME)
 	rm -rf release/
-	rm -rf backend-assets
-	$(MAKE) -C backend/cpp/grpc clean
-	$(MAKE) -C backend/cpp/llama clean
+	$(MAKE) protogen-clean
+	rmdir pkg/grpc/proto || true
+
+clean-tests:
+	rm -rf test-models
+	rm -rf test-dir
+
+## Install Go tools
+install-go-tools:
+	go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+	go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2

 ## Build:
-
-build: backend-assets grpcs prepare ## Build the project
+build: protogen-go install-go-tools ## Build the project
 	$(info ${GREEN}I local-ai build info:${RESET})
 	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
 	$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
+	$(info ${GREEN}I UPX: ${YELLOW}$(UPX)${RESET})
+	rm -rf $(BINARY_NAME) || true
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./cmd/local-ai

-dist: build
-	mkdir -p release
-	cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)
+build-launcher: ## Build the launcher application
+	$(info ${GREEN}I local-ai launcher build info:${RESET})
+	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
+	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
+	$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
+	rm -rf $(LAUNCHER_BINARY_NAME) || true
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(LAUNCHER_BINARY_NAME) ./cmd/launcher
+
+build-all: build build-launcher ## Build both server and launcher
+
+dev-dist:
+	$(GORELEASER) build --snapshot --clean
+
+dist:
+	$(GORELEASER) build --clean

 osx-signed: build
 	codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"

 ## Run
-run: prepare ## run local-ai
+run: ## run local-ai
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./

-test-models/testmodel:
-	mkdir test-models
-	mkdir test-dir
-	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel
+test-models/testmodel.ggml:
+	mkdir -p test-models
+	mkdir -p test-dir
+	wget -q https://huggingface.co/mradermacher/gpt2-alpaca-gpt4-GGUF/resolve/main/gpt2-alpaca-gpt4.Q4_K_M.gguf -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
-	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
-	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
 	cp tests/models_fixtures/* test-models

-prepare-test: grpcs
-	cp -rf backend-assets core/http
+prepare-test: protogen-go
 	cp tests/models_fixtures/* test-models

-test: prepare test-models/testmodel grpcs
+########################################################
+## Tests
+########################################################
+
+## Test targets
+test: test-models/testmodel.ggml protogen-go
 	@echo 'Running tests'
-	export GO_TAGS="tts stablediffusion"
+	export GO_TAGS="debug"
 	$(MAKE) prepare-test
-	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/sentencetransformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
-	$(MAKE) test-gpt4all
-	$(MAKE) test-llama
+	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion

+########################################################
+## AIO tests
+########################################################
+
+docker-build-aio:
+	docker build --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
+	BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test $(MAKE) docker-aio
+
+e2e-aio:
+	LOCALAI_BACKEND_DIR=$(abspath ./backends) \
+	LOCALAI_MODELS_DIR=$(abspath ./models) \
+	LOCALAI_IMAGE_TAG=test \
+	LOCALAI_IMAGE=local-ai-aio \
+	$(MAKE) run-e2e-aio
+
+run-e2e-aio: protogen-go
+	@echo 'Running e2e AIO tests'
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio
+
+########################################################
+## E2E tests
+########################################################
+
 prepare-e2e:
 	mkdir -p $(TEST_DIR)
 	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
 	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
-	docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
+	docker build --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=8 -t localai-tests .

 run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
@@ -360,36 +180,39 @@ test-e2e:
 	@echo 'Running e2e tests'
 	BUILD_TYPE=$(BUILD_TYPE) \
 	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e

 teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)

-test-gpt4all: prepare-test
-	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r $(TEST_PATHS)
-
-test-llama: prepare-test
-	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
+########################################################
+## Integration and unit tests
+########################################################

 test-llama-gguf: prepare-test
-	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

 test-tts: prepare-test
-	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

 test-stablediffusion: prepare-test
-	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
+
+test-stores:
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration

 test-container:
 	docker build --target requirements -t local-ai-test-container .
 	docker run -ti --rm --entrypoint /bin/bash -ti -v $(abspath ./):/build local-ai-test-container

+########################################################
+## Help
+########################################################
+
 ## Help:
 help: ## Show this help.
 	@echo ''
@@ -402,141 +225,70 @@ help: ## Show this help.
 		else if (/^## .*$$/) {printf "  ${CYAN}%s${RESET}\n", substr($$1,4)} \
 		}' $(MAKEFILE_LIST)

-protogen: protogen-go protogen-python
+########################################################
+## Backends
+########################################################

-protogen-go:
-	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
+.PHONY: protogen
+protogen: protogen-go
+
+protoc:
+	@OS_NAME=$$(uname -s | tr '[:upper:]' '[:lower:]'); \
+	ARCH_NAME=$$(uname -m); \
+	if [ "$$OS_NAME" = "darwin" ]; then \
+	  if [ "$$ARCH_NAME" = "arm64" ]; then \
+	    FILE=protoc-31.1-osx-aarch_64.zip; \
+	  elif [ "$$ARCH_NAME" = "x86_64" ]; then \
+	    FILE=protoc-31.1-osx-x86_64.zip; \
+	  else \
+	    echo "Unsupported macOS architecture: $$ARCH_NAME"; exit 1; \
+	  fi; \
+	elif [ "$$OS_NAME" = "linux" ]; then \
+	  if [ "$$ARCH_NAME" = "x86_64" ]; then \
+	    FILE=protoc-31.1-linux-x86_64.zip; \
+	  elif [ "$$ARCH_NAME" = "aarch64" ] || [ "$$ARCH_NAME" = "arm64" ]; then \
+	    FILE=protoc-31.1-linux-aarch_64.zip; \
+	  elif [ "$$ARCH_NAME" = "ppc64le" ]; then \
+	    FILE=protoc-31.1-linux-ppcle_64.zip; \
+	  elif [ "$$ARCH_NAME" = "s390x" ]; then \
+	    FILE=protoc-31.1-linux-s390_64.zip; \
+	  elif [ "$$ARCH_NAME" = "i386" ] || [ "$$ARCH_NAME" = "x86" ]; then \
+	    FILE=protoc-31.1-linux-x86_32.zip; \
+	  else \
+	    echo "Unsupported Linux architecture: $$ARCH_NAME"; exit 1; \
+	  fi; \
+	else \
+	  echo "Unsupported OS: $$OS_NAME"; exit 1; \
+	fi; \
+	URL=https://github.com/protocolbuffers/protobuf/releases/download/v31.1/$$FILE; \
+	curl -L -s $$URL -o protoc.zip && \
+	unzip -j -d $(CURDIR) protoc.zip bin/protoc && rm protoc.zip
+
+.PHONY: protogen-go
+protogen-go: protoc install-go-tools
+	mkdir -p pkg/grpc/proto
+	./protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
    backend/backend.proto

-protogen-python:
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/sentencetransformers/ --grpc_python_out=backend/python/sentencetransformers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers/ --grpc_python_out=backend/python/transformers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/transformers-musicgen/ --grpc_python_out=backend/python/transformers-musicgen/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/autogptq/ --grpc_python_out=backend/python/autogptq/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama/ --grpc_python_out=backend/python/exllama/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/bark/ --grpc_python_out=backend/python/bark/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/diffusers/ --grpc_python_out=backend/python/diffusers/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/coqui/ --grpc_python_out=backend/python/coqui/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vall-e-x/ --grpc_python_out=backend/python/vall-e-x/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/vllm/ --grpc_python_out=backend/python/vllm/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/petals/ --grpc_python_out=backend/python/petals/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/mamba/ --grpc_python_out=backend/python/mamba/ backend/backend.proto
-	python3 -m grpc_tools.protoc -Ibackend/ --python_out=backend/python/exllama2/ --grpc_python_out=backend/python/exllama2/ backend/backend.proto
+.PHONY: protogen-go-clean
+protogen-go-clean:
+	$(RM) pkg/grpc/proto/backend.pb.go pkg/grpc/proto/backend_grpc.pb.go
+	$(RM) bin/*

-## GRPC
-# Note: it is duplicated in the Dockerfile
-prepare-extra-conda-environments:
-	$(MAKE) -C backend/python/autogptq
-	$(MAKE) -C backend/python/bark
-	$(MAKE) -C backend/python/coqui
+prepare-test-extra: protogen-python
+	$(MAKE) -C backend/python/transformers
 	$(MAKE) -C backend/python/diffusers
+	$(MAKE) -C backend/python/chatterbox
 	$(MAKE) -C backend/python/vllm
-	$(MAKE) -C backend/python/mamba
-	$(MAKE) -C backend/python/sentencetransformers
-	$(MAKE) -C backend/python/transformers
-	$(MAKE) -C backend/python/transformers-musicgen
-	$(MAKE) -C backend/python/vall-e-x
-	$(MAKE) -C backend/python/exllama
-	$(MAKE) -C backend/python/petals
-	$(MAKE) -C backend/python/exllama2
-
-prepare-test-extra:
-	$(MAKE) -C backend/python/transformers
-	$(MAKE) -C backend/python/diffusers

 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
 	$(MAKE) -C backend/python/diffusers test
-
-backend-assets:
-	mkdir -p backend-assets
-ifeq ($(BUILD_API_ONLY),true)
-	touch backend-assets/keep
-endif
-
-backend-assets/grpc:
-	mkdir -p backend-assets/grpc
-
-backend-assets/grpc/llama: backend-assets/grpc sources/go-llama/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama LIBRARY_PATH=$(CURDIR)/sources/go-llama \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./backend/go/llm/llama/
-# TODO: every binary should have its own folder instead, so can have different  implementations
-
-## BACKEND CPP LLAMA START
-# Sets the variables in case it has to build the gRPC locally.
-INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
-INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
-ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
-                 -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
-                 -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
-                 -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
-                 -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
-
-backend/cpp/llama/grpc-server:
-ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
-	$(MAKE) -C backend/cpp/grpc build
-	export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto && \
-	export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
-	export PATH="${INSTALLED_PACKAGES}/bin:${PATH}" && \
-	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
-else
-	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
-	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
-endif
-## BACKEND CPP LLAMA END
-
-##
-backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
-	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
-# TODO: every binary should have its own folder instead, so can have different metal implementations
-ifeq ($(BUILD_TYPE),metal)
-	cp backend/cpp/llama/llama.cpp/build/bin/default.metallib backend-assets/grpc/
-endif
-
-backend-assets/grpc/llama-ggml: backend-assets/grpc sources/go-llama-ggml/libbinding.a
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama-ggml
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama-ggml LIBRARY_PATH=$(CURDIR)/sources/go-llama-ggml \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-
-backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
-
-backend-assets/grpc/rwkv: backend-assets/grpc sources/go-rwkv/librwkv.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv LIBRARY_PATH=$(CURDIR)/sources/go-rwkv \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
-
-backend-assets/grpc/bert-embeddings: backend-assets/grpc sources/go-bert/libgobert.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert LIBRARY_PATH=$(CURDIR)/sources/go-bert \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
-
-backend-assets/grpc/langchain-huggingface: backend-assets/grpc
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./backend/go/llm/langchain/
-
-backend-assets/grpc/stablediffusion: backend-assets/grpc
-	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
-		$(MAKE) sources/go-stable-diffusion; \
-		$(MAKE) sources/go-stable-diffusion/libstablediffusion.a; \
-		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-stable-diffusion/ LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
-		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion; \
-	fi
-
-backend-assets/grpc/tinydream: backend-assets/grpc sources/go-tiny-dream/libtinydream.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
-
-backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data sources/go-piper/libpiper_binding.a
-	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
-
-backend-assets/grpc/whisper: backend-assets/grpc sources/whisper.cpp/libwhisper.a
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
-	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
-
-grpcs: prepare $(GRPC_BACKENDS)
+	$(MAKE) -C backend/python/chatterbox test
+	$(MAKE) -C backend/python/vllm test

 DOCKER_IMAGE?=local-ai
+DOCKER_AIO_IMAGE?=local-ai-aio
 IMAGE_TYPE?=core
 BASE_IMAGE?=ubuntu:22.04

@@ -544,20 +296,255 @@ docker:
 	docker build \
 		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
-		--build-arg GO_TAGS=$(GO_TAGS) \
+		--build-arg GO_TAGS="$(GO_TAGS)" \
+		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
 		--build-arg BUILD_TYPE=$(BUILD_TYPE) \
 		-t $(DOCKER_IMAGE) .

+docker-cuda11:
+	docker build \
+		--build-arg CUDA_MAJOR_VERSION=11 \
+		--build-arg CUDA_MINOR_VERSION=8 \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS="$(GO_TAGS)" \
+		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
+		--build-arg BUILD_TYPE=$(BUILD_TYPE) \
+		-t $(DOCKER_IMAGE)-cuda-11 .
+
+docker-aio:
+	@echo "Building AIO image with base $(BASE_IMAGE) as $(DOCKER_AIO_IMAGE)"
+	docker build \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
+		-t $(DOCKER_AIO_IMAGE) -f Dockerfile.aio .
+
+docker-aio-all:
+	$(MAKE) docker-aio DOCKER_AIO_SIZE=cpu
+	$(MAKE) docker-aio DOCKER_AIO_SIZE=cpu
+
 docker-image-intel:
 	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
+		--build-arg BASE_IMAGE=quay.io/go-skynet/intel-oneapi-base:latest \
 		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
-		--build-arg GO_TAGS="none" \
-		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
+		--build-arg GO_TAGS="$(GO_TAGS)" \
+		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
+		--build-arg BUILD_TYPE=intel -t $(DOCKER_IMAGE) .

-docker-image-intel-xpu:
-	docker build \
-		--build-arg BASE_IMAGE=intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 \
-		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
-		--build-arg GO_TAGS="none" \
-		--build-arg BUILD_TYPE=sycl_f32 -t $(DOCKER_IMAGE) .
+########################################################
+## Backends
+########################################################
+
+
+backends/diffusers: docker-build-diffusers docker-save-diffusers build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/diffusers.tar)"
+
+backends/llama-cpp: docker-build-llama-cpp docker-save-llama-cpp build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)"
+
+backends/piper: docker-build-piper docker-save-piper build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/piper.tar)"
+
+backends/stablediffusion-ggml: docker-build-stablediffusion-ggml docker-save-stablediffusion-ggml build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/stablediffusion-ggml.tar)"
+
+backends/whisper: docker-build-whisper docker-save-whisper build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/whisper.tar)"
+
+backends/silero-vad: docker-build-silero-vad docker-save-silero-vad build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/silero-vad.tar)"
+
+backends/local-store: docker-build-local-store docker-save-local-store build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/local-store.tar)"
+
+backends/huggingface: docker-build-huggingface docker-save-huggingface build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/huggingface.tar)"
+
+backends/rfdetr: docker-build-rfdetr docker-save-rfdetr build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/rfdetr.tar)"
+
+backends/kitten-tts: docker-build-kitten-tts docker-save-kitten-tts build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/kitten-tts.tar)"
+
+backends/kokoro: docker-build-kokoro docker-save-kokoro build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/kokoro.tar)"
+
+backends/chatterbox: docker-build-chatterbox docker-save-chatterbox build
+	./local-ai backends install "ocifile://$(abspath ./backend-images/chatterbox.tar)"
+
+backends/llama-cpp-darwin: build
+	bash ./scripts/build/llama-cpp-darwin.sh
+	./local-ai backends install "ocifile://$(abspath ./backend-images/llama-cpp.tar)"
+
+build-darwin-python-backend: build
+	bash ./scripts/build/python-darwin.sh
+
+build-darwin-go-backend: build
+	bash ./scripts/build/golang-darwin.sh
+
+backends/mlx:
+	BACKEND=mlx $(MAKE) build-darwin-python-backend
+	./local-ai backends install "ocifile://$(abspath ./backend-images/mlx.tar)"
+
+backends/diffuser-darwin:
+	BACKEND=diffusers $(MAKE) build-darwin-python-backend
+	./local-ai backends install "ocifile://$(abspath ./backend-images/diffusers.tar)"
+
+backends/mlx-vlm:
+	BACKEND=mlx-vlm $(MAKE) build-darwin-python-backend
+	./local-ai backends install "ocifile://$(abspath ./backend-images/mlx-vlm.tar)"
+
+backends/mlx-audio:
+	BACKEND=mlx-audio $(MAKE) build-darwin-python-backend
+	./local-ai backends install "ocifile://$(abspath ./backend-images/mlx-audio.tar)"
+
+backends/stablediffusion-ggml-darwin:
+	BACKEND=stablediffusion-ggml BUILD_TYPE=metal $(MAKE) build-darwin-go-backend
+	./local-ai backends install "ocifile://$(abspath ./backend-images/stablediffusion-ggml.tar)"
+
+backend-images:
+	mkdir -p backend-images
+
+docker-build-llama-cpp:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:llama-cpp -f backend/Dockerfile.llama-cpp .
+
+docker-build-bark-cpp:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:bark-cpp -f backend/Dockerfile.golang --build-arg BACKEND=bark-cpp .
+
+docker-build-piper:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:piper -f backend/Dockerfile.golang --build-arg BACKEND=piper .
+
+docker-build-local-store:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:local-store -f backend/Dockerfile.golang --build-arg BACKEND=local-store .
+
+docker-build-huggingface:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:huggingface -f backend/Dockerfile.golang --build-arg BACKEND=huggingface .
+
+docker-build-rfdetr:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:rfdetr -f backend/Dockerfile.python --build-arg BACKEND=rfdetr ./backend
+
+docker-build-kitten-tts:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kitten-tts -f backend/Dockerfile.python --build-arg BACKEND=kitten-tts ./backend
+
+docker-save-kitten-tts: backend-images
+	docker save local-ai-backend:kitten-tts -o backend-images/kitten-tts.tar
+
+docker-build-kokoro:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:kokoro -f backend/Dockerfile.python --build-arg BACKEND=kokoro ./backend
+
+docker-save-kokoro: backend-images
+	docker save local-ai-backend:kokoro -o backend-images/kokoro.tar
+
+docker-save-rfdetr: backend-images
+	docker save local-ai-backend:rfdetr -o backend-images/rfdetr.tar
+
+docker-save-huggingface: backend-images
+	docker save local-ai-backend:huggingface -o backend-images/huggingface.tar
+
+docker-save-local-store: backend-images
+	docker save local-ai-backend:local-store -o backend-images/local-store.tar
+
+docker-build-silero-vad:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:silero-vad -f backend/Dockerfile.golang --build-arg BACKEND=silero-vad .
+
+docker-save-silero-vad: backend-images
+	docker save local-ai-backend:silero-vad -o backend-images/silero-vad.tar
+
+docker-save-piper: backend-images
+	docker save local-ai-backend:piper -o backend-images/piper.tar
+
+docker-save-llama-cpp: backend-images
+	docker save local-ai-backend:llama-cpp -o backend-images/llama-cpp.tar
+
+docker-save-bark-cpp: backend-images
+	docker save local-ai-backend:bark-cpp -o backend-images/bark-cpp.tar
+
+docker-build-stablediffusion-ggml:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:stablediffusion-ggml -f backend/Dockerfile.golang --build-arg BACKEND=stablediffusion-ggml .
+
+docker-save-stablediffusion-ggml: backend-images
+	docker save local-ai-backend:stablediffusion-ggml -o backend-images/stablediffusion-ggml.tar
+
+docker-build-rerankers:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:rerankers -f backend/Dockerfile.python --build-arg BACKEND=rerankers .
+
+docker-build-vllm:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:vllm -f backend/Dockerfile.python --build-arg BACKEND=vllm .
+
+docker-build-transformers:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:transformers -f backend/Dockerfile.python --build-arg BACKEND=transformers .
+
+docker-build-diffusers:
+	docker build --progress=plain --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:diffusers -f backend/Dockerfile.python --build-arg BACKEND=diffusers ./backend
+
+docker-save-diffusers: backend-images
+	docker save local-ai-backend:diffusers -o backend-images/diffusers.tar
+
+docker-build-whisper:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:whisper -f backend/Dockerfile.golang --build-arg BACKEND=whisper  .
+
+docker-save-whisper: backend-images
+	docker save local-ai-backend:whisper -o backend-images/whisper.tar
+
+docker-build-faster-whisper:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:faster-whisper -f backend/Dockerfile.python --build-arg BACKEND=faster-whisper .
+
+docker-build-coqui:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:coqui -f backend/Dockerfile.python --build-arg BACKEND=coqui .
+
+docker-build-bark:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:bark -f backend/Dockerfile.python --build-arg BACKEND=bark .
+
+docker-build-chatterbox:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:chatterbox -f backend/Dockerfile.python --build-arg BACKEND=chatterbox ./backend
+
+docker-build-exllama2:
+	docker build --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg BASE_IMAGE=$(BASE_IMAGE) -t local-ai-backend:exllama2 -f backend/Dockerfile.python --build-arg BACKEND=exllama2 .
+
+docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-exllama2
+
+########################################################
+### END Backends
+########################################################
+
+.PHONY: swagger
+swagger:
+	swag init -g core/http/app.go --output swagger
+
+.PHONY: gen-assets
+gen-assets:
+	$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
+
+## Documentation
+docs/layouts/_default:
+	mkdir -p docs/layouts/_default
+
+docs/static/gallery.html: docs/layouts/_default
+	$(GOCMD) run ./.github/ci/modelslist.go ./gallery/index.yaml > docs/static/gallery.html
+
+docs/public: docs/layouts/_default docs/static/gallery.html
+	cd docs && hugo --minify
+
+docs-clean:
+	rm -rf docs/public
+	rm -rf docs/static/gallery.html
+
+.PHONY: docs
+docs: docs/static/gallery.html
+	cd docs && hugo serve
+
+########################################################
+## Platform-specific builds
+########################################################
+
+## fyne cross-platform build
+build-launcher-darwin: build-launcher
+	go run github.com/tiagomelo/macos-dmg-creator/cmd/createdmg@latest \
+	--appName "LocalAI" \
+	--appBinaryPath "$(LAUNCHER_BINARY_NAME)" \
+	--bundleIdentifier "com.localai.launcher" \
+	--iconPath "core/http/static/logo.png" \
+	--outputDir "dist/"
+
+build-launcher-linux:
+	cd cmd/launcher && go run fyne.io/tools/cmd/fyne@latest package -os linux -icon ../../core/http/static/logo.png --executable $(LAUNCHER_BINARY_NAME)-linux && mv launcher.tar.xz ../../$(LAUNCHER_BINARY_NAME)-linux.tar.xz
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
 <h1 align="center">
  <br>
-  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
-    LocalAI
+  <img width="300" src="./core/http/static/logo.png"> <br>
 <br>
 </h1>

@@ -20,73 +19,280 @@
 </a>
 </p>

-[<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker">](https://hub.docker.com/r/localai/localai)
-[<img src="https://img.shields.io/badge/quay.io-images-important.svg?">](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest)
-
-> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
->
-> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
-
-[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)
+<p align="center">
+<a href="https://hub.docker.com/r/localai/localai" target="blank">
+<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker" alt="LocalAI Docker hub"/>
+</a>
+<a href="https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest" target="blank">
+<img src="https://img.shields.io/badge/quay.io-images-important.svg?" alt="LocalAI Quay.io"/>
+</a>
+</p>

 <p align="center">
 <a href="https://twitter.com/LocalAI_API" target="blank">
-<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
+<img src="https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white&label=LocalAI_API" alt="Follow LocalAI_API"/>
 </a>
 <a href="https://discord.gg/uJAeKSAGDy" target="blank">
 <img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
 </a>
+</p>

-**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU.
+<p align="center">
+<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+</p>

-## 🔥🔥 Hot topics / Roadmap
+> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
+>
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [🖼️ Models](https://models.localai.io/) [🚀 Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap) [🌍 Explorer](https://explorer.localai.io) [🛫 Examples](https://github.com/mudler/LocalAI-examples) Try on 
+[![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white)](https://t.me/localaiofficial_bot)

-[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
+[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

- Parallel function calling: https://github.com/mudler/LocalAI/pull/1726
- Upload file API: https://github.com/mudler/LocalAI/pull/1703
- Tools API support: https://github.com/mudler/LocalAI/pull/1715
- LLaVa 1.6: https://github.com/mudler/LocalAI/pull/1714
- ROCm container images: https://github.com/mudler/LocalAI/pull/1595
- Intel GPU support (sycl, transformers, diffusers): https://github.com/mudler/LocalAI/issues/1653
- Deprecation of old backends: https://github.com/mudler/LocalAI/issues/1651
- Mamba support: https://github.com/mudler/LocalAI/pull/1589
- Start and share models with config file: https://github.com/mudler/LocalAI/pull/1522
- 🐸 Coqui: https://github.com/mudler/LocalAI/pull/1489
- Img2vid https://github.com/mudler/LocalAI/pull/1442
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that's compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

-Hot topics (looking for contributors):
- Backends v2: https://github.com/mudler/LocalAI/issues/1126
- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
- Assistant API: https://github.com/mudler/LocalAI/issues/1273
- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
- Vulkan: https://github.com/mudler/LocalAI/issues/1647

-If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22
+## 📚🆕 Local Stack Family

-## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)
+🆕 LocalAI is now part of a comprehensive suite of AI tools designed to work together:

-For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide. For those in a hurry, here's a straightforward one-liner to launch a LocalAI instance with [phi-2](https://huggingface.co/microsoft/phi-2) using `docker`:
+<table>
+  <tr>
+    <td width="50%" valign="top">
+      <a href="https://github.com/mudler/LocalAGI">
+        <img src="https://raw.githubusercontent.com/mudler/LocalAGI/refs/heads/main/webui/react-ui/public/logo_2.png" width="300" alt="LocalAGI Logo">
+      </a>
+    </td>
+    <td width="50%" valign="top">
+      <h3><a href="https://github.com/mudler/LocalAGI">LocalAGI</a></h3>
+      <p>A powerful Local AI agent management platform that serves as a drop-in replacement for OpenAI's Responses API, enhanced with advanced agentic capabilities.</p>
+    </td>
+  </tr>
+  <tr>
+    <td width="50%" valign="top">
+      <a href="https://github.com/mudler/LocalRecall">
+        <img src="https://raw.githubusercontent.com/mudler/LocalRecall/refs/heads/main/static/localrecall_horizontal.png" width="300" alt="LocalRecall Logo">
+      </a>
+    </td>
+    <td width="50%" valign="top">
+      <h3><a href="https://github.com/mudler/LocalRecall">LocalRecall</a></h3>
+      <p>A REST-ful API and knowledge base management system that provides persistent memory and storage capabilities for AI agents.</p>
+    </td>
+  </tr>
+</table>

+## Screenshots
+
+
+| Talk Interface | Generate Audio |
+| --- | --- |
+| ![Screenshot 2025-03-31 at 12-01-36 LocalAI - Talk](./docs/assets/images/screenshots/screenshot_tts.png) | ![Screenshot 2025-03-31 at 12-01-29 LocalAI - Generate audio with voice-en-us-ryan-low](./docs/assets/images/screenshots/screenshot_tts.png) |
+
+| Models Overview | Generate Images |
+| --- | --- |
+| ![Screenshot 2025-03-31 at 12-01-20 LocalAI - Models](./docs/assets/images/screenshots/screenshot_gallery.png) | ![Screenshot 2025-03-31 at 12-31-41 LocalAI - Generate images with flux 1-dev](./docs/assets/images/screenshots/screenshot_image.png) |
+
+| Chat Interface | Home |
+| --- | --- |
+| ![Screenshot 2025-03-31 at 11-57-44 LocalAI - Chat with localai-functioncall-qwen2 5-7b-v0 5](./docs/assets/images/screenshots/screenshot_chat.png) | ![Screenshot 2025-03-31 at 11-57-23 LocalAI API - c2a39e3 (c2a39e3639227cfd94ffffe9f5691239acc275a8)](./docs/assets/images/screenshots/screenshot_home.png) |
+
+| Login | Swarm |
+| --- | --- |
+|![Screenshot 2025-03-31 at 12-09-59 ](./docs/assets/images/screenshots/screenshot_login.png) | ![Screenshot 2025-03-31 at 12-10-39 LocalAI - P2P dashboard](./docs/assets/images/screenshots/screenshot_p2p.png) |
+
+## 💻 Quickstart
+
+Run the installer script:
+
+```bash
+# Basic installation
+curl https://localai.io/install.sh | sh
 ```
-docker run -ti -p 8080:8080 localai/localai:v2.9.0-ffmpeg-core phi-2
+
+For more installation options, see [Installer Options](https://localai.io/docs/advanced/installer/).
+
+### macOS Download:
+
+<a href="https://github.com/mudler/LocalAI/releases/latest/download/LocalAI.dmg">
+  <img src="https://img.shields.io/badge/Download-macOS-blue?style=for-the-badge&logo=apple&logoColor=white" alt="Download LocalAI for macOS"/>
+</a>
+
+Or run with docker:
+
+### CPU only image:
+
+```bash
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
 ```

+### NVIDIA GPU Images:
+
+```bash
+# CUDA 12.0
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
+
+# CUDA 11.7
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11
+
+# NVIDIA Jetson (L4T) ARM64
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64
+```
+
+### AMD GPU Images (ROCm):
+
+```bash
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-gpu-hipblas
+```
+
+### Intel GPU Images (oneAPI):
+
+```bash
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/dri/card1 --device=/dev/dri/renderD128 localai/localai:latest-gpu-intel
+```
+
+### Vulkan GPU Images:
+
+```bash
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
+```
+
+### AIO Images (pre-downloaded models):
+
+```bash
+# CPU version
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
+
+# NVIDIA CUDA 12 version
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
+
+# NVIDIA CUDA 11 version
+docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-11
+
+# Intel GPU version
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-gpu-intel
+
+# AMD GPU version
+docker run -ti --name local-ai -p 8080:8080 --device=/dev/kfd --device=/dev/dri --group-add=video localai/localai:latest-aio-gpu-hipblas
+```
+
+For more information about the AIO images and pre-downloaded models, see [Container Documentation](https://localai.io/basics/container/).
+
+To load models:
+
+```bash
+# From the model gallery (see available models with `local-ai models list`, in the WebUI from the model tab, or visiting https://models.localai.io)
+local-ai run llama-3.2-1b-instruct:q4_k_m
+# Start LocalAI with the phi-2 model directly from huggingface
+local-ai run huggingface://TheBloke/phi-2-GGUF/phi-2.Q8_0.gguf
+# Install and run a model from the Ollama OCI registry
+local-ai run ollama://gemma:2b
+# Run a model from a configuration file
+local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
+# Install and run a model from a standard OCI registry (e.g., Docker Hub)
+local-ai run oci://localai/phi-2:latest
+```
+
+> ⚡ **Automatic Backend Detection**: When you install models from the gallery or YAML files, LocalAI automatically detects your system's GPU capabilities (NVIDIA, AMD, Intel) and downloads the appropriate backend. For advanced configuration options, see [GPU Acceleration](https://localai.io/features/gpu-acceleration/#automatic-backend-detection).
+
+For more information, see [💻 Getting started](https://localai.io/basics/getting_started/index.html)
+
+## 📰 Latest project news
+
+- August 2025: MLX, MLX-VLM, Diffusers and llama.cpp are now supported on Mac M1/M2/M3+ chips ( with `development` suffix in the gallery ): https://github.com/mudler/LocalAI/pull/6049 https://github.com/mudler/LocalAI/pull/6119 https://github.com/mudler/LocalAI/pull/6121 https://github.com/mudler/LocalAI/pull/6060
+- July/August 2025: 🔍 [Object Detection](https://localai.io/features/object-detection/) added to the API featuring [rf-detr](https://github.com/roboflow/rf-detr)
+- July 2025: All backends migrated outside of the main binary. LocalAI is now more lightweight, small, and automatically downloads the required backend to run the model. [Read the release notes](https://github.com/mudler/LocalAI/releases/tag/v3.2.0)
+- June 2025: [Backend management](https://github.com/mudler/LocalAI/pull/5607) has been added. Attention: extras images are going to be deprecated from the next release! Read [the backend management PR](https://github.com/mudler/LocalAI/pull/5607).
+- May 2025: [Audio input](https://github.com/mudler/LocalAI/pull/5466) and [Reranking](https://github.com/mudler/LocalAI/pull/5396) in llama.cpp backend, [Realtime API](https://github.com/mudler/LocalAI/pull/5392),  Support to Gemma, SmollVLM, and more multimodal models (available in the gallery).
+- May 2025: Important: image name changes [See release](https://github.com/mudler/LocalAI/releases/tag/v2.29.0)
+- Apr 2025: Rebrand, WebUI enhancements
+- Apr 2025: [LocalAGI](https://github.com/mudler/LocalAGI) and [LocalRecall](https://github.com/mudler/LocalRecall) join the LocalAI family stack.
+- Apr 2025: WebUI overhaul, AIO images updates
+- Feb 2025: Backend cleanup, Breaking changes, new backends (kokoro, OutelTTS, faster-whisper), Nvidia L4T images
+- Jan 2025: LocalAI model release: https://huggingface.co/mudler/LocalAI-functioncall-phi-4-v0.3, SANA support in diffusers: https://github.com/mudler/LocalAI/pull/4603
+- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
+- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
+- Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
+- Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
+- Aug 2024:  🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
+- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723. P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
+- May 2024: 🔥🔥 Decentralized P2P llama.cpp:  https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs  https://localai.io/features/distribute/
+- May 2024: 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
+- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121
+
+Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
+
 ## 🚀 [Features](https://localai.io/features/)

- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 🧩 [Backend Gallery](https://localai.io/backends/): Install/remove backends on the fly, powered by OCI images — fully customizable and API-driven.
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `transformers`, `vllm` ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
 - 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
 - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🔥 [OpenAI functions](https://localai.io/features/openai-functions/) 🆕
+- 🎨 [Image generation](https://localai.io/features/image-generation)
+- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/) 
 - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
 - 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
- 🆕 [Vision API](https://localai.io/features/gpt-vision/)
+- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
+- 🔍 [Object Detection](https://localai.io/features/object-detection/)
+- 📈 [Reranker API](https://localai.io/features/reranker/)
+- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
+- [Agentic capabilities](https://github.com/mudler/LocalAGI)
+- 🔊 Voice activity detection (Silero-VAD support)
+- 🌍 Integrated WebUI!

-## 💻 Usage
+## 🧩 Supported Backends & Acceleration

-Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.
+LocalAI supports a comprehensive range of AI backends with multiple acceleration options:
+
+### Text Generation & Language Models
+| Backend | Description | Acceleration Support |
+|---------|-------------|---------------------|
+| **llama.cpp** | LLM inference in C/C++ | CUDA 11/12, ROCm, Intel SYCL, Vulkan, Metal, CPU |
+| **vLLM** | Fast LLM inference with PagedAttention | CUDA 12, ROCm, Intel |
+| **transformers** | HuggingFace transformers framework | CUDA 11/12, ROCm, Intel, CPU |
+| **exllama2** | GPTQ inference library | CUDA 12 |
+| **MLX** | Apple Silicon LLM inference | Metal (M1/M2/M3+) |
+| **MLX-VLM** | Apple Silicon Vision-Language Models | Metal (M1/M2/M3+) |
+
+### Audio & Speech Processing
+| Backend | Description | Acceleration Support |
+|---------|-------------|---------------------|
+| **whisper.cpp** | OpenAI Whisper in C/C++ | CUDA 12, ROCm, Intel SYCL, Vulkan, CPU |
+| **faster-whisper** | Fast Whisper with CTranslate2 | CUDA 12, ROCm, Intel, CPU |
+| **bark** | Text-to-audio generation | CUDA 12, ROCm, Intel |
+| **bark-cpp** | C++ implementation of Bark | CUDA, Metal, CPU |
+| **coqui** | Advanced TTS with 1100+ languages | CUDA 12, ROCm, Intel, CPU |
+| **kokoro** | Lightweight TTS model | CUDA 12, ROCm, Intel, CPU |
+| **chatterbox** | Production-grade TTS | CUDA 11/12, CPU |
+| **piper** | Fast neural TTS system | CPU |
+| **kitten-tts** | Kitten TTS models | CPU |
+| **silero-vad** | Voice Activity Detection | CPU |
+
+### Image & Video Generation
+| Backend | Description | Acceleration Support |
+|---------|-------------|---------------------|
+| **stablediffusion.cpp** | Stable Diffusion in C/C++ | CUDA 12, Intel SYCL, Vulkan, CPU |
+| **diffusers** | HuggingFace diffusion models | CUDA 11/12, ROCm, Intel, Metal, CPU |
+
+### Specialized AI Tasks
+| Backend | Description | Acceleration Support |
+|---------|-------------|---------------------|
+| **rfdetr** | Real-time object detection | CUDA 12, Intel, CPU |
+| **rerankers** | Document reranking API | CUDA 11/12, ROCm, Intel, CPU |
+| **local-store** | Vector database | CPU |
+| **huggingface** | HuggingFace API integration | API-based |
+
+### Hardware Acceleration Matrix
+
+| Acceleration Type | Supported Backends | Hardware Support |
+|-------------------|-------------------|------------------|
+| **NVIDIA CUDA 11** | llama.cpp, whisper, stablediffusion, diffusers, rerankers, bark, chatterbox | Nvidia hardware |
+| **NVIDIA CUDA 12** | All CUDA-compatible backends | Nvidia hardware |
+| **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, bark | AMD Graphics |
+| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, exllama2, coqui, kokoro, bark | Intel Arc, Intel iGPUs |
+| **Apple Metal** | llama.cpp, whisper, diffusers, MLX, MLX-VLM, bark-cpp | Apple M1/M2/M3+ |
+| **Vulkan** | llama.cpp, whisper, stablediffusion | Cross-platform GPUs |
+| **NVIDIA Jetson** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI |
+| **CPU Optimized** | All backends | AVX/AVX2/AVX512, quantization support |

 ### 🔗 Community and integrations

@@ -96,24 +302,35 @@ Build and deploy custom containers:
 WebUIs:
 - https://github.com/Jirubizu/localai-admin
 - https://github.com/go-skynet/LocalAI-frontend
+- QA-Pilot(An interactive chat project that leverages LocalAI LLMs for rapid understanding and navigation of GitHub code repository) https://github.com/reid41/QA-Pilot

 Model galleries
 - https://github.com/go-skynet/model-gallery

+Voice:
+- https://github.com/richiejp/VoxInput
+
 Other:
 - Helm chart https://github.com/go-skynet/helm-charts
 - VSCode extension https://github.com/badgooooor/localai-vscode-plugin
+- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
+- Terminal utility https://github.com/djcopley/ShellOracle
 - Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation
+- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
 - Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
 - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
+- Shell-Pilot(Interact with LLM using LocalAI models via pure shell scripts on your Linux or MacOS system) https://github.com/reid41/shell-pilot
 - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
+- Another Telegram Bot https://github.com/JackBekket/Hellper
+- Auto-documentation https://github.com/JackBekket/Reflexia
+- Github bot which answer on issues, with code and documentation as context https://github.com/JackBekket/GitHelper
+- Github Actions: https://github.com/marketplace/actions/start-localai
 - Examples: https://github.com/mudler/LocalAI/tree/master/examples/
  

 ### 🔗 Resources

- 🆕 New! [LLM finetuning guide](https://localai.io/docs/advanced/fine-tuning/)
+- [LLM finetuning guide](https://localai.io/docs/advanced/fine-tuning/)
 - [How to build locally](https://localai.io/basics/build/index.html)
 - [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
 - [Projects integrating LocalAI](https://localai.io/docs/integrations/)
@@ -121,7 +338,9 @@ Other:

 ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/ai/answers/tiZMDoZzZV6TLxgDXNBnFE/deploying-helm-charts-on-aws-eks)
+- [Run Visual studio code with LocalAI (SUSE)](https://www.suse.com/c/running-ai-locally/)
+- 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/)
+- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
 - [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
 - [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
 - [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
@@ -148,17 +367,16 @@ If you utilize this repository, data in a downstream project, please consider ci

 Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.

-A huge thank you to our generous sponsors who support this project:
+A huge thank you to our generous sponsors who support this project covering CI expenses, and our [Sponsor list](https://github.com/sponsors/mudler):

-| ![Spectro Cloud logo_600x600px_transparent bg](https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512) |
-|:-----------------------------------------------:|
-|  [Spectro Cloud](https://www.spectrocloud.com/)  |
-|  Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs!  |
-
-And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.
-
- [Sponsor list](https://github.com/sponsors/mudler)
- JDAM00 (donating HW for the CI)
+<p align="center">
+  <a href="https://www.spectrocloud.com/" target="blank">
+    <img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
+  </a>
+  <a href="https://www.premai.io/" target="blank">
+    <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
+  </a>
+</p>

 ## 🌟 Star history

@@ -168,7 +386,7 @@ And a huge shout-out to individuals sponsoring the project by donating hardware

 LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).

-MIT - Author Ettore Di Giacinto
+MIT - Author Ettore Di Giacinto <mudler@localai.io>

 ## 🙇 Acknowledgements

@@ -180,7 +398,6 @@ LocalAI couldn't have been built without the help of great software already avai
 - https://github.com/antimatter15/alpaca.cpp
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/ggerganov/whisper.cpp
- https://github.com/saharNooby/rwkv.cpp
 - https://github.com/rhasspy/piper

 ## 🤗 Contributors
--- a/aio/cpu/README.md
+++ b/aio/cpu/README.md
@@ -0,0 +1,5 @@
+## AIO CPU size
+
+Use this image with CPU-only.
+
+Please keep using only C++ backends so the base image is as small as possible (without CUDA, cuDNN, python, etc).
--- a/embedded/models/all-minilm-l6-v2.yaml
+++ b/embedded/models/all-minilm-l6-v2.yaml
@@ -1,13 +1,13 @@
-name: all-minilm-l6-v2
-backend: sentencetransformers
 embeddings: true
+name: text-embedding-ada-002
+backend: llama-cpp
 parameters:
-  model: all-MiniLM-L6-v2
+  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf

 usage: |
    You can test this model with curl like this:

    curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
      "input": "Your text string goes here",
-      "model": "all-minilm-l6-v2"
+      "model": "text-embedding-ada-002"
    }'
--- a/aio/cpu/image-gen.yaml
+++ b/aio/cpu/image-gen.yaml
@@ -0,0 +1,23 @@
+name: stablediffusion
+backend: stablediffusion-ggml
+cfg_scale: 4.5
+
+options:
+- sampler:euler
+parameters:
+  model: stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf
+step: 25
+
+download_files:
+- filename: "stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
+  sha256: "b8944e9fe0b69b36ae1b5bb0185b3a7b8ef14347fe0fa9af6c64c4829022261f"
+  uri: "huggingface://second-state/stable-diffusion-v1-5-GGUF/stable-diffusion-v1-5-pruned-emaonly-Q4_0.gguf"
+
+usage: |
+        curl http://localhost:8080/v1/images/generations \
+          -H "Content-Type: application/json" \
+          -d '{
+            "prompt": "<positive prompt>|<negative prompt>",
+            "step": 25,
+            "size": "512x512"
+          }'
--- a/aio/cpu/rerank.yaml
+++ b/aio/cpu/rerank.yaml
@@ -0,0 +1,33 @@
+name: jina-reranker-v1-base-en
+reranking: true
+f16: true
+parameters:
+  model: jina-reranker-v1-tiny-en.f16.gguf
+backend: llama-cpp
+download_files:
+  - filename: jina-reranker-v1-tiny-en.f16.gguf
+    sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
+    uri: huggingface://mradermacher/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en.f16.gguf 
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/embedded/models/whisper-base.yaml
+++ b/embedded/models/whisper-base.yaml
@@ -1,4 +1,4 @@
-name: whisper
+name: whisper-1
 backend: whisper
 parameters:
  model: ggml-whisper-base.bin
@@ -10,7 +10,7 @@ usage: |
    ## Send the example audio file to the transcriptions endpoint
    curl http://localhost:8080/v1/audio/transcriptions \
         -H "Content-Type: multipart/form-data" \
-         -F file="@$PWD/gb1.ogg" -F model="whisper"
+         -F file="@$PWD/gb1.ogg" -F model="whisper-1"

 download_files:
 - filename: "ggml-whisper-base.bin"
--- a/aio/cpu/text-to-speech.yaml
+++ b/aio/cpu/text-to-speech.yaml
@@ -0,0 +1,15 @@
+name: tts-1
+download_files:
+  - filename: voice-en-us-amy-low.tar.gz
+    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
+backend: piper
+parameters:
+  model: en-us-amy-low.onnx
+
+usage: |
+    To test if this model works as expected, you can use the following curl command:
+
+    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+      "model":"voice-en-us-amy-low",
+      "input": "Hi, this is a test."
+    }'
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -0,0 +1,58 @@
+context_size: 8192
+f16: true
+backend: llama-cpp
+function:
+  grammar:
+    no_mixed_free_string: true
+    schema_type: llama3.1 # or JSON is supported too (json)
+  response_regex:
+  - <function=(?P<name>\w+)>(?P<arguments>.*)</function>
+mmap: true
+name: gpt-4
+parameters:
+  model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- <|eot_id|>
+- <|end_of_text|>
+template:
+  chat: |
+    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+    You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
+    {{.Input }}
+    <|start_header_id|>assistant<|end_header_id|>
+  chat_message: |
+    <|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
+    {{ if .FunctionCall -}}
+    {{ else if eq .RoleName "tool" -}}
+    The Function was executed and the response was:
+    {{ end -}}
+    {{ if .Content -}}
+    {{.Content -}}
+    {{ else if .FunctionCall -}}
+    {{ range .FunctionCall }}
+    [{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
+    {{ end }}
+    {{ end -}}
+    <|eot_id|>
+  completion: |
+    {{.Input}}
+  function: |
+    <|start_header_id|>system<|end_header_id|>
+    You are an expert in composing functions. You are given a question and a set of possible functions.
+    Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
+    If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
+    If you decide to invoke any of the function(s), you MUST put it in the format as follows:
+    [func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
+    You SHOULD NOT include any other text in the response.
+    Here is a list of functions in JSON format that you can invoke.
+    {{toJson .Functions}}
+    <|eot_id|><|start_header_id|>user<|end_header_id|>
+    {{.Input}}
+    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+download_files:
+- filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
+  sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5
+  uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf
--- a/aio/cpu/vad.yaml
+++ b/aio/cpu/vad.yaml
@@ -0,0 +1,8 @@
+backend: silero-vad
+name: silero-vad
+parameters:
+  model: silero-vad.onnx
+download_files:
+- filename: silero-vad.onnx
+  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
+  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -0,0 +1,50 @@
+context_size: 4096
+f16: true
+backend: llama-cpp
+mmap: true
+mmproj: minicpm-v-4_5-mmproj-f16.gguf
+name: gpt-4o
+parameters:
+  model: minicpm-v-4_5-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+- <|endoftext|>
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+  chat_message: |
+    <|im_start|>{{ .RoleName }}
+    {{ if .FunctionCall -}}
+    Function call:
+    {{ else if eq .RoleName "tool" -}}
+    Function response:
+    {{ end -}}
+    {{ if .Content -}}
+    {{.Content }}
+    {{ end -}}
+    {{ if .FunctionCall -}}
+    {{toJson .FunctionCall}}
+    {{ end -}}<|im_end|>
+  completion: |
+    {{.Input}}
+  function: |
+    <|im_start|>system
+    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    For each function call return a json object with function name and arguments
+    <|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
+
+download_files:
+- filename: minicpm-v-4_5-Q4_K_M.gguf
+  sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-4_5-mmproj-f16.gguf
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
+  sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+
+echo "===> LocalAI All-in-One (AIO) container starting..."
+
+GPU_ACCELERATION=false
+GPU_VENDOR=""
+
+function check_intel() {
+    if lspci | grep -E 'VGA|3D' | grep -iq intel; then
+        echo "Intel GPU detected"
+        if [ -d /opt/intel ]; then
+            GPU_ACCELERATION=true
+            GPU_VENDOR=intel
+        else
+            echo "Intel GPU detected, but Intel GPU drivers are not installed. GPU acceleration will not be available."
+        fi
+    fi
+}
+
+function check_nvidia_wsl() {
+    if lspci | grep -E 'VGA|3D' | grep -iq "Microsoft Corporation Device 008e"; then
+        # We make the assumption this WSL2 cars is NVIDIA, then check for nvidia-smi
+        # Make sure the container was run with `--gpus all` as the only required parameter
+        echo "NVIDIA GPU detected via WSL2"
+        # nvidia-smi should be installed in the container
+        if nvidia-smi; then
+            GPU_ACCELERATION=true
+            GPU_VENDOR=nvidia
+        else
+            echo "NVIDIA GPU detected via WSL2, but nvidia-smi is not installed. GPU acceleration will not be available."
+        fi
+    fi
+}
+
+function check_amd() {
+    if lspci | grep -E 'VGA|3D' | grep -iq amd; then
+        echo "AMD GPU detected"
+        # Check if ROCm is installed
+        if [ -d /opt/rocm ]; then
+            GPU_ACCELERATION=true
+            GPU_VENDOR=amd
+        else
+            echo "AMD GPU detected, but ROCm is not installed. GPU acceleration will not be available."
+        fi
+    fi
+}
+
+function check_nvidia() {
+    if lspci | grep -E 'VGA|3D' | grep -iq nvidia; then
+        echo "NVIDIA GPU detected"
+        # nvidia-smi should be installed in the container
+        if nvidia-smi; then
+            GPU_ACCELERATION=true
+            GPU_VENDOR=nvidia
+        else
+            echo "NVIDIA GPU detected, but nvidia-smi is not installed. GPU acceleration will not be available."
+        fi
+    fi
+}
+
+function check_metal() {
+    if system_profiler SPDisplaysDataType | grep -iq 'Metal'; then
+        echo "Apple Metal supported GPU detected"
+        GPU_ACCELERATION=true
+        GPU_VENDOR=apple
+    fi
+}
+
+function detect_gpu() {
+    case "$(uname -s)" in
+        Linux)
+            check_nvidia
+            check_amd
+            check_intel
+            check_nvidia_wsl
+            ;;
+        Darwin)
+            check_metal
+            ;;
+    esac
+}
+
+function detect_gpu_size() {
+    # Attempting to find GPU memory size for NVIDIA GPUs
+    if [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "nvidia" ]; then
+        echo "NVIDIA GPU detected. Attempting to find memory size..."
+        # Using head -n 1 to get the total memory of the 1st NVIDIA GPU detected.
+        # If handling multiple GPUs is required in the future, this is the place to do it
+        nvidia_sm=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n 1)
+        if [ ! -z "$nvidia_sm" ]; then
+            echo "Total GPU Memory: $nvidia_sm MiB"
+            # if bigger than 8GB, use 16GB
+            #if [ "$nvidia_sm" -gt 8192 ]; then
+            #    GPU_SIZE=gpu-16g
+            #else
+            GPU_SIZE=gpu-8g
+            #fi
+        else
+            echo "Unable to determine NVIDIA GPU memory size. Falling back to CPU."
+            GPU_SIZE=gpu-8g
+        fi
+    elif [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "intel" ]; then
+        GPU_SIZE=intel
+    # Default to a generic GPU size until we implement GPU size detection for non NVIDIA GPUs
+    elif [ "$GPU_ACCELERATION" = true ]; then
+        echo "Non-NVIDIA GPU detected. Specific GPU memory size detection is not implemented."
+        GPU_SIZE=gpu-8g
+
+    # default to cpu if GPU_SIZE is not set
+    else
+        echo "GPU acceleration is not enabled or supported. Defaulting to CPU."
+        GPU_SIZE=cpu
+    fi
+}
+
+function check_vars() {
+    if [ -z "$MODELS" ]; then
+        echo "MODELS environment variable is not set. Please set it to a comma-separated list of model YAML files to load."
+        exit 1
+    fi
+
+    if [ -z "$PROFILE" ]; then
+        echo "PROFILE environment variable is not set. Please set it to one of the following: cpu, gpu-8g, gpu-16g, apple"
+        exit 1
+    fi
+}
+
+detect_gpu
+detect_gpu_size
+
+PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
+
+check_vars
+
+echo "===> Starting LocalAI[$PROFILE] with the following models: $MODELS"
+
+exec /entrypoint.sh "$@"
--- a/aio/gpu-8g/embeddings.yaml
+++ b/aio/gpu-8g/embeddings.yaml
@@ -0,0 +1,13 @@
+embeddings: true
+name: text-embedding-ada-002
+backend: llama-cpp
+parameters:
+  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
+      "input": "Your text string goes here",
+      "model": "text-embedding-ada-002"
+    }'
--- a/aio/gpu-8g/image-gen.yaml
+++ b/aio/gpu-8g/image-gen.yaml
@@ -0,0 +1,25 @@
+name: stablediffusion
+parameters:
+  model: DreamShaper_8_pruned.safetensors
+backend: diffusers
+step: 25
+f16: true
+
+diffusers:
+  pipeline_type: StableDiffusionPipeline
+  cuda: true
+  enable_parameters: "negative_prompt,num_inference_steps"
+  scheduler_type: "k_dpmpp_2m"
+
+download_files:
+- filename: DreamShaper_8_pruned.safetensors
+  uri: huggingface://Lykon/DreamShaper/DreamShaper_8_pruned.safetensors
+
+usage: |
+        curl http://localhost:8080/v1/images/generations \
+          -H "Content-Type: application/json" \
+          -d '{
+            "prompt": "<positive prompt>|<negative prompt>",
+            "step": 25,
+            "size": "512x512"
+          }'
--- a/aio/gpu-8g/rerank.yaml
+++ b/aio/gpu-8g/rerank.yaml
@@ -0,0 +1,33 @@
+name: jina-reranker-v1-base-en
+reranking: true
+f16: true
+parameters:
+  model: jina-reranker-v1-tiny-en.f16.gguf
+backend: llama-cpp
+download_files:
+  - filename: jina-reranker-v1-tiny-en.f16.gguf
+    sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
+    uri: huggingface://mradermacher/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en.f16.gguf 
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/gpu-8g/speech-to-text.yaml
+++ b/aio/gpu-8g/speech-to-text.yaml
@@ -0,0 +1,18 @@
+name: whisper-1
+backend: whisper
+parameters:
+  model: ggml-whisper-base.bin
+
+usage: |
+    ## example audio file
+    wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
+
+    ## Send the example audio file to the transcriptions endpoint
+    curl http://localhost:8080/v1/audio/transcriptions \
+         -H "Content-Type: multipart/form-data" \
+         -F file="@$PWD/gb1.ogg" -F model="whisper-1"
+
+download_files:
+- filename: "ggml-whisper-base.bin"
+  sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
+  uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
--- a/embedded/models/rhasspy-voice-en-us-amy.yaml
+++ b/embedded/models/rhasspy-voice-en-us-amy.yaml
@@ -1,13 +1,15 @@
-name: voice-en-us-amy-low
+name: tts-1
 download_files:
  - filename: voice-en-us-amy-low.tar.gz
    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
-
+backend: piper
+parameters:
+  model: en-us-amy-low.onnx

 usage: |
    To test if this model works as expected, you can use the following curl command:

    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
-      "model":"en-us-amy-low.onnx",
+      "model":"tts-1",
      "input": "Hi, this is a test."
    }'
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -0,0 +1,54 @@
+context_size: 4096
+f16: true
+backend: llama-cpp
+function:
+  capture_llm_results:
+  - (?s)<Thought>(.*?)</Thought>
+  grammar:
+    properties_order: name,arguments
+  json_regex_match:
+  - (?s)<Output>(.*?)</Output>
+  replace_llm_results:
+  - key: (?s)<Thought>(.*?)</Thought>
+    value: ""
+mmap: true
+name: gpt-4
+parameters:
+  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+  chat_message: |
+    <|im_start|>{{ .RoleName }}
+    {{ if .FunctionCall -}}
+    Function call:
+    {{ else if eq .RoleName "tool" -}}
+    Function response:
+    {{ end -}}
+    {{ if .Content -}}
+    {{.Content }}
+    {{ end -}}
+    {{ if .FunctionCall -}}
+    {{toJson .FunctionCall}}
+    {{ end -}}<|im_end|>
+  completion: |
+    {{.Input}}
+  function: |
+    <|im_start|>system
+    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    <|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
+
+download_files:
+- filename: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+  sha256: 4e7b7fe1d54b881f1ef90799219dc6cc285d29db24f559c8998d1addb35713d4
+  uri: huggingface://mudler/LocalAI-functioncall-qwen2.5-7b-v0.5-Q4_K_M-GGUF/localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
--- a/aio/gpu-8g/vad.yaml
+++ b/aio/gpu-8g/vad.yaml
@@ -0,0 +1,8 @@
+backend: silero-vad
+name: silero-vad
+parameters:
+  model: silero-vad.onnx
+download_files:
+- filename: silero-vad.onnx
+  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
+  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -0,0 +1,50 @@
+context_size: 4096
+backend: llama-cpp
+f16: true
+mmap: true
+mmproj: minicpm-v-4_5-mmproj-f16.gguf
+name: gpt-4o
+parameters:
+  model: minicpm-v-4_5-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+- <|endoftext|>
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+  chat_message: |
+    <|im_start|>{{ .RoleName }}
+    {{ if .FunctionCall -}}
+    Function call:
+    {{ else if eq .RoleName "tool" -}}
+    Function response:
+    {{ end -}}
+    {{ if .Content -}}
+    {{.Content }}
+    {{ end -}}
+    {{ if .FunctionCall -}}
+    {{toJson .FunctionCall}}
+    {{ end -}}<|im_end|>
+  completion: |
+    {{.Input}}
+  function: |
+    <|im_start|>system
+    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    For each function call return a json object with function name and arguments
+    <|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
+
+download_files:
+- filename: minicpm-v-4_5-Q4_K_M.gguf
+  sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-4_5-mmproj-f16.gguf
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
+  sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8
--- a/aio/intel/embeddings.yaml
+++ b/aio/intel/embeddings.yaml
@@ -0,0 +1,13 @@
+embeddings: true
+name: text-embedding-ada-002
+backend: llama-cpp
+parameters:
+  model: huggingface://bartowski/granite-embedding-107m-multilingual-GGUF/granite-embedding-107m-multilingual-f16.gguf
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
+      "input": "Your text string goes here",
+      "model": "text-embedding-ada-002"
+    }'
--- a/aio/intel/image-gen.yaml
+++ b/aio/intel/image-gen.yaml
@@ -0,0 +1,20 @@
+name: stablediffusion
+parameters:
+  model: Lykon/dreamshaper-8
+backend: diffusers
+step: 25
+f16: true
+diffusers:
+  pipeline_type: StableDiffusionPipeline
+  cuda: true
+  enable_parameters: "negative_prompt,num_inference_steps"
+  scheduler_type: "k_dpmpp_2m"
+
+usage: |
+        curl http://localhost:8080/v1/images/generations \
+          -H "Content-Type: application/json" \
+          -d '{
+            "prompt": "<positive prompt>|<negative prompt>",
+            "step": 25,
+            "size": "512x512"
+          }'
--- a/aio/intel/rerank.yaml
+++ b/aio/intel/rerank.yaml
@@ -0,0 +1,33 @@
+name: jina-reranker-v1-base-en
+reranking: true
+f16: true
+parameters:
+  model: jina-reranker-v1-tiny-en.f16.gguf
+backend: llama-cpp
+download_files:
+  - filename: jina-reranker-v1-tiny-en.f16.gguf
+    sha256: 5f696cf0d0f3d347c4a279eee8270e5918554cdac0ed1f632f2619e4e8341407
+    uri: huggingface://mradermacher/jina-reranker-v1-tiny-en-GGUF/jina-reranker-v1-tiny-en.f16.gguf 
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/intel/speech-to-text.yaml
+++ b/aio/intel/speech-to-text.yaml
@@ -0,0 +1,18 @@
+name: whisper-1
+backend: whisper
+parameters:
+  model: ggml-whisper-base.bin
+
+usage: |
+    ## example audio file
+    wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
+
+    ## Send the example audio file to the transcriptions endpoint
+    curl http://localhost:8080/v1/audio/transcriptions \
+         -H "Content-Type: multipart/form-data" \
+         -F file="@$PWD/gb1.ogg" -F model="whisper-1"
+
+download_files:
+- filename: "ggml-whisper-base.bin"
+  sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
+  uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
--- a/aio/intel/text-to-speech.yaml
+++ b/aio/intel/text-to-speech.yaml
@@ -0,0 +1,15 @@
+name: tts-1
+download_files:
+  - filename: voice-en-us-amy-low.tar.gz
+    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
+backend: piper
+parameters:
+  model: en-us-amy-low.onnx
+
+usage: |
+    To test if this model works as expected, you can use the following curl command:
+
+    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+      "model":"tts-1",
+      "input": "Hi, this is a test."
+    }'
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -0,0 +1,54 @@
+context_size: 4096
+f16: true
+backend: llama-cpp
+function:
+  capture_llm_results:
+  - (?s)<Thought>(.*?)</Thought>
+  grammar:
+    properties_order: name,arguments
+  json_regex_match:
+  - (?s)<Output>(.*?)</Output>
+  replace_llm_results:
+  - key: (?s)<Thought>(.*?)</Thought>
+    value: ""
+mmap: true
+name: gpt-4
+parameters:
+  model: localai-functioncall-qwen2.5-7b-v0.5-q4_k_m.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+  chat_message: |
+    <|im_start|>{{ .RoleName }}
+    {{ if .FunctionCall -}}
+    Function call:
+    {{ else if eq .RoleName "tool" -}}
+    Function response:
+    {{ end -}}
+    {{ if .Content -}}
+    {{.Content }}
+    {{ end -}}
+    {{ if .FunctionCall -}}
+    {{toJson .FunctionCall}}
+    {{ end -}}<|im_end|>
+  completion: |
+    {{.Input}}
+  function: |
+    <|im_start|>system
+    You are an AI assistant that executes function calls, and these are the tools at your disposal:
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    <|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
+
+download_files:
+- filename: localai-functioncall-phi-4-v0.3-q4_k_m.gguf
+  sha256: 23fee048ded2a6e2e1a7b6bbefa6cbf83068f194caa9552aecbaa00fec8a16d5
+  uri: huggingface://mudler/LocalAI-functioncall-phi-4-v0.3-Q4_K_M-GGUF/localai-functioncall-phi-4-v0.3-q4_k_m.gguf
--- a/aio/intel/vad.yaml
+++ b/aio/intel/vad.yaml
@@ -0,0 +1,8 @@
+backend: silero-vad
+name: silero-vad
+parameters:
+  model: silero-vad.onnx
+download_files:
+- filename: silero-vad.onnx
+  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
+  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -0,0 +1,51 @@
+context_size: 4096
+backend: llama-cpp
+f16: true
+mmap: true
+mmproj: minicpm-v-4_5-mmproj-f16.gguf
+name: gpt-4o
+parameters:
+  model: minicpm-v-4_5-Q4_K_M.gguf
+stopwords:
+- <|im_end|>
+- <dummy32000>
+- </s>
+- <|endoftext|>
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+  chat_message: |
+    <|im_start|>{{ .RoleName }}
+    {{ if .FunctionCall -}}
+    Function call:
+    {{ else if eq .RoleName "tool" -}}
+    Function response:
+    {{ end -}}
+    {{ if .Content -}}
+    {{.Content }}
+    {{ end -}}
+    {{ if .FunctionCall -}}
+    {{toJson .FunctionCall}}
+    {{ end -}}<|im_end|>
+  completion: |
+    {{.Input}}
+  function: |
+    <|im_start|>system
+    You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    For each function call return a json object with function name and arguments
+    <|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
+
+
+download_files:
+- filename: minicpm-v-4_5-Q4_K_M.gguf
+  sha256: c1c3c33100b15b4caf7319acce4e23c0eb0ce1cbd12f70e8d24f05aa67b7512f
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/ggml-model-Q4_K_M.gguf
+- filename: minicpm-v-4_5-mmproj-f16.gguf
+  uri: huggingface://openbmb/MiniCPM-V-4_5-gguf/mmproj-model-f16.gguf
+  sha256: 7a7225a32e8d453aaa3d22d8c579b5bf833c253f784cdb05c99c9a76fd616df8
--- a/assets.go
+++ b/assets.go
@@ -1,6 +0,0 @@
-package main
-
-import "embed"
-
-//go:embed backend-assets/*
-var backendAssets embed.FS
--- a/backend/Dockerfile.golang
+++ b/backend/Dockerfile.golang
@@ -0,0 +1,131 @@
+ARG BASE_IMAGE=ubuntu:22.04
+
+FROM ${BASE_IMAGE} AS builder
+ARG BACKEND=rerankers
+ARG BUILD_TYPE
+ENV BUILD_TYPE=${BUILD_TYPE}
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
+ARG SKIP_DRIVERS=false
+ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
+ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
+ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETARCH
+ARG TARGETVARIANT
+ARG GO_VERSION=1.22.6
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        git ccache \
+        ca-certificates \
+        make cmake \
+        curl unzip \
+        libssl-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+
+# Cuda
+ENV PATH=/usr/local/cuda/bin:${PATH}
+
+# HipBLAS requirements
+ENV PATH=/opt/rocm/bin:${PATH}
+
+# Vulkan requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# CuBLAS requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils
+        if [ "amd64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        fi
+        if [ "arm64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
+        fi
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+        rm -f cuda-keyring_1.1-1_all.deb && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            libclblast-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
+    ; fi
+
+RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            hipblas-dev \
+            rocblas-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
+        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
+        ldconfig \
+    ; fi
+
+# Install Go
+RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
+ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin:/usr/local/bin
+
+# Install grpc compilers
+RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+RUN echo "TARGETARCH: $TARGETARCH"
+
+# We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
+# but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
+# here so that we can generate the grpc code for the stablediffusion build
+RUN <<EOT bash
+    if [ "amd64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+    if [ "arm64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+EOT
+
+COPY . /LocalAI
+
+RUN cd /LocalAI && make protogen-go && make -C /LocalAI/backend/go/${BACKEND} build
+
+FROM scratch
+ARG BACKEND=rerankers
+
+COPY --from=builder /LocalAI/backend/go/${BACKEND}/package/. ./
--- a/backend/Dockerfile.llama-cpp
+++ b/backend/Dockerfile.llama-cpp
@@ -0,0 +1,207 @@
+ARG BASE_IMAGE=ubuntu:22.04
+ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
+
+
+# The grpc target does one thing, it builds and installs GRPC.  This is in it's own layer so that it can be effectively cached by CI.
+# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
+FROM ${GRPC_BASE_IMAGE} AS grpc
+
+# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
+ARG GRPC_MAKEFLAGS="-j4 -Otarget"
+ARG GRPC_VERSION=v1.65.0
+ARG CMAKE_FROM_SOURCE=false
+ARG CMAKE_VERSION=3.26.4
+
+ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
+
+WORKDIR /build
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates \
+        build-essential curl libssl-dev \
+        git && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install CMake (the version in 22.04 is too old)
+RUN <<EOT bash
+    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
+        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
+    else
+        apt-get update && \
+        apt-get install -y \
+            cmake && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
+# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
+# and running make install in the target container
+RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    mkdir -p /build/grpc/cmake/build && \
+    cd /build/grpc/cmake/build && \
+    sed -i "216i\  TESTONLY" "../../third_party/abseil-cpp/absl/container/CMakeLists.txt" && \
+    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
+    make && \
+    make install && \
+    rm -rf /build
+
+FROM ${BASE_IMAGE} AS builder
+ARG BACKEND=rerankers
+ARG BUILD_TYPE
+ENV BUILD_TYPE=${BUILD_TYPE}
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
+ARG SKIP_DRIVERS=false
+ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
+ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
+ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETARCH
+ARG TARGETVARIANT
+ARG GO_VERSION=1.22.6
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ccache git \
+        ca-certificates \
+        make \
+        curl unzip \
+        libssl-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Cuda
+ENV PATH=/usr/local/cuda/bin:${PATH}
+
+# HipBLAS requirements
+ENV PATH=/opt/rocm/bin:${PATH}
+
+# Vulkan requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# CuBLAS requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils
+        if [ "amd64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        fi
+        if [ "arm64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
+        fi
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+        rm -f cuda-keyring_1.1-1_all.deb && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            libclblast-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
+    ; fi
+
+RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            hipblas-dev \
+            rocblas-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
+        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
+        ldconfig \
+    ; fi
+
+RUN echo "TARGETARCH: $TARGETARCH"
+
+# We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
+# but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
+# here so that we can generate the grpc code for the stablediffusion build
+RUN <<EOT bash
+    if [ "amd64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+    if [ "arm64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+EOT
+
+# Install CMake (the version in 22.04 is too old)
+RUN <<EOT bash
+    if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
+        curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
+    else
+        apt-get update && \
+        apt-get install -y \
+            cmake && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+COPY --from=grpc /opt/grpc /usr/local
+
+
+COPY . /LocalAI
+
+## Otherwise just run the normal build
+RUN <<EOT bash
+if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then \
+        cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-fallback && \
+        make llama-cpp-grpc && make llama-cpp-rpc-server; \
+    else \
+        cd /LocalAI/backend/cpp/llama-cpp && make llama-cpp-avx && \
+        make llama-cpp-avx2 && \
+        make llama-cpp-avx512 && \
+        make llama-cpp-fallback && \
+        make llama-cpp-grpc && \
+        make llama-cpp-rpc-server; \
+    fi  
+EOT
+
+
+# Copy libraries using a script to handle architecture differences
+RUN make -C /LocalAI/backend/cpp/llama-cpp package
+
+
+FROM scratch
+
+
+# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
+COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./
--- a/backend/Dockerfile.python
+++ b/backend/Dockerfile.python
@@ -0,0 +1,123 @@
+ARG BASE_IMAGE=ubuntu:22.04
+
+FROM ${BASE_IMAGE} AS builder
+ARG BACKEND=rerankers
+ARG BUILD_TYPE
+ENV BUILD_TYPE=${BUILD_TYPE}
+ARG CUDA_MAJOR_VERSION
+ARG CUDA_MINOR_VERSION
+ARG SKIP_DRIVERS=false
+ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
+ENV CUDA_MINOR_VERSION=${CUDA_MINOR_VERSION}
+ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ccache \
+        ca-certificates \
+        espeak-ng \
+        curl \
+        libssl-dev \
+        git \
+        git-lfs \
+        unzip clang \
+        upx-ucl \
+        curl python3-pip \
+        python-is-python3 \
+        python3-dev llvm \
+        python3-venv make && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install --upgrade pip
+
+
+# Cuda
+ENV PATH=/usr/local/cuda/bin:${PATH}
+
+# HipBLAS requirements
+ENV PATH=/opt/rocm/bin:${PATH}
+
+# Vulkan requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "vulkan" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+        apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# CuBLAS requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils
+        if [ "amd64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+        fi
+        if [ "arm64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
+        fi
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+        rm -f cuda-keyring_1.1-1_all.deb && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            libclblast-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
+    ; fi
+
+RUN if [ "${BUILD_TYPE}" = "hipblas" ] && [ "${SKIP_DRIVERS}" = "false" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            hipblas-dev \
+            rocblas-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
+        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
+        ldconfig \
+    ; fi
+# Install uv as a system package
+RUN curl -LsSf https://astral.sh/uv/install.sh | UV_INSTALL_DIR=/usr/bin sh
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+
+# Install grpcio-tools (the version in 22.04 is too old)
+RUN pip install --user grpcio-tools==1.71.0 grpcio==1.71.0
+
+COPY python/${BACKEND} /${BACKEND}
+COPY backend.proto /${BACKEND}/backend.proto
+COPY python/common/ /${BACKEND}/common
+
+RUN cd /${BACKEND} && PORTABLE_PYTHON=true make
+
+FROM scratch
+ARG BACKEND=rerankers
+COPY --from=builder /${BACKEND}/ /
--- a/backend/README.md
+++ b/backend/README.md
@@ -0,0 +1,213 @@
+# LocalAI Backend Architecture
+
+This directory contains the core backend infrastructure for LocalAI, including the gRPC protocol definition, multi-language Dockerfiles, and language-specific backend implementations.
+
+## Overview
+
+LocalAI uses a unified gRPC-based architecture that allows different programming languages to implement AI backends while maintaining consistent interfaces and capabilities. The backend system supports multiple hardware acceleration targets and provides a standardized way to integrate various AI models and frameworks.
+
+## Architecture Components
+
+### 1. Protocol Definition (`backend.proto`)
+
+The `backend.proto` file defines the gRPC service interface that all backends must implement. This ensures consistency across different language implementations and provides a contract for communication between LocalAI core and backend services.
+
+#### Core Services
+
+- **Text Generation**: `Predict`, `PredictStream` for LLM inference
+- **Embeddings**: `Embedding` for text vectorization
+- **Image Generation**: `GenerateImage` for stable diffusion and image models
+- **Audio Processing**: `AudioTranscription`, `TTS`, `SoundGeneration`
+- **Video Generation**: `GenerateVideo` for video synthesis
+- **Object Detection**: `Detect` for computer vision tasks
+- **Vector Storage**: `StoresSet`, `StoresGet`, `StoresFind` for RAG operations
+- **Reranking**: `Rerank` for document relevance scoring
+- **Voice Activity Detection**: `VAD` for audio segmentation
+
+#### Key Message Types
+
+- **`PredictOptions`**: Comprehensive configuration for text generation
+- **`ModelOptions`**: Model loading and configuration parameters
+- **`Result`**: Standardized response format
+- **`StatusResponse`**: Backend health and memory usage information
+
+### 2. Multi-Language Dockerfiles
+
+The backend system provides language-specific Dockerfiles that handle the build environment and dependencies for different programming languages:
+
+- `Dockerfile.python`
+- `Dockerfile.golang`
+- `Dockerfile.llama-cpp`
+
+### 3. Language-Specific Implementations
+
+#### Python Backends (`python/`)
+- **transformers**: Hugging Face Transformers framework
+- **vllm**: High-performance LLM inference
+- **mlx**: Apple Silicon optimization
+- **diffusers**: Stable Diffusion models
+- **Audio**: bark, coqui, faster-whisper, kitten-tts
+- **Vision**: mlx-vlm, rfdetr
+- **Specialized**: rerankers, chatterbox, kokoro
+
+#### Go Backends (`go/`)
+- **whisper**: OpenAI Whisper speech recognition in Go with GGML cpp backend (whisper.cpp)
+- **stablediffusion-ggml**: Stable Diffusion in Go with GGML Cpp backend
+- **huggingface**: Hugging Face model integration
+- **piper**: Text-to-speech synthesis Golang with C bindings using rhaspy/piper
+- **bark-cpp**: Bark TTS models Golang with Cpp bindings
+- **local-store**: Vector storage backend
+
+#### C++ Backends (`cpp/`)
+- **llama-cpp**: Llama.cpp integration
+- **grpc**: GRPC utilities and helpers
+
+## Hardware Acceleration Support
+
+### CUDA (NVIDIA)
+- **Versions**: CUDA 11.x, 12.x
+- **Features**: cuBLAS, cuDNN, TensorRT optimization
+- **Targets**: x86_64, ARM64 (Jetson)
+
+### ROCm (AMD)
+- **Features**: HIP, rocBLAS, MIOpen
+- **Targets**: AMD GPUs with ROCm support
+
+### Intel
+- **Features**: oneAPI, Intel Extension for PyTorch
+- **Targets**: Intel GPUs, XPUs, CPUs
+
+### Vulkan
+- **Features**: Cross-platform GPU acceleration
+- **Targets**: Windows, Linux, Android, macOS
+
+### Apple Silicon
+- **Features**: MLX framework, Metal Performance Shaders
+- **Targets**: M1/M2/M3 Macs
+
+## Backend Registry (`index.yaml`)
+
+The `index.yaml` file serves as a central registry for all available backends, providing:
+
+- **Metadata**: Name, description, license, icons
+- **Capabilities**: Hardware targets and optimization profiles
+- **Tags**: Categorization for discovery
+- **URLs**: Source code and documentation links
+
+## Building Backends
+
+### Prerequisites
+- Docker with multi-architecture support
+- Appropriate hardware drivers (CUDA, ROCm, etc.)
+- Build tools (make, cmake, compilers)
+
+### Build Commands
+
+Example of build commands with Docker
+
+```bash
+# Build Python backend
+docker build -f backend/Dockerfile.python \
+  --build-arg BACKEND=transformers \
+  --build-arg BUILD_TYPE=cublas12 \
+  --build-arg CUDA_MAJOR_VERSION=12 \
+  --build-arg CUDA_MINOR_VERSION=8 \
+  -t localai-backend-transformers .
+
+# Build Go backend
+docker build -f backend/Dockerfile.golang \
+  --build-arg BACKEND=whisper \
+  --build-arg BUILD_TYPE=cpu \
+  -t localai-backend-whisper .
+
+# Build C++ backend
+docker build -f backend/Dockerfile.llama-cpp \
+  --build-arg BACKEND=llama-cpp \
+  --build-arg BUILD_TYPE=cublas12 \
+  -t localai-backend-llama-cpp .
+```
+
+For ARM64/Mac builds, docker can't be used, and the makefile in the respective backend has to be used.
+
+### Build Types
+
+- **`cpu`**: CPU-only optimization
+- **`cublas11`**: CUDA 11.x with cuBLAS
+- **`cublas12`**: CUDA 12.x with cuBLAS
+- **`hipblas`**: ROCm with rocBLAS
+- **`intel`**: Intel oneAPI optimization
+- **`vulkan`**: Vulkan-based acceleration
+- **`metal`**: Apple Metal optimization
+
+## Backend Development
+
+### Creating a New Backend
+
+1. **Choose Language**: Select Python, Go, or C++ based on requirements
+2. **Implement Interface**: Implement the gRPC service defined in `backend.proto`
+3. **Add Dependencies**: Create appropriate requirements files
+4. **Configure Build**: Set up Dockerfile and build scripts
+5. **Register Backend**: Add entry to `index.yaml`
+6. **Test Integration**: Verify gRPC communication and functionality
+
+### Backend Structure
+
+```
+backend-name/
+├── backend.py/go/cpp    # Main implementation
+├── requirements.txt      # Dependencies
+├── Dockerfile           # Build configuration
+├── install.sh           # Installation script
+├── run.sh              # Execution script
+├── test.sh             # Test script
+└── README.md           # Backend documentation
+```
+
+### Required gRPC Methods
+
+At minimum, backends must implement:
+- `Health()` - Service health check
+- `LoadModel()` - Model loading and initialization
+- `Predict()` - Main inference endpoint
+- `Status()` - Backend status and metrics
+
+## Integration with LocalAI Core
+
+Backends communicate with LocalAI core through gRPC:
+
+1. **Service Discovery**: Core discovers available backends
+2. **Model Loading**: Core requests model loading via `LoadModel`
+3. **Inference**: Core sends requests via `Predict` or specialized endpoints
+4. **Streaming**: Core handles streaming responses for real-time generation
+5. **Monitoring**: Core tracks backend health and performance
+
+## Performance Optimization
+
+### Memory Management
+- **Model Caching**: Efficient model loading and caching
+- **Batch Processing**: Optimize for multiple concurrent requests
+- **Memory Pinning**: GPU memory optimization for CUDA/ROCm
+
+### Hardware Utilization
+- **Multi-GPU**: Support for tensor parallelism
+- **Mixed Precision**: FP16/BF16 for memory efficiency
+- **Kernel Fusion**: Optimized CUDA/ROCm kernels
+
+## Troubleshooting
+
+### Common Issues
+
+1. **GRPC Connection**: Verify backend service is running and accessible
+2. **Model Loading**: Check model paths and dependencies
+3. **Hardware Detection**: Ensure appropriate drivers and libraries
+4. **Memory Issues**: Monitor GPU memory usage and model sizes
+
+## Contributing
+
+When contributing to the backend system:
+
+1. **Follow Protocol**: Implement the exact gRPC interface
+2. **Add Tests**: Include comprehensive test coverage
+3. **Document**: Provide clear usage examples
+4. **Optimize**: Consider performance and resource usage
+5. **Validate**: Test across different hardware targets
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -14,10 +14,94 @@ service Backend {
  rpc PredictStream(PredictOptions) returns (stream Reply) {}
  rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
+  rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
  rpc TTS(TTSRequest) returns (Result) {}
+  rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
  rpc Status(HealthMessage) returns (StatusResponse) {}
+  rpc Detect(DetectOptions) returns (DetectResponse) {}
+
+  rpc StoresSet(StoresSetOptions) returns (Result) {}
+  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
+  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
+  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
+
+  rpc Rerank(RerankRequest) returns (RerankResult) {}
+
+  rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
+
+  rpc VAD(VADRequest) returns (VADResponse) {}
+}
+
+// Define the empty request
+message MetricsRequest {}
+
+message MetricsResponse {
+  int32 slot_id = 1;
+  string prompt_json_for_slot = 2;  // Stores the prompt as a JSON string.
+  float tokens_per_second = 3;
+  int32 tokens_generated = 4;
+  int32 prompt_tokens_processed = 5;
+}
+
+message RerankRequest {
+  string query = 1;
+  repeated string documents = 2;
+  int32 top_n = 3;
+}
+
+message RerankResult {
+  Usage usage = 1;
+  repeated DocumentResult results = 2;
+}
+
+message Usage {
+  int32 total_tokens = 1;
+  int32 prompt_tokens = 2;
+}
+
+message DocumentResult {
+  int32 index = 1;
+  string text = 2;
+  float relevance_score = 3;
+}
+
+message StoresKey {
+  repeated float Floats = 1;
+}
+
+message StoresValue {
+  bytes Bytes = 1;
+}
+
+message StoresSetOptions {
+  repeated StoresKey Keys = 1;
+  repeated StoresValue Values = 2;
+}
+
+message StoresDeleteOptions {
+  repeated StoresKey Keys = 1;
+}
+
+message StoresGetOptions {
+  repeated StoresKey Keys = 1;
+}
+
+message StoresGetResult {
+  repeated StoresKey Keys = 1;
+  repeated StoresValue Values = 2;
+}
+
+message StoresFindOptions {
+  StoresKey Key = 1;
+  int32 TopK = 2;
+}
+
+message StoresFindResult {
+  repeated StoresKey Keys = 1;
+  repeated StoresValue Values = 2;
+  repeated float Similarities = 3;
 }

 message HealthMessage {}
@@ -65,11 +149,25 @@ message PredictOptions {
  string NegativePrompt = 40;
  int32 NDraft = 41;
  repeated string Images = 42;
+  bool UseTokenizerTemplate = 43;
+  repeated Message Messages = 44;
+  repeated string Videos = 45;
+  repeated string Audios = 46;
+  string CorrelationId = 47;
 }

 // The response message containing the result
 message Reply {
  bytes message = 1;
+  int32 tokens = 2;
+  int32 prompt_tokens = 3;
+  double timing_prompt_processing = 4;
+  double timing_token_generation = 5;
+  bytes audio = 6;
+}
+
+message GrammarTrigger {
+  string word = 1;
 }

 message ModelOptions {
@@ -88,18 +186,13 @@ message ModelOptions {
  string MainGPU = 13;
  string TensorSplit = 14;
  int32 Threads = 15;
-  string LibrarySearchPath = 16;
  float RopeFreqBase = 17;
  float RopeFreqScale = 18;
  float RMSNormEps = 19;
  int32 NGQA = 20;
  string ModelFile = 21;

-  // AutoGPTQ
-  string Device = 22;
-  bool UseTriton = 23;
-  string ModelBaseName = 24;
-  bool UseFastTokenizer = 25;
+

  // Diffusers
  string PipelineType = 26;
@@ -121,7 +214,7 @@ message ModelOptions {

  bool NoMulMatQ = 37;
  string DraftModel = 39;
-  
+
  string AudioPath = 38;

  // vllm
@@ -131,6 +224,13 @@ message ModelOptions {
  bool   EnforceEager = 52;
  int32  SwapSpace = 53;
  int32  MaxModelLen = 54;
+  int32  TensorParallelSize = 55;
+  string LoadFormat = 58;
+  bool   DisableLogStatus = 66;
+  string DType = 67;
+  int32  LimitImagePerPrompt = 68;
+  int32  LimitVideoPerPrompt = 69;
+  int32  LimitAudioPerPrompt = 70;

  string MMProj = 41;

@@ -141,6 +241,25 @@ message ModelOptions {
  float YarnBetaSlow = 47;

  string Type = 49;
+
+  string FlashAttention = 56;
+  bool NoKVOffload = 57;
+
+  string ModelPath = 59;
+
+  repeated string LoraAdapters = 60;
+  repeated float LoraScales = 61;
+
+  repeated string Options = 62;
+
+  string CacheTypeKey = 63;
+  string CacheTypeValue = 64;
+
+  repeated GrammarTrigger GrammarTriggers = 65;
+
+  bool Reranking = 71;
+
+  repeated string Overrides = 72;
 }

 message Result {
@@ -156,6 +275,8 @@ message TranscriptRequest {
  string dst = 2;
  string language = 3;
  uint32 threads = 4;
+  bool translate = 5;
+  bool diarize = 6;
 }

 message TranscriptResult {
@@ -185,6 +306,24 @@ message GenerateImageRequest {
  // Diffusers
  string EnableParameters = 10;
  int32 CLIPSkip = 11;
+
+  // Reference images for models that support them (e.g., Flux Kontext)
+  repeated string ref_images = 12;
+}
+
+message GenerateVideoRequest {
+  string prompt = 1;
+  string negative_prompt = 2;  // Negative prompt for video generation
+  string start_image = 3;  // Path or base64 encoded image for the start frame
+  string end_image = 4;    // Path or base64 encoded image for the end frame
+  int32 width = 5;
+  int32 height = 6;
+  int32 num_frames = 7;    // Number of frames to generate
+  int32 fps = 8;          // Frames per second
+  int32 seed = 9;
+  float cfg_scale = 10;    // Classifier-free guidance scale
+  int32 step = 11;         // Number of inference steps
+  string dst = 12;        // Output path for the generated video
 }

 message TTSRequest {
@@ -192,6 +331,31 @@ message TTSRequest {
  string model = 2;
  string dst = 3;
  string voice = 4;
+  optional string language = 5;
+}
+
+message VADRequest {
+  repeated float audio = 1;
+}
+
+message VADSegment {
+  float start = 1;
+  float end = 2;
+}
+
+message VADResponse {
+  repeated VADSegment segments = 1;
+}
+
+message SoundGenerationRequest {
+  string text = 1;
+  string model = 2;
+  string dst = 3;
+  optional float duration = 4;
+  optional float temperature = 5;
+  optional bool sample = 6;
+  optional string src = 7;
+  optional int32 src_divisor = 8;
 }

 message TokenizationResponse {
@@ -213,4 +377,26 @@ message StatusResponse {
  }
  State state = 1;
  MemoryUsageData memory = 2;
-}
+}
+
+message Message {
+  string role = 1;
+  string content = 2;
+}
+
+message DetectOptions {
+  string src = 1;
+}
+
+message Detection {
+  float x = 1;
+  float y = 2;
+  float width = 3;
+  float height = 4;
+  float confidence = 5;
+  string class_name = 6;
+}
+
+message DetectResponse {
+  repeated Detection Detections = 1;
+}
--- a/backend/backend_grpc.pb.go
+++ b/backend/backend_grpc.pb.go
@@ -1,457 +0,0 @@
-// Code generated by protoc-gen-go-grpc. DO NOT EDIT.
-// versions:
-// - protoc-gen-go-grpc v1.2.0
-// - protoc             v4.23.4
-// source: backend/backend.proto
-
-package proto
-
-import (
-	context "context"
-	grpc "google.golang.org/grpc"
-	codes "google.golang.org/grpc/codes"
-	status "google.golang.org/grpc/status"
-)
-
-// This is a compile-time assertion to ensure that this generated file
-// is compatible with the grpc package it is being compiled against.
-// Requires gRPC-Go v1.32.0 or later.
-const _ = grpc.SupportPackageIsVersion7
-
-// BackendClient is the client API for Backend service.
-//
-// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
-type BackendClient interface {
-	Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error)
-	Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error)
-	LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error)
-	PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error)
-	Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error)
-	GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error)
-	AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error)
-	TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error)
-	TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error)
-	Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error)
-}
-
-type backendClient struct {
-	cc grpc.ClientConnInterface
-}
-
-func NewBackendClient(cc grpc.ClientConnInterface) BackendClient {
-	return &backendClient{cc}
-}
-
-func (c *backendClient) Health(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*Reply, error) {
-	out := new(Reply)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Health", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) Predict(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*Reply, error) {
-	out := new(Reply)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Predict", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) LoadModel(ctx context.Context, in *ModelOptions, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/backend.Backend/LoadModel", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) PredictStream(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (Backend_PredictStreamClient, error) {
-	stream, err := c.cc.NewStream(ctx, &Backend_ServiceDesc.Streams[0], "/backend.Backend/PredictStream", opts...)
-	if err != nil {
-		return nil, err
-	}
-	x := &backendPredictStreamClient{stream}
-	if err := x.ClientStream.SendMsg(in); err != nil {
-		return nil, err
-	}
-	if err := x.ClientStream.CloseSend(); err != nil {
-		return nil, err
-	}
-	return x, nil
-}
-
-type Backend_PredictStreamClient interface {
-	Recv() (*Reply, error)
-	grpc.ClientStream
-}
-
-type backendPredictStreamClient struct {
-	grpc.ClientStream
-}
-
-func (x *backendPredictStreamClient) Recv() (*Reply, error) {
-	m := new(Reply)
-	if err := x.ClientStream.RecvMsg(m); err != nil {
-		return nil, err
-	}
-	return m, nil
-}
-
-func (c *backendClient) Embedding(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*EmbeddingResult, error) {
-	out := new(EmbeddingResult)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Embedding", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) GenerateImage(ctx context.Context, in *GenerateImageRequest, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/backend.Backend/GenerateImage", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) AudioTranscription(ctx context.Context, in *TranscriptRequest, opts ...grpc.CallOption) (*TranscriptResult, error) {
-	out := new(TranscriptResult)
-	err := c.cc.Invoke(ctx, "/backend.Backend/AudioTranscription", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) TTS(ctx context.Context, in *TTSRequest, opts ...grpc.CallOption) (*Result, error) {
-	out := new(Result)
-	err := c.cc.Invoke(ctx, "/backend.Backend/TTS", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) TokenizeString(ctx context.Context, in *PredictOptions, opts ...grpc.CallOption) (*TokenizationResponse, error) {
-	out := new(TokenizationResponse)
-	err := c.cc.Invoke(ctx, "/backend.Backend/TokenizeString", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-func (c *backendClient) Status(ctx context.Context, in *HealthMessage, opts ...grpc.CallOption) (*StatusResponse, error) {
-	out := new(StatusResponse)
-	err := c.cc.Invoke(ctx, "/backend.Backend/Status", in, out, opts...)
-	if err != nil {
-		return nil, err
-	}
-	return out, nil
-}
-
-// BackendServer is the server API for Backend service.
-// All implementations must embed UnimplementedBackendServer
-// for forward compatibility
-type BackendServer interface {
-	Health(context.Context, *HealthMessage) (*Reply, error)
-	Predict(context.Context, *PredictOptions) (*Reply, error)
-	LoadModel(context.Context, *ModelOptions) (*Result, error)
-	PredictStream(*PredictOptions, Backend_PredictStreamServer) error
-	Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error)
-	GenerateImage(context.Context, *GenerateImageRequest) (*Result, error)
-	AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error)
-	TTS(context.Context, *TTSRequest) (*Result, error)
-	TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error)
-	Status(context.Context, *HealthMessage) (*StatusResponse, error)
-	mustEmbedUnimplementedBackendServer()
-}
-
-// UnimplementedBackendServer must be embedded to have forward compatible implementations.
-type UnimplementedBackendServer struct {
-}
-
-func (UnimplementedBackendServer) Health(context.Context, *HealthMessage) (*Reply, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Health not implemented")
-}
-func (UnimplementedBackendServer) Predict(context.Context, *PredictOptions) (*Reply, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Predict not implemented")
-}
-func (UnimplementedBackendServer) LoadModel(context.Context, *ModelOptions) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method LoadModel not implemented")
-}
-func (UnimplementedBackendServer) PredictStream(*PredictOptions, Backend_PredictStreamServer) error {
-	return status.Errorf(codes.Unimplemented, "method PredictStream not implemented")
-}
-func (UnimplementedBackendServer) Embedding(context.Context, *PredictOptions) (*EmbeddingResult, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Embedding not implemented")
-}
-func (UnimplementedBackendServer) GenerateImage(context.Context, *GenerateImageRequest) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method GenerateImage not implemented")
-}
-func (UnimplementedBackendServer) AudioTranscription(context.Context, *TranscriptRequest) (*TranscriptResult, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method AudioTranscription not implemented")
-}
-func (UnimplementedBackendServer) TTS(context.Context, *TTSRequest) (*Result, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method TTS not implemented")
-}
-func (UnimplementedBackendServer) TokenizeString(context.Context, *PredictOptions) (*TokenizationResponse, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method TokenizeString not implemented")
-}
-func (UnimplementedBackendServer) Status(context.Context, *HealthMessage) (*StatusResponse, error) {
-	return nil, status.Errorf(codes.Unimplemented, "method Status not implemented")
-}
-func (UnimplementedBackendServer) mustEmbedUnimplementedBackendServer() {}
-
-// UnsafeBackendServer may be embedded to opt out of forward compatibility for this service.
-// Use of this interface is not recommended, as added methods to BackendServer will
-// result in compilation errors.
-type UnsafeBackendServer interface {
-	mustEmbedUnimplementedBackendServer()
-}
-
-func RegisterBackendServer(s grpc.ServiceRegistrar, srv BackendServer) {
-	s.RegisterService(&Backend_ServiceDesc, srv)
-}
-
-func _Backend_Health_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(HealthMessage)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Health(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Health",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Health(ctx, req.(*HealthMessage))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_Predict_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Predict(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Predict",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Predict(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_LoadModel_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(ModelOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).LoadModel(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/LoadModel",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).LoadModel(ctx, req.(*ModelOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_PredictStream_Handler(srv interface{}, stream grpc.ServerStream) error {
-	m := new(PredictOptions)
-	if err := stream.RecvMsg(m); err != nil {
-		return err
-	}
-	return srv.(BackendServer).PredictStream(m, &backendPredictStreamServer{stream})
-}
-
-type Backend_PredictStreamServer interface {
-	Send(*Reply) error
-	grpc.ServerStream
-}
-
-type backendPredictStreamServer struct {
-	grpc.ServerStream
-}
-
-func (x *backendPredictStreamServer) Send(m *Reply) error {
-	return x.ServerStream.SendMsg(m)
-}
-
-func _Backend_Embedding_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Embedding(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Embedding",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Embedding(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_GenerateImage_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(GenerateImageRequest)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).GenerateImage(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/GenerateImage",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).GenerateImage(ctx, req.(*GenerateImageRequest))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_AudioTranscription_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(TranscriptRequest)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).AudioTranscription(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/AudioTranscription",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).AudioTranscription(ctx, req.(*TranscriptRequest))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_TTS_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(TTSRequest)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).TTS(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/TTS",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).TTS(ctx, req.(*TTSRequest))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_TokenizeString_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(PredictOptions)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).TokenizeString(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/TokenizeString",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).TokenizeString(ctx, req.(*PredictOptions))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-func _Backend_Status_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) {
-	in := new(HealthMessage)
-	if err := dec(in); err != nil {
-		return nil, err
-	}
-	if interceptor == nil {
-		return srv.(BackendServer).Status(ctx, in)
-	}
-	info := &grpc.UnaryServerInfo{
-		Server:     srv,
-		FullMethod: "/backend.Backend/Status",
-	}
-	handler := func(ctx context.Context, req interface{}) (interface{}, error) {
-		return srv.(BackendServer).Status(ctx, req.(*HealthMessage))
-	}
-	return interceptor(ctx, in, info, handler)
-}
-
-// Backend_ServiceDesc is the grpc.ServiceDesc for Backend service.
-// It's only intended for direct use with grpc.RegisterService,
-// and not to be introspected or modified (even as a copy)
-var Backend_ServiceDesc = grpc.ServiceDesc{
-	ServiceName: "backend.Backend",
-	HandlerType: (*BackendServer)(nil),
-	Methods: []grpc.MethodDesc{
-		{
-			MethodName: "Health",
-			Handler:    _Backend_Health_Handler,
-		},
-		{
-			MethodName: "Predict",
-			Handler:    _Backend_Predict_Handler,
-		},
-		{
-			MethodName: "LoadModel",
-			Handler:    _Backend_LoadModel_Handler,
-		},
-		{
-			MethodName: "Embedding",
-			Handler:    _Backend_Embedding_Handler,
-		},
-		{
-			MethodName: "GenerateImage",
-			Handler:    _Backend_GenerateImage_Handler,
-		},
-		{
-			MethodName: "AudioTranscription",
-			Handler:    _Backend_AudioTranscription_Handler,
-		},
-		{
-			MethodName: "TTS",
-			Handler:    _Backend_TTS_Handler,
-		},
-		{
-			MethodName: "TokenizeString",
-			Handler:    _Backend_TokenizeString_Handler,
-		},
-		{
-			MethodName: "Status",
-			Handler:    _Backend_Status_Handler,
-		},
-	},
-	Streams: []grpc.StreamDesc{
-		{
-			StreamName:    "PredictStream",
-			Handler:       _Backend_PredictStream_Handler,
-			ServerStreams: true,
-		},
-	},
-	Metadata: "backend/backend.proto",
-}
--- a/backend/cpp/grpc/Makefile
+++ b/backend/cpp/grpc/Makefile
@@ -5,7 +5,6 @@ SYSTEM ?= $(HOST_SYSTEM)
 TAG_LIB_GRPC?=v1.59.0
 GIT_REPO_LIB_GRPC?=https://github.com/grpc/grpc.git
 GIT_CLONE_DEPTH?=1
-NUM_BUILD_THREADS?=$(shell nproc --ignore=1)

 INSTALLED_PACKAGES=installed_packages
 GRPC_REPO=grpc_repo
@@ -47,12 +46,17 @@ endif
 $(INSTALLED_PACKAGES): grpc_build

 $(GRPC_REPO):
-	git clone --depth $(GIT_CLONE_DEPTH) -b $(TAG_LIB_GRPC) $(GIT_REPO_LIB_GRPC) $(GRPC_REPO)/grpc
-	cd $(GRPC_REPO)/grpc && git submodule update --init --recursive --depth $(GIT_CLONE_DEPTH)
-
+	mkdir -p $(GRPC_REPO)/grpc
+	cd $(GRPC_REPO)/grpc && \
+	git init && \
+	git remote add origin $(GIT_REPO_LIB_GRPC)  && \
+	git fetch origin && \
+	git checkout $(TAG_LIB_GRPC) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+	
 $(GRPC_BUILD): $(GRPC_REPO)
 	mkdir -p $(GRPC_BUILD)
-	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . -- -j ${NUM_BUILD_THREADS} && cmake --build . --target install -- -j ${NUM_BUILD_THREADS}
+	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . && cmake --build . --target install

 build: $(INSTALLED_PACKAGES)

--- a/backend/cpp/llama-cpp/CMakeLists.txt
+++ b/backend/cpp/llama-cpp/CMakeLists.txt
@@ -1,20 +1,3 @@
-
-## XXX: In some versions of CMake clip wasn't being built before llama.
-## This is an hack for now, but it should be fixed in the future.
-set(TARGET myclip)
-add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
-install(TARGETS ${TARGET} LIBRARY)
-target_include_directories(myclip PUBLIC .)
-target_include_directories(myclip PUBLIC ../..)
-target_include_directories(myclip PUBLIC ../../common)
-target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-if (NOT MSVC)
-    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
-endif()
-# END CLIP hack
-
-
 set(TARGET grpc-server)
 set(CMAKE_CXX_STANDARD 17)
 cmake_minimum_required(VERSION 3.15)
@@ -74,8 +57,12 @@ add_library(hw_grpc_proto
  ${hw_proto_srcs}
  ${hw_proto_hdrs} )

-add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
-target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp httplib.h)
+
+target_include_directories(${TARGET} PRIVATE ../llava)
+target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
+
+target_link_libraries(${TARGET} PRIVATE common llama mtmd ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
  absl::flags_parse
  gRPC::${_REFLECTION}
  gRPC::${_GRPC_GRPCPP}
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -0,0 +1,166 @@
+
+LLAMA_VERSION?=8ff206097c2bf3ca1c7aa95f9d6db779fc7bdd68
+LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
+
+CMAKE_ARGS?=
+BUILD_TYPE?=
+NATIVE?=false
+ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
+TARGET?=--target grpc-server
+JOBS?=$(shell nproc)
+
+# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF -DLLAMA_CURL=OFF
+
+CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DGGML_CUDA=ON
+# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# to CMAKE_ARGS automatically
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblas)
+	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
+else ifeq ($(BUILD_TYPE),hipblas)
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS?=gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIP=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DGGML_VULKAN=1
+else ifeq ($(OS),Darwin)
+	ifeq ($(BUILD_TYPE),)
+		BUILD_TYPE=metal
+	endif
+	ifneq ($(BUILD_TYPE),metal)
+		CMAKE_ARGS+=-DGGML_METAL=OFF
+	else
+		CMAKE_ARGS+=-DGGML_METAL=ON
+		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+		CMAKE_ARGS+=-DGGML_METAL_USE_BF16=ON
+		CMAKE_ARGS+=-DGGML_OPENMP=OFF
+	endif
+	TARGET+=--target ggml-metal
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f16)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DCMAKE_CXX_FLAGS="-fsycl" \
+		-DGGML_SYCL_F16=ON
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f32)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DCMAKE_CXX_FLAGS="-fsycl"
+endif
+
+INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
+INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
+ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
+				 -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
+				 -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
+				 -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
+				 -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
+build-llama-cpp-grpc-server:
+# Conditionally build grpc for the llama backend to use if needed
+ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
+	$(MAKE) -C ../../grpc build
+	_PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/proto \
+	_GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin \
+	PATH="${INSTALLED_PACKAGES}/bin:${PATH}" \
+	CMAKE_ARGS="${CMAKE_ARGS} ${ADDED_CMAKE_ARGS}" \
+	LLAMA_VERSION=$(LLAMA_VERSION) \
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
+else
+	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
+	LLAMA_VERSION=$(LLAMA_VERSION) $(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(VARIANT) grpc-server
+endif
+
+llama-cpp-avx2: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
+	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
+
+llama-cpp-avx512: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
+	$(info ${GREEN}I llama-cpp build info:avx512${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
+
+llama-cpp-avx: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
+	$(info ${GREEN}I llama-cpp build info:avx${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
+
+llama-cpp-fallback: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
+	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
+
+llama-cpp-grpc: llama.cpp
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
+	$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
+	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
+	cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
+
+llama-cpp-rpc-server: llama-cpp-grpc
+	cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
+
+llama.cpp:
+	mkdir -p llama.cpp
+	cd llama.cpp && \
+	git init && \
+	git remote add origin $(LLAMA_REPO)  && \
+	git fetch origin && \
+	git checkout -b build $(LLAMA_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+llama.cpp/tools/grpc-server: llama.cpp
+	mkdir -p llama.cpp/tools/grpc-server
+	bash prepare.sh
+
+rebuild:
+	bash prepare.sh
+	rm -rf grpc-server
+	$(MAKE) grpc-server
+
+package:
+	bash package.sh
+
+purge:
+	rm -rf llama.cpp/build
+	rm -rf llama.cpp/tools/grpc-server
+	rm -rf grpc-server
+
+clean: purge
+	rm -rf llama.cpp
+
+grpc-server: llama.cpp llama.cpp/tools/grpc-server
+	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+	+bash -c "source $(ONEAPI_VARS); \
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)"
+else
+	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release -j $(JOBS) $(TARGET)
+endif
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
--- a/backend/cpp/llama-cpp/package.sh
+++ b/backend/cpp/llama-cpp/package.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Script to copy the appropriate libraries based on architecture
+# This script is used in the final stage of the Dockerfile
+
+set -e
+
+CURDIR=$(dirname "$(realpath $0)")
+
+# Create lib directory
+mkdir -p $CURDIR/package/lib
+
+cp -avrf $CURDIR/llama-cpp-* $CURDIR/package/
+cp -rfv $CURDIR/run.sh $CURDIR/package/
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 $CURDIR/package/lib/ld.so
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 $CURDIR/package/lib/libc.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 $CURDIR/package/lib/libgcc_s.so.1
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 $CURDIR/package/lib/libstdc++.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 $CURDIR/package/lib/libm.so.6
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 $CURDIR/package/lib/libgomp.so.1
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+echo "Packaging completed successfully" 
+ls -liah $CURDIR/package/
+ls -liah $CURDIR/package/lib/
--- a/backend/cpp/llama-cpp/patches/01-llava.patch
+++ b/backend/cpp/llama-cpp/patches/01-llava.patch
@@ -0,0 +1,13 @@
+diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
+index 3cd0d2fa..6c5e811a 100644
+--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
+@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
+                 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+                 int* patches_data = (int*)malloc(ggml_nbytes(patches));
+                 for (int i = 0; i < num_patches; i++) {
+-                    patches_data[i] = i + 1;
+                    patches_data[i] = i;
+                 }
+                 ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
+                 free(patches_data);
--- a/backend/cpp/llama-cpp/prepare.sh
+++ b/backend/cpp/llama-cpp/prepare.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+## Patches
+## Apply patches from the `patches` directory
+for patch in $(ls patches); do
+    echo "Applying patch $patch"
+    patch -d llama.cpp/ -p1 < patches/$patch
+done 
+
+set -e
+
+cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
+cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/tools/server/utils.hpp llama.cpp/tools/grpc-server/
+cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
+
+set +e
+if grep -q "grpc-server" llama.cpp/tools/CMakeLists.txt; then
+    echo "grpc-server already added"
+else
+    echo "add_subdirectory(grpc-server)" >> llama.cpp/tools/CMakeLists.txt
+fi
+set -e
+
+# Now to keep maximum compatibility with the original server.cpp, we need to remove the index.html.gz.hpp and loading.html.hpp includes
+# and remove the main function
+# TODO: upstream this to the original server.cpp by extracting the upstream main function to a separate file
+awk '
+/int[ \t]+main[ \t]*\(/ {          # If the line starts the main function
+    in_main=1;                     # Set a flag
+    open_braces=0;                 # Track number of open braces
+}
+in_main {
+    open_braces += gsub(/\{/, "{"); # Count opening braces
+    open_braces -= gsub(/\}/, "}"); # Count closing braces
+    if (open_braces == 0) {         # If all braces are closed
+        in_main=0;                  # End skipping
+    }
+    next;                           # Skip lines inside main
+}
+!in_main                           # Print lines not inside main
+' "llama.cpp/tools/server/server.cpp" > llama.cpp/tools/grpc-server/server.cpp
+
+# remove index.html.gz.hpp and loading.html.hpp includes
+if [[ "$OSTYPE" == "darwin"* ]]; then
+    # macOS
+    sed -i '' '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
+else
+    # Linux and others
+    sed -i '/#include "index\.html\.gz\.hpp"/d; /#include "loading\.html\.hpp"/d' llama.cpp/tools/grpc-server/server.cpp
+fi
--- a/Show More
+++ b/Show More