Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 11:13:31 -05:00)

Compare commits: functions_ ... v2.17.1 (214 commits)
214 commits (SHA1 only; the author and date columns were not captured by the mirror):

8142bdc48f  89a11e15e7  06de542032  ecbb61cbf4  7f13e3a783  c926469b9c  c30b57a629  2f297979a7
2437a2769d  b58b7cad94  68148f2a1a  4897eb0ba2  1b43966c48  c5f2f11503  895443d1b5  6a0802e8e6
94cfaad7f4  ac4a94dd44  58bf8614d9  3764e50b35  3f464d2d9e  5116d561e1  96a7a3b59f  112d0ffa45
25f45827ab  f322f7c62d  06351cbbb4  8f952d90b0  7b205510f9  f183fec232  91f48b2143  f404580256
882556d4db  f8382adbf7  80298f94fa  0f8b489346  154694462e  347317d5d2  d40722d2fa  7b12300f15
3c50abffdd  2eb2ed84ab  5da10fb769  bec883e3ff  14b41be057  aff2acacf9  b4d4c0a18f  3a5f2283ea
d9109ffafb  d7e137295a  6c087ae743  88af1033d6  e96d2d7667  aae7ad9d73  23b3d22525  603d81dda1
a21a52d384  219078a5e0  3b7a78adda  0d62594099  d38e9090df  b049805c9b  0f9b58f2cf  0f134d557e
2676e127ae  270d4f8413  2d79cee8cb  4c9623f50d  596cf76135  a293aa1b79  c4eb02c80f  9c9198ff08
83c79d5453  88fd000065  956d652314  9ce2b4d71f  4e974cb4fc  d072835796  17cf6c4a4d  fab3e711ff
4e1463fec2  2fc6fe806b  bdd6769b2d  1ffee9989f  34ab442ce9  67aa31faad  6ef78ef7f6  daa7544d9c
34527737bb  148adebe16  bae2a649fd  90945ebab3  4a239a4bff  5ddaa19914  77d752a481  29ff51c12a
c0744899c9  c9092ad39c  b588cae70e  fb0f188c93  b99182c8d4  95c65d67f5  c603b95ac7  13cfa6de0a
0560c6fd57  f24dddae42  06b461b061  e50a7ba879  3b2bce1fc9  3fe7e9f678  654b661688  7f387fb238
5d31e5269d  ff8a6962cd  10c64dbb55  3f7212c660  5dc6bace49  3cd5918ae6  5b75bf16c7  0c40f545d4
b2fc92daa7  0787797961  2ba9e27bcf  4d98dd9ce7  087bceccac  7064697ce5  0b99be73b3  669cd06dd9
2bbc52fcc8  577888f3c0  1c80f628ff  10430a00bd  9f5c274321  d075dc44dd  be8ffbdfcf  eaf653f3d3
e9c28a1ed7  ba984c7097  ff1f9125ed  2c82058548  16433d2e8e  345047ed7c  6343758f9c  135208806c
3280de7adf  db3113c5c8  593fb62bf0  480834f75b  3200a6655e  b90cdced59  fc3502b56f  785adc1ed5
e25fc656c9  bb3ec56de3  785c54e7b0  003b43f6fc  663488b6bd  e1d6b706f4  29615576fb  f8cea16c03
e0187c2a1a  b76d2fe68a  ee4f722bf8  dce63237f2  0b637465d9  114f549f5e  ea330d452d  eb11a46a73
b57e14d65c  7efa8e75d4  7551369abe  79915bcd11  c8d7d14a37  c56bc0de98  3a9408363b  21a12c2cdd
371d0cc1f7  23fa92bec0  f91e4e5c03  6cbe6a4f99  491e1d752b  1542c58466  1a3dedece0  a58ff00ab1
fdb45153fe  16474bfb40  5a6d120a56  7a480bb16f  053531e434  b7ab4f25d9  73566a2bb2  8ccd5ab040
5a3db730b9  8ad669339e  a10a952085  b37447cac5  f2d182a2eb  6b6c8cdd5f  5f35e85e86  02f1b477df
9ab8f8f5e0  9a255d6453  e0ef9e2bb9  86627b27f7  4e92569d45  f7508e3888  badfc16df1  b584dcf18a
4c845fb47d  07c0559d06  beb598e4f9  c89271b2e4  29909666c3  566b5cf2ee
.env (5 changes)

```diff
@@ -71,6 +71,11 @@
 ### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
 # LLAMACPP_PARALLEL=1
 
+### Define a list of GRPC Servers for llama-cpp workers to distribute the load
+# https://github.com/ggerganov/llama.cpp/pull/6829
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
+# LLAMACPP_GRPC_SERVERS=""
+
 ### Enable to run parallel requests
 # LOCALAI_PARALLEL_REQUESTS=true
```
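A hedged usage note: `LLAMACPP_GRPC_SERVERS` takes a comma-separated list of `host:port` pairs pointing at llama.cpp RPC workers, so a hypothetical two-node setup might be `LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052"`. The addresses and port here are illustrative, not taken from this diff; see the linked llama.cpp RPC README for how to run the worker side.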
.github/checksum_checker.sh (vendored, 33 changes)

```diff
@@ -16,7 +16,7 @@ function check_and_update_checksum() {
     # Download the file and calculate new checksum using Python
     new_checksum=$(python3 -c "
 import hashlib
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, get_paths_info
 import requests
 import sys
 import os
@@ -46,13 +46,24 @@ def calculate_sha256(file_path):
 
 download_type, repo_id_or_url = parse_uri(uri)
 
+new_checksum = None
+
 # Decide download method based on URI type
 if download_type == 'huggingface':
-    try:
-        file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
-    except Exception as e:
-        print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
-        sys.exit(2)
+    # Use HF API to pull sha
+    for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
+        try:
+            new_checksum = file.lfs.sha256
+            break
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+    if new_checksum is None:
+        try:
+            file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
 else:
     response = requests.get(repo_id_or_url)
     if response.status_code == 200:
@@ -66,9 +77,13 @@ else:
         print(f'Error downloading file: {response.status_code}', file=sys.stderr)
         sys.exit(1)
 
-print(calculate_sha256(file_path))
 # Clean up the downloaded file
-os.remove(file_path)
+if new_checksum is None:
+    new_checksum = calculate_sha256(file_path)
+    print(new_checksum)
+    os.remove(file_path)
+else:
+    print(new_checksum)
 
 ")
 
 if [[ "$new_checksum" == "" ]]; then
```
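For context, the change above means the checker now prefers the Hub's stored LFS metadata over downloading multi-gigabyte model files just to hash them. A minimal standalone sketch of the same flow follows; the repo and file names are placeholders, and the `lfs.sha256` access mirrors the script above (it assumes the file is LFS-tracked whenever that metadata is present):

```python
# Hypothetical sketch: ask the Hub's paths-info API for the LFS sha256 first,
# and fall back to downloading and hashing only when no LFS metadata exists.
import hashlib
import os

from huggingface_hub import get_paths_info, hf_hub_download


def remote_sha256(repo_id: str, file_name: str) -> str:
    # Fast path: LFS-tracked files already expose their sha256 via the API.
    for info in get_paths_info(repo_id, [file_name], repo_type="model"):
        lfs = getattr(info, "lfs", None)  # None for non-LFS files and folders
        if lfs is not None:
            return lfs.sha256
    # Slow path: download the file into the local cache and hash it ourselves.
    path = hf_hub_download(repo_id=repo_id, filename=file_name)
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    os.remove(path)  # mirror the script's cleanup of the downloaded file
    return digest.hexdigest()


if __name__ == "__main__":
    # Placeholder repo and file names, not taken from this diff.
    print(remote_sha256("some-org/some-model-GGUF", "model.Q4_K_M.gguf"))
```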
.github/ci/modelslist.go (vendored, new file, 297 lines)

@@ -0,0 +1,297 @@
```go
package main

import (
	"fmt"
	"html/template"
	"io/ioutil"
	"os"

	"gopkg.in/yaml.v3"
)

var modelPageTemplate string = `
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>LocalAI models</title>
  <link href="https://cdnjs.cloudflare.com/ajax/libs/flowbite/2.3.0/flowbite.min.css" rel="stylesheet" />
  <script src="https://cdn.jsdelivr.net/npm/vanilla-lazyload@19.1.3/dist/lazyload.min.js"></script>

  <link
    rel="stylesheet"
    href="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.8.0/build/styles/default.min.css"
  />
  <script
    defer
    src="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.8.0/build/highlight.min.js"
  ></script>
  <script
    defer
    src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"
  ></script>
  <script
    defer
    src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"
  ></script>
  <script
    defer
    src="https://cdn.jsdelivr.net/npm/dompurify@3.0.6/dist/purify.min.js"
  ></script>

  <link href="/static/general.css" rel="stylesheet" />
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Roboto:wght@400;500&display=swap" rel="stylesheet">
  <link
    href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700,900&display=swap"
    rel="stylesheet" />
  <link
    rel="stylesheet"
    href="https://cdn.jsdelivr.net/npm/tw-elements/css/tw-elements.min.css" />
  <script src="https://cdn.tailwindcss.com/3.3.0"></script>
  <script>
    tailwind.config = {
      darkMode: "class",
      theme: {
        fontFamily: {
          sans: ["Roboto", "sans-serif"],
          body: ["Roboto", "sans-serif"],
          mono: ["ui-monospace", "monospace"],
        },
      },
      corePlugins: {
        preflight: false,
      },
    };
  </script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.1.1/css/all.min.css">
  <script src="https://unpkg.com/htmx.org@1.9.12" integrity="sha384-ujb1lZYygJmzgSwoxRggbCHcjc0rB2XoQrxeTUQyRjrOnlCoYta87iKBWq3EsdM2" crossorigin="anonymous"></script>
</head>

<body class="bg-gray-900 text-gray-200">
<div class="flex flex-col min-h-screen">

  <nav class="bg-gray-800 shadow-lg">
    <div class="container mx-auto px-4 py-4">
      <div class="flex items-center justify-between">
        <div class="flex items-center">
          <a href="/" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
          <a href="/" class="text-white text-xl font-bold">LocalAI</a>
        </div>
        <!-- Menu button for small screens -->
        <div class="lg:hidden">
          <button id="menu-toggle" class="text-gray-400 hover:text-white focus:outline-none">
            <i class="fas fa-bars fa-lg"></i>
          </button>
        </div>
        <!-- Navigation links -->
        <div class="hidden lg:flex lg:items-center lg:justify-end lg:flex-1 lg:w-0">
          <a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
        </div>
      </div>
      <!-- Collapsible menu for small screens -->
      <div class="hidden lg:hidden" id="mobile-menu">
        <div class="pt-4 pb-3 border-t border-gray-700">
          <a href="https://localai.io" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
        </div>
      </div>
    </div>
  </nav>

  <style>
    .is-hidden {
      display: none;
    }
  </style>

  <div class="container mx-auto px-4 flex-grow">

    <div class="models mt-12">
      <h2 class="text-center text-3xl font-semibold text-gray-100">
        LocalAI model gallery list </h2><br>

      <h2 class="text-center text-3xl font-semibold text-gray-100">
        🖼️ Available {{.AvailableModels}} models</i> <a href="https://localai.io/models/" target="_blank" >
          <i class="fas fa-circle-info pr-2"></i>
        </a></h2>

      <h3>
        Refer to the Model gallery <a href="https://localai.io/models/" target="_blank" ><i class="fas fa-circle-info pr-2"></i></a> for more information on how to use the models with LocalAI.<br>
        You can install models with the CLI command <code>local-ai models install <model-name></code>. or by using the WebUI.
      </h3>

      <input class="form-control appearance-none block w-full mt-5 px-3 py-2 text-base font-normal text-gray-300 pb-2 mb-5 bg-gray-800 bg-clip-padding border border-solid border-gray-600 rounded transition ease-in-out m-0 focus:text-gray-300 focus:bg-gray-900 focus:border-blue-500 focus:outline-none" type="search"
        id="searchbox" placeholder="Live search keyword..">
      <div class="dark grid grid-cols-1 grid-rows-1 md:grid-cols-3 block rounded-lg shadow-secondary-1 dark:bg-surface-dark">
        {{ range $_, $model := .Models }}
        <div class="box me-4 mb-2 block rounded-lg bg-white shadow-secondary-1 dark:bg-gray-800 dark:bg-surface-dark dark:text-white text-surface pb-2">
          <div>
            {{ $icon := "https://upload.wikimedia.org/wikipedia/commons/6/65/No-Image-Placeholder.svg" }}
            {{ if $model.Icon }}
            {{ $icon = $model.Icon }}
            {{ end }}
            <div class="flex justify-center items-center">
              <img data-src="{{ $icon }}" alt="{{$model.Name}}" class="rounded-t-lg max-h-48 max-w-96 object-cover mt-3 lazy">
            </div>
            <div class="p-6 text-surface dark:text-white">
              <h5 class="mb-2 text-xl font-medium leading-tight">{{$model.Name}}</h5>
              <p class="mb-4 text-base truncate">{{ $model.Description }}</p>
            </div>
            <div class="px-6 pt-4 pb-2">

              <!-- Modal toggle -->
              <button data-modal-target="{{ $model.Name}}-modal" data-modal-toggle="{{ $model.Name }}-modal" class="block text-white bg-blue-700 hover:bg-blue-800 focus:ring-4 focus:outline-none focus:ring-blue-300 font-medium rounded-lg text-sm px-5 py-2.5 text-center dark:bg-blue-600 dark:hover:bg-blue-700 dark:focus:ring-blue-800" type="button">
                More info
              </button>

              <!-- Main modal -->
              <div id="{{ $model.Name}}-modal" tabindex="-1" aria-hidden="true" class="hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full">
                <div class="relative p-4 w-full max-w-2xl max-h-full">
                  <!-- Modal content -->
                  <div class="relative bg-white rounded-lg shadow dark:bg-gray-700">
                    <!-- Modal header -->
                    <div class="flex items-center justify-between p-4 md:p-5 border-b rounded-t dark:border-gray-600">
                      <h3 class="text-xl font-semibold text-gray-900 dark:text-white">
                        {{ $model.Name}}
                      </h3>
                      <button type="button" class="text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white" data-modal-hide="{{$model.Name}}-modal">
                        <svg class="w-3 h-3" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 14 14">
                          <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="m1 1 6 6m0 0 6 6M7 7l6-6M7 7l-6 6"/>
                        </svg>
                        <span class="sr-only">Close modal</span>
                      </button>
                    </div>
                    <!-- Modal body -->
                    <div class="p-4 md:p-5 space-y-4">
                      <div class="flex justify-center items-center">
                        <img data-src="{{ $icon }}" alt="{{$model.Name}}" class="lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3">
                      </div>

                      <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
                        {{ $model.Description }}
                      </p>

                      <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
                        To install the model with the CLI, run: <br>
                        <code> local-ai models install {{$model.Name}} </code> <br>
                        <hr>
                        See also <a href="https://localai.io/models/" target="_blank" >
                          Installation <i class="fas fa-circle-info pr-2"></i>
                        </a> to see how to install models with the REST API.
                      </p>

                      <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
                        <ul>
                          {{ range $_, $u := $model.URLs }}
                          <li><a href="{{ $u }}" target=_blank><i class="fa-solid fa-link"></i> {{ $u }}</a></li>
                          {{ end }}
                        </ul>
                      </p>
                    </div>
                    <!-- Modal footer -->
                    <div class="flex items-center p-4 md:p-5 border-t border-gray-200 rounded-b dark:border-gray-600">
                      <button data-modal-hide="{{ $model.Name}}-modal" type="button" class="py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700">Close</button>
                    </div>
                  </div>
                </div>
              </div>

            </div>
          </div>
        </div>
        {{ end }}

      </div>
    </div>
  </div>

  <script>
    var lazyLoadInstance = new LazyLoad({
      // Your custom settings go here
    });

    let cards = document.querySelectorAll('.box')

    function liveSearch() {
      let search_query = document.getElementById("searchbox").value;

      //Use innerText if all contents are visible
      //Use textContent for including hidden elements
      for (var i = 0; i < cards.length; i++) {
        if(cards[i].textContent.toLowerCase()
          .includes(search_query.toLowerCase())) {
          cards[i].classList.remove("is-hidden");
        } else {
          cards[i].classList.add("is-hidden");
        }
      }
    }

    //A little delay
    let typingTimer;
    let typeInterval = 500;
    let searchInput = document.getElementById('searchbox');

    searchInput.addEventListener('keyup', () => {
      clearTimeout(typingTimer);
      typingTimer = setTimeout(liveSearch, typeInterval);
    });
  </script>

</div>

<script src="https://cdnjs.cloudflare.com/ajax/libs/flowbite/2.3.0/flowbite.min.js"></script>
</body>
</html>
`

type GalleryModel struct {
	Name        string   `json:"name" yaml:"name"`
	URLs        []string `json:"urls" yaml:"urls"`
	Icon        string   `json:"icon" yaml:"icon"`
	Description string   `json:"description" yaml:"description"`
}

func main() {
	// read the YAML file which contains the models
	f, err := ioutil.ReadFile(os.Args[1])
	if err != nil {
		fmt.Println("Error reading file:", err)
		return
	}

	models := []*GalleryModel{}
	err = yaml.Unmarshal(f, &models)
	if err != nil {
		// write to stderr
		os.Stderr.WriteString("Error unmarshaling YAML: " + err.Error() + "\n")
		return
	}

	// render the template
	data := struct {
		Models          []*GalleryModel
		AvailableModels int
	}{
		Models:          models,
		AvailableModels: len(models),
	}
	tmpl := template.Must(template.New("modelPage").Parse(modelPageTemplate))

	err = tmpl.Execute(os.Stdout, data)
	if err != nil {
		fmt.Println("Error executing template:", err)
		return
	}
}
```
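The generator takes the path of a gallery YAML file as its only argument and writes the rendered HTML page to stdout; presumably CI invokes it along the lines of `go run .github/ci/modelslist.go gallery/index.yaml > docs/static/gallery.html`. The input path here is an assumption, while the output path matches the `docs/static/gallery.html` entry added to `.gitignore` further down.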
.github/workflows/generate_grpc_cache.yaml (vendored, 4 changes)

```diff
@@ -17,7 +17,7 @@ jobs:
         include:
           - grpc-base-image: ubuntu:22.04
             runs-on: 'ubuntu-latest'
-            platforms: 'linux/amd64'
+            platforms: 'linux/amd64,linux/arm64'
     runs-on: ${{matrix.runs-on}}
     steps:
       - name: Release space from worker
@@ -84,7 +84,7 @@ jobs:
           build-args: |
             GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
             GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
-            GRPC_VERSION=v1.63.0
+            GRPC_VERSION=v1.64.0
           context: .
           file: ./Dockerfile
           cache-to: type=gha,ignore-error=true
```
.github/workflows/generate_intel_image.yaml (vendored, new file, 59 lines)

@@ -0,0 +1,59 @@
```yaml
name: 'generate and publish intel docker caches'

on:
  workflow_dispatch:
  push:
    branches:
      - master

concurrency:
  group: intel-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
  cancel-in-progress: true

jobs:
  generate_caches:
    strategy:
      matrix:
        include:
          - base-image: intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64'
    runs-on: ${{matrix.runs-on}}
    steps:
      - name: Set up QEMU
        uses: docker/setup-qemu-action@master
        with:
          platforms: all
      - name: Login to DockerHub
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}

      - name: Login to quay
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
      - name: Set up Docker Buildx
        id: buildx
        uses: docker/setup-buildx-action@master

      - name: Checkout
        uses: actions/checkout@v4

      - name: Cache Intel images
        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
          build-args: |
            BASE_IMAGE=${{ matrix.base-image }}
          context: .
          file: ./Dockerfile
          tags: quay.io/go-skynet/intel-oneapi-base:latest
          push: true
          target: intel
          platforms: ${{ matrix.platforms }}
```
.github/workflows/image-pr.yml (vendored, 4 changes)

```diff
@@ -68,7 +68,7 @@ jobs:
           - build-type: 'sycl_f16'
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
             tag-suffix: 'sycl-f16-ffmpeg'
             ffmpeg: 'true'
@@ -110,7 +110,7 @@ jobs:
           - build-type: 'sycl_f16'
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
             tag-suffix: 'sycl-f16-ffmpeg-core'
             ffmpeg: 'true'
```
.github/workflows/image.yml (vendored, 14 changes)

```diff
@@ -148,7 +148,7 @@ jobs:
           - build-type: 'sycl_f16'
             platforms: 'linux/amd64'
             tag-latest: 'auto'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
             tag-suffix: '-sycl-f16-ffmpeg'
             ffmpeg: 'true'
@@ -161,7 +161,7 @@ jobs:
           - build-type: 'sycl_f32'
             platforms: 'linux/amd64'
             tag-latest: 'auto'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
             tag-suffix: '-sycl-f32-ffmpeg'
             ffmpeg: 'true'
@@ -175,7 +175,7 @@ jobs:
           - build-type: 'sycl_f16'
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
             tag-suffix: '-sycl-f16-core'
             ffmpeg: 'false'
@@ -185,7 +185,7 @@ jobs:
           - build-type: 'sycl_f32'
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
             tag-suffix: '-sycl-f32-core'
             ffmpeg: 'false'
@@ -195,7 +195,7 @@ jobs:
           - build-type: 'sycl_f16'
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
             tag-suffix: '-sycl-f16-ffmpeg-core'
             ffmpeg: 'true'
@@ -205,7 +205,7 @@ jobs:
           - build-type: 'sycl_f32'
             platforms: 'linux/amd64'
             tag-latest: 'false'
-            base-image: "intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04"
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
             grpc-base-image: "ubuntu:22.04"
             tag-suffix: '-sycl-f32-ffmpeg-core'
             ffmpeg: 'true'
@@ -260,7 +260,7 @@ jobs:
       matrix:
         include:
           - build-type: ''
-            platforms: 'linux/amd64'
+            platforms: 'linux/amd64,linux/arm64'
             tag-latest: 'auto'
             tag-suffix: '-ffmpeg-core'
             ffmpeg: 'true'
```
.github/workflows/image_build.yml (vendored, 54 changes)

```diff
@@ -136,6 +136,7 @@ jobs:
 
       - name: Docker meta
         id: meta
+        if: github.event_name != 'pull_request'
         uses: docker/metadata-action@v5
         with:
           images: |
@@ -148,7 +149,20 @@ jobs:
           flavor: |
             latest=${{ inputs.tag-latest }}
             suffix=${{ inputs.tag-suffix }}
+
+      - name: Docker meta for PR
+        id: meta_pull_request
+        if: github.event_name == 'pull_request'
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            ttl.sh/localai-ci-pr-${{ github.event.number }}
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.tag-suffix }}
       - name: Docker meta AIO (quay.io)
         if: inputs.aio != ''
         id: meta_aio
@@ -174,7 +188,6 @@ jobs:
             type=ref,event=branch
             type=semver,pattern={{raw}}
           flavor: |
             latest=${{ inputs.tag-latest }}
             suffix=${{ inputs.aio }}
-
       - name: Set up QEMU
@@ -203,6 +216,7 @@ jobs:
 
       - name: Build and push
         uses: docker/build-push-action@v5
+        if: github.event_name != 'pull_request'
         with:
           builder: ${{ steps.buildx.outputs.name }}
           # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
@@ -218,7 +232,7 @@ jobs:
             BASE_IMAGE=${{ inputs.base-image }}
             GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
             GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
-            GRPC_VERSION=v1.63.0
+            GRPC_VERSION=v1.64.0
             MAKEFLAGS=${{ inputs.makeflags }}
           context: .
           file: ./Dockerfile
@@ -227,7 +241,39 @@ jobs:
           push: ${{ github.event_name != 'pull_request' }}
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
+
+### Start testing image
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        if: github.event_name == 'pull_request'
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          # This is why some build args like GRPC_VERSION and MAKEFLAGS are hardcoded
+          build-args: |
+            BUILD_TYPE=${{ inputs.build-type }}
+            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
+            FFMPEG=${{ inputs.ffmpeg }}
+            IMAGE_TYPE=${{ inputs.image-type }}
+            BASE_IMAGE=${{ inputs.base-image }}
+            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.64.0
+            MAKEFLAGS=${{ inputs.makeflags }}
+          context: .
+          file: ./Dockerfile
+          cache-from: type=gha
+          platforms: ${{ inputs.platforms }}
+          push: true
+          tags: ${{ steps.meta_pull_request.outputs.tags }}
+          labels: ${{ steps.meta_pull_request.outputs.labels }}
+      - name: Testing image
+        if: github.event_name == 'pull_request'
+        run: |
+          echo "Image is available at ttl.sh/localai-ci-pr-${{ github.event.number }}:${{ steps.meta_pull_request.outputs.version }}" >> $GITHUB_STEP_SUMMARY
+## End testing image
       - name: Build and push AIO image
         if: inputs.aio != ''
         uses: docker/build-push-action@v5
```
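A note on the testing flow added above: PR builds are pushed to ttl.sh, an anonymous, ephemeral registry where images expire automatically, so reviewers can pull the exact image built from a PR without any registry credentials, e.g. `docker pull ttl.sh/localai-ci-pr-<pr-number>:<version>`. The placeholders here stand in for the `github.event.number` and metadata-action version values echoed into the step summary.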
.github/workflows/release.yaml (vendored, 204 changes)

```diff
@@ -5,7 +5,7 @@ on:
     - pull_request
 
 env:
-  GRPC_VERSION: v1.63.0
+  GRPC_VERSION: v1.64.0
 
 permissions:
   contents: write
@@ -15,7 +15,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  build-linux:
+
+  build-linux-arm:
     runs-on: ubuntu-latest
     steps:
       - name: Clone
@@ -26,10 +27,133 @@ jobs:
         with:
           go-version: '1.21.x'
           cache: false
+
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
+          sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
+      - name: Install CUDA Dependencies
+        run: |
+          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-cross-aarch64 cuda-nvcc-cross-aarch64-${CUDA_VERSION} libcublas-cross-aarch64-${CUDA_VERSION}
+        env:
+          CUDA_VERSION: 12-4
+      - name: Cache grpc
+        id: cache-grpc
+        uses: actions/cache@v4
+        with:
+          path: grpc
+          key: ${{ runner.os }}-arm-grpc-${{ env.GRPC_VERSION }}
+      - name: Build grpc
+        if: steps.cache-grpc.outputs.cache-hit != 'true'
+        run: |
+          git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make --jobs 5 --output-sync=target
+      - name: Install gRPC
+        run: |
+          GNU_HOST=aarch64-linux-gnu
+          C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
+          CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
+
+          CROSS_TOOLCHAIN=/usr/$GNU_HOST
+          CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
+          CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
+
+          # https://cmake.org/cmake/help/v3.13/manual/cmake-toolchains.7.html#cross-compiling-for-linux
+          echo "set(CMAKE_SYSTEM_NAME Linux)" >> $CMAKE_CROSS_TOOLCHAIN && \
+          echo "set(CMAKE_SYSTEM_PROCESSOR arm)" >> $CMAKE_CROSS_TOOLCHAIN && \
+          echo "set(CMAKE_STAGING_PREFIX $CROSS_STAGING_PREFIX)" >> $CMAKE_CROSS_TOOLCHAIN && \
+          echo "set(CMAKE_SYSROOT ${CROSS_TOOLCHAIN}/sysroot)" >> $CMAKE_CROSS_TOOLCHAIN && \
+          echo "set(CMAKE_C_COMPILER /usr/bin/$C_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
+          echo "set(CMAKE_CXX_COMPILER /usr/bin/$CXX_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
+          echo "set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)" >> $CMAKE_CROSS_TOOLCHAIN && \
+          echo "set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
+          echo "set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
+          echo "set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN
+          GRPC_DIR=$PWD/grpc
+          cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install && \
+          GRPC_CROSS_BUILD_DIR=$GRPC_DIR/cmake/cross_build && \
+          mkdir -p $GRPC_CROSS_BUILD_DIR && \
+          cd $GRPC_CROSS_BUILD_DIR && \
+          cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_INSTALL_PREFIX=$CROSS_TOOLCHAIN/grpc_install \
+            ../.. && \
+          sudo make -j`nproc` install
+      - name: Build
+        id: build
+        run: |
+          GNU_HOST=aarch64-linux-gnu
+          C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
+          CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
+
+          CROSS_TOOLCHAIN=/usr/$GNU_HOST
+          CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
+          CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+          export PATH=$PATH:$GOPATH/bin
+          export PATH=/usr/local/cuda/bin:$PATH
+          sudo rm -rf /usr/aarch64-linux-gnu/lib/libstdc++.so.6
+          sudo cp -rf /usr/aarch64-linux-gnu/lib/libstdc++.so* /usr/aarch64-linux-gnu/lib/libstdc++.so.6
+          sudo cp /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 ld.so
+          GO_TAGS=p2p \
+          BACKEND_LIBS="./grpc/cmake/cross_build/third_party/re2/libre2.a ./grpc/cmake/cross_build/libgrpc.a ./grpc/cmake/cross_build/libgrpc++.a ./grpc/cmake/cross_build/third_party/protobuf/libprotobuf.a /usr/aarch64-linux-gnu/lib/libc.so.6 /usr/aarch64-linux-gnu/lib/libstdc++.so.6 /usr/aarch64-linux-gnu/lib/libgomp.so.1 /usr/aarch64-linux-gnu/lib/libm.so.6 /usr/aarch64-linux-gnu/lib/libgcc_s.so.1 /usr/aarch64-linux-gnu/lib/libdl.so.2 /usr/aarch64-linux-gnu/lib/libpthread.so.0 ./ld.so" \
+          GOOS=linux \
+          GOARCH=arm64 \
+          CMAKE_ARGS="-DProtobuf_INCLUDE_DIRS=$CROSS_STAGING_PREFIX/include -DProtobuf_DIR=$CROSS_STAGING_PREFIX/lib/cmake/protobuf -DgRPC_DIR=$CROSS_STAGING_PREFIX/lib/cmake/grpc -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++" make dist-cross-linux-arm64
+      - uses: actions/upload-artifact@v4
+        with:
+          name: LocalAI-linux-arm64
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
+
+  build-linux:
+    runs-on: arc-runner-set
+    steps:
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+            && sudo apt-get install -y software-properties-common \
+            && sudo apt-get update \
+            && sudo add-apt-repository -y ppa:git-core/ppa \
+            && sudo apt-get update \
+            && sudo apt-get install -y git
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
       - name: Dependencies
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential ffmpeg protobuf-compiler
+          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache cmake
+      - name: Intel Dependencies
+        run: |
+          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt update
+          sudo apt install -y intel-basekit
       - name: Install CUDA Dependencies
         run: |
           curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -38,6 +162,31 @@ jobs:
           sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
         env:
           CUDA_VERSION: 12-3
+      - name: "Install Hipblas"
+        env:
+          ROCM_VERSION: "6.1"
+          AMDGPU_VERSION: "6.1"
+        run: |
+          set -ex
+
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg
+
+          curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
+
+          printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | sudo tee /etc/apt/sources.list.d/rocm.list
+
+          printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
+          printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
+          sudo apt-get update
+
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \
+            hipblas-dev rocm-dev \
+            rocblas-dev
+
+          sudo apt-get clean
+          sudo rm -rf /var/lib/apt/lists/*
+          sudo ldconfig
       - name: Cache grpc
         id: cache-grpc
         uses: actions/cache@v4
@@ -54,14 +203,20 @@ jobs:
       - name: Install gRPC
         run: |
           cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
+      # BACKEND_LIBS needed for gpu-workload: /opt/intel/oneapi/*/lib/libiomp5.so /opt/intel/oneapi/*/lib/libmkl_core.so /opt/intel/oneapi/*/lib/libmkl_core.so.2 /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so.2 /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so.4 /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so.2 /opt/intel/oneapi/*/lib/libsycl.so /opt/intel/oneapi/*/lib/libsycl.so.7 /opt/intel/oneapi/*/lib/libsycl.so.7.1.0 /opt/rocm-*/lib/libamdhip64.so /opt/rocm-*/lib/libamdhip64.so.5 /opt/rocm-*/lib/libamdhip64.so.6 /opt/rocm-*/lib/libamdhip64.so.6.1.60100 /opt/rocm-*/lib/libhipblas.so /opt/rocm-*/lib/libhipblas.so.2 /opt/rocm-*/lib/libhipblas.so.2.1.60100 /opt/rocm-*/lib/librocblas.so /opt/rocm-*/lib/librocblas.so.4 /opt/rocm-*/lib/librocblas.so.4.1.60100 /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 /usr/lib/x86_64-linux-gnu/libc.so.6 /usr/lib/x86_64-linux-gnu/librt.so.1 /usr/local/cuda-*/targets/x86_64-linux/lib/libcublas.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcublasLt.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcudart.so /usr/local/cuda-*/targets/x86_64-linux/lib/stubs/libcuda.so
       - name: Build
         id: build
         run: |
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
           export PATH=$PATH:$GOPATH/bin
           export PATH=/usr/local/cuda/bin:$PATH
-          make dist
+          export PATH=/opt/rocm/bin:$PATH
+          source /opt/intel/oneapi/setvars.sh
+          sudo cp /lib64/ld-linux-x86-64.so.2 ld.so
+          GO_TAGS=p2p \
+          BACKEND_LIBS="./ld.so /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 /usr/lib/x86_64-linux-gnu/libc.so.6 /usr/lib/x86_64-linux-gnu/libgomp.so.1" \
+          make -j4 dist
       - uses: actions/upload-artifact@v4
         with:
           name: LocalAI-linux
@@ -72,7 +227,13 @@ jobs:
         with:
           files: |
             release/*
-
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
   build-stablediffusion:
     runs-on: ubuntu-latest
     steps:
@@ -86,18 +247,27 @@ jobs:
           cache: false
       - name: Dependencies
         run: |
-          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
       - name: Build stablediffusion
         run: |
           export PATH=$PATH:$GOPATH/bin
           make backend-assets/grpc/stablediffusion
           mkdir -p release && cp backend-assets/grpc/stablediffusion release
+        env:
+          GO_TAGS: stablediffusion
       - uses: actions/upload-artifact@v4
         with:
           name: stablediffusion
           path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
 
   build-macOS-arm64:
     runs-on: macos-14
@@ -113,15 +283,16 @@ jobs:
       - name: Dependencies
         run: |
           brew install protobuf grpc
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
       - name: Build
         id: build
         run: |
           export C_INCLUDE_PATH=/usr/local/include
           export CPLUS_INCLUDE_PATH=/usr/local/include
           export PATH=$PATH:$GOPATH/bin
-          make dist
+
+          BACKEND_LIBS="$(ls /opt/homebrew/opt/grpc/lib/*.dylib /opt/homebrew/opt/re2/lib/*.dylib /opt/homebrew/opt/openssl@3/lib/*.dylib /opt/homebrew/opt/protobuf/lib/*.dylib /opt/homebrew/opt/abseil/lib/*.dylib | xargs)" GO_TAGS=p2p make dist
       - uses: actions/upload-artifact@v4
         with:
           name: LocalAI-MacOS-arm64
@@ -132,3 +303,10 @@ jobs:
         with:
           files: |
             release/*
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
```
.github/workflows/test-extra.yml (vendored, 44 changes)

```diff
@@ -29,7 +29,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           sudo apt-get install -y ca-certificates cmake curl patch python3-pip
           sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
 
       - name: Test transformers
         run: |
@@ -51,7 +51,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           sudo apt-get install -y ca-certificates cmake curl patch python3-pip
           sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
 
       - name: Test sentencetransformers
         run: |
@@ -74,7 +74,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           sudo apt-get install -y ca-certificates cmake curl patch python3-pip
           sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
 
       - name: Test rerankers
         run: |
@@ -96,7 +96,7 @@ jobs:
           sudo apt-get install -y libopencv-dev
           # Install UV
           curl -LsSf https://astral.sh/uv/install.sh | sh
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
       - name: Test diffusers
         run: |
           make --jobs=5 --output-sync=target -C backend/python/diffusers
@@ -117,12 +117,34 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           sudo apt-get install -y ca-certificates cmake curl patch python3-pip
           sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
 
       - name: Test parler-tts
         run: |
           make --jobs=5 --output-sync=target -C backend/python/parler-tts
           make --jobs=5 --output-sync=target -C backend/python/parler-tts test
 
+  tests-openvoice:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.64.0
+
+      - name: Test openvoice
+        run: |
+          make --jobs=5 --output-sync=target -C backend/python/openvoice
+          make --jobs=5 --output-sync=target -C backend/python/openvoice test
+
   tests-transformers-musicgen:
     runs-on: ubuntu-latest
@@ -139,7 +161,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           sudo apt-get install -y ca-certificates cmake curl patch python3-pip
           sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
 
       - name: Test transformers-musicgen
         run: |
@@ -163,7 +185,7 @@ jobs:
     #      curl -LsSf https://astral.sh/uv/install.sh | sh
     #      sudo apt-get install -y ca-certificates cmake curl patch python3-pip
     #      sudo apt-get install -y libopencv-dev
-    #      pip install --user grpcio-tools==1.63.0
+    #      pip install --user grpcio-tools==1.64.0
 
     # - name: Test petals
     #   run: |
@@ -227,7 +249,7 @@ jobs:
     #      curl -LsSf https://astral.sh/uv/install.sh | sh
     #      sudo apt-get install -y ca-certificates cmake curl patch python3-pip
     #      sudo apt-get install -y libopencv-dev
-    #      pip install --user grpcio-tools==1.63.0
+    #      pip install --user grpcio-tools==1.64.0
 
     # - name: Test bark
     #   run: |
@@ -252,7 +274,7 @@ jobs:
     #      curl -LsSf https://astral.sh/uv/install.sh | sh
     #      sudo apt-get install -y ca-certificates cmake curl patch python3-pip
     #      sudo apt-get install -y libopencv-dev
-    #      pip install --user grpcio-tools==1.63.0
+    #      pip install --user grpcio-tools==1.64.0
     # - name: Test vllm
     #   run: |
     #     make --jobs=5 --output-sync=target -C backend/python/vllm
@@ -272,7 +294,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           sudo apt-get install -y ca-certificates cmake curl patch python3-pip
           sudo apt-get install -y libopencv-dev
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
       - name: Test vall-e-x
         run: |
           make --jobs=5 --output-sync=target -C backend/python/vall-e-x
@@ -292,7 +314,7 @@ jobs:
           sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
           # Install UV
           curl -LsSf https://astral.sh/uv/install.sh | sh
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
       - name: Test coqui
         run: |
           make --jobs=5 --output-sync=target -C backend/python/coqui
```
.github/workflows/test.yml (vendored, 10 changes)

```diff
@@ -10,7 +10,7 @@ on:
       - '*'
 
 env:
-  GRPC_VERSION: v1.63.0
+  GRPC_VERSION: v1.64.0
 
 concurrency:
   group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -93,8 +93,8 @@ jobs:
           sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
           export CUDACXX=/usr/local/cuda/bin/nvcc
 
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
 
           # The python3-grpc-tools package in 22.04 is too old
           pip install --user grpcio-tools
@@ -178,7 +178,7 @@ jobs:
           submodules: true
       - name: Build images
         run: |
-          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=core --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
+          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
           BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
       - name: Test
         run: |
@@ -213,7 +213,7 @@ jobs:
       - name: Dependencies
         run: |
           brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
-          pip install --user grpcio-tools==1.63.0
+          pip install --user grpcio-tools==1.64.0
       - name: Test
         run: |
           export C_INCLUDE_PATH=/usr/local/include
```
.gitignore (vendored, 6 changes)

```diff
@@ -6,6 +6,9 @@ get-sources
 prepare-sources
 /backend/cpp/llama/grpc-server
 /backend/cpp/llama/llama.cpp
+/backend/cpp/llama-*
+
+*.log
 
 go-ggml-transformers
 go-gpt2
@@ -39,6 +42,7 @@ backend-assets/*
 !backend-assets/.keep
 prepare
 /ggml-metal.metal
+docs/static/gallery.html
 
 # Protobuf generated files
 *.pb.go
@@ -49,4 +53,4 @@ prepare
 .scannerwork
 
 # backend virtual environments
-**/venv
+**/venv
```
93
Dockerfile
93
Dockerfile
@@ -1,45 +1,40 @@
|
||||
ARG IMAGE_TYPE=extras
|
||||
ARG BASE_IMAGE=ubuntu:22.04
|
||||
ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
|
||||
ARG INTEL_BASE_IMAGE=${BASE_IMAGE}
|
||||
|
||||
# The requirements-core target is common to all images. It should not be placed in requirements-core unless every single build will use it.
|
||||
FROM ${BASE_IMAGE} AS requirements-core
|
||||
|
||||
USER root
|
||||
|
||||
ARG GO_VERSION=1.21.7
|
||||
ARG GO_VERSION=1.22.4
|
||||
ARG TARGETARCH
|
||||
ARG TARGETVARIANT
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
|
||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
|
||||
|
||||
ARG GO_TAGS="stablediffusion tinydream tts"
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
ccache \
|
||||
ca-certificates \
|
||||
cmake \
|
||||
curl \
|
||||
git \
|
||||
python3-pip \
|
||||
python-is-python3 \
|
||||
unzip && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
pip install --upgrade pip
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Go
|
||||
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
|
||||
ENV PATH $PATH:/root/go/bin:/usr/local/go/bin
|
||||
|
||||
# Install grpc compilers
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest

# Install grpcio-tools (the version in 22.04 is too old)
RUN pip install --user grpcio-tools
RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.1 && \
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af

COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
RUN update-ca-certificates
@@ -84,10 +79,16 @@ RUN apt-get update && \
apt-get install -y --no-install-recommends \
espeak-ng \
espeak \
python3-pip \
python-is-python3 \
python3-dev \
python3-venv && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
rm -rf /var/lib/apt/lists/* && \
pip install --upgrade pip

# Install grpcio-tools (the version in 22.04 is too old)
RUN pip install --user grpcio-tools

###################################
###################################
@@ -98,21 +99,48 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers

ARG BUILD_TYPE
ARG CUDA_MAJOR_VERSION=11
ARG CUDA_MINOR_VERSION=7
ARG CUDA_MINOR_VERSION=8

ENV BUILD_TYPE=${BUILD_TYPE}

# CuBLAS requirements
RUN <<EOT bash
if [ "${BUILD_TYPE}" = "cublas" ]; then
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common pciutils
if [ "amd64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
fi
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT

RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common && \
software-properties-common pciutils && \
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
dpkg -i cuda-keyring_1.1-1_all.deb && \
rm -f cuda-keyring_1.1-1_all.deb && \
apt-get update && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
@@ -145,13 +173,24 @@ RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
###################################
###################################

# Temporary workaround for Intel's repository to work correctly
# https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/APT-Repository-not-working-signatures-invalid/m-p/1599436/highlight/true#M36143
# This is a temporary workaround until Intel fixes their repository
FROM ${INTEL_BASE_IMAGE} AS intel
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" > /etc/apt/sources.list.d/intel-graphics.list

###################################
###################################

# The grpc target does one thing, it builds and installs GRPC. This is in its own layer so that it can be effectively cached by CI.
# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
FROM ${GRPC_BASE_IMAGE} AS grpc

# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
ARG GRPC_VERSION=v1.58.0
ARG GRPC_VERSION=v1.64.2

ENV MAKEFLAGS=${GRPC_MAKEFLAGS}

@@ -184,7 +223,7 @@ RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shall
# Adjustments to the build process should likely be made here.
FROM requirements-drivers AS builder

ARG GO_TAGS="stablediffusion tts"
ARG GO_TAGS="stablediffusion tts p2p"
ARG GRPC_BACKENDS
ARG MAKEFLAGS

@@ -206,9 +245,18 @@ RUN make prepare
# We need protoc installed, and the version in 22.04 is too old. We will create one as part of installing the GRPC build below,
# but that will also bring in a newer version of absl, which stablediffusion cannot compile with. This version of protoc is only
# here so that we can generate the grpc code for the stablediffusion build
RUN curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
RUN <<EOT bash
if [ "amd64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
if [ "arm64" = "$TARGETARCH" ]; then
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
fi
EOT

# stablediffusion does not tolerate a newer version of abseil, build it first
RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
@@ -305,6 +353,9 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vall-e-x \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/openvoice \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "petals" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/petals \
; fi && \
@@ -340,7 +391,7 @@ RUN mkdir -p /build/models
# Define the health check command
HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1


VOLUME /build/models
EXPOSE 8080
ENTRYPOINT [ "/build/entrypoint.sh" ]

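The health check polls whatever URL `HEALTHCHECK_ENDPOINT` points at; the default is defined earlier in the Dockerfile. A minimal sketch of overriding it at run time — the endpoint value here is illustrative, not the documented default:

```bash
# probe a custom endpoint instead of the built-in default (illustrative value)
docker run -ti --name local-ai -p 8080:8080 \
  -e HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz \
  localai/localai:latest
```
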
Makefile (122 changed lines)
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

# llama.cpp versions
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=dc685be46622a8fabfd57cfa804237c8f15679b8
CPPLLAMA_VERSION?=37bef8943312d91183ff06d8f1214082a17344a5

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,10 +16,10 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

# whisper.cpp version
WHISPER_CPP_VERSION?=4ef8d9f44eb402c528ab6d990ab50a9f4f666347
WHISPER_CPP_VERSION?=b29b3b29240aac8b71ce8e5a4360c1f1562ad66f

# bert.cpp version
BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4

# go-piper version
PIPER_VERSION?=9d0100873a7dbb0824dfea40e8cec70a1b110759
@@ -100,7 +100,7 @@ ifeq ($(BUILD_TYPE),cublas)
CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
export LLAMA_CUBLAS=1
export WHISPER_CUDA=1
CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
endif

ifeq ($(BUILD_TYPE),hipblas)
@@ -112,7 +112,7 @@ ifeq ($(BUILD_TYPE),hipblas)
# llama-ggml has no hipblas support, so override it here.
export STABLE_BUILD_TYPE=
export WHISPER_HIPBLAS=1
GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
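Both GPU_TARGETS and AMDGPU_TARGETS use `?=`, so the widened default list (now covering CDNA and RDNA3 parts) can be narrowed at build time to cut compile time. A hedged sketch — the single target name is an assumption for an RX 7900-class GPU:

```bash
# hipblas build restricted to one architecture (gfx1100 is illustrative)
make BUILD_TYPE=hipblas GPU_TARGETS=gfx1100 build
```
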
@@ -158,6 +158,8 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -311,28 +313,53 @@ build: prepare backend-assets grpcs ## Build the project
$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
ifneq ($(BACKEND_LIBS),)
$(MAKE) backend-assets/lib
cp $(BACKEND_LIBS) backend-assets/lib/
endif
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build

build-api:
BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build

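build-minimal now pins the llama-cpp-avx2 variant explicitly instead of the generic llama-cpp backend. For reference, the reduced entry points are invoked as plain make targets:

```bash
# llama-cpp-avx2 backend only, no extra Go tags
make build-minimal

# API server only, no bundled gRPC backends
make build-api
```
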
backend-assets/lib:
mkdir -p backend-assets/lib

dist:
STATIC=true $(MAKE) backend-assets/grpc/llama-cpp-avx2
$(MAKE) backend-assets/grpc/llama-cpp-avx2
ifeq ($(OS),Darwin)
$(info ${GREEN}I Skip CUDA build on MacOS${RESET})
$(info ${GREEN}I Skip CUDA/hipblas build on MacOS${RESET})
else
$(MAKE) backend-assets/grpc/llama-cpp-cuda
$(MAKE) backend-assets/grpc/llama-cpp-hipblas
$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
endif
$(MAKE) build
STATIC=true $(MAKE) build
mkdir -p release
# if BUILD_ID is empty, then we don't append it to the binary name
ifeq ($(BUILD_ID),)
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(OS)-$(ARCH)
shasum -a 256 release/$(BINARY_NAME)-$(OS)-$(ARCH) > release/$(BINARY_NAME)-$(OS)-$(ARCH).sha256
else
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)
shasum -a 256 release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH) > release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH).sha256
endif

dist-cross-linux-arm64:
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
STATIC=true $(MAKE) build
mkdir -p release
# if BUILD_ID is empty, then we don't append it to the binary name
ifeq ($(BUILD_ID),)
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(OS)-arm64
shasum -a 256 release/$(BINARY_NAME)-$(OS)-arm64 > release/$(BINARY_NAME)-$(OS)-arm64.sha256
else
cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64
shasum -a 256 release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64 > release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64.sha256
endif

osx-signed: build
@@ -443,7 +470,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
.PHONY: protogen-go
protogen-go:
mkdir -p pkg/grpc/proto
protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
backend/backend.proto

.PHONY: protogen-go-clean
@@ -452,10 +479,10 @@ protogen-go-clean:
$(RM) bin/*

.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen petals-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen

.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean petals-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean

.PHONY: autogptq-protogen
autogptq-protogen:
@@ -569,6 +596,14 @@ vall-e-x-protogen:
vall-e-x-protogen-clean:
$(MAKE) -C backend/python/vall-e-x protogen-clean

.PHONY: openvoice-protogen
openvoice-protogen:
$(MAKE) -C backend/python/openvoice protogen

.PHONY: openvoice-protogen-clean
openvoice-protogen-clean:
$(MAKE) -C backend/python/openvoice protogen-clean

.PHONY: vllm-protogen
vllm-protogen:
$(MAKE) -C backend/python/vllm protogen
@@ -592,6 +627,7 @@ prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/transformers-musicgen
$(MAKE) -C backend/python/parler-tts
$(MAKE) -C backend/python/vall-e-x
$(MAKE) -C backend/python/openvoice
$(MAKE) -C backend/python/exllama
$(MAKE) -C backend/python/petals
$(MAKE) -C backend/python/exllama2
@@ -659,6 +695,14 @@ else
LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
endif

# This target is for manually building a variant with auto-detected flags
backend-assets/grpc/llama-cpp: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-cpp
$(MAKE) -C backend/cpp/llama-cpp purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
$(MAKE) VARIANT="llama-cpp" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cpp/grpc-server backend-assets/grpc/llama-cpp

backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-avx2
$(MAKE) -C backend/cpp/llama-avx2 purge
@@ -691,6 +735,38 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda

backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-hipblas
$(MAKE) -C backend/cpp/llama-hipblas purge
$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas

backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
$(MAKE) -C backend/cpp/llama-sycl_f16 purge
$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16

backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
$(MAKE) -C backend/cpp/llama-sycl_f32 purge
$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32

backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-grpc
$(MAKE) -C backend/cpp/llama-grpc purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc

backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server

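The llama-cpp-grpc variant is built with -DLLAMA_RPC=ON and the rpc-server binary it produces is shipped as a utility, so a single build yields both halves of distributed inference: workers run the rpc-server, and the grpc backend consumes them (see the LLAMACPP_GRPC_SERVERS hook further down). A hedged sketch of starting a worker — the address, port, and flag names follow upstream llama.cpp's rpc-server and are assumptions here:

```bash
# on each worker node (illustrative address/port)
./backend-assets/util/llama-cpp-rpc-server --host 0.0.0.0 --port 50052
```
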
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
@@ -764,3 +840,25 @@ docker-image-intel-xpu:
.PHONY: swagger
swagger:
swag init -g core/http/app.go --output swagger

.PHONY: gen-assets
gen-assets:
$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets

## Documentation
docs/layouts/_default:
mkdir -p docs/layouts/_default

docs/static/gallery.html: docs/layouts/_default
$(GOCMD) run ./.github/ci/modelslist.go ./gallery/index.yaml > docs/static/gallery.html

docs/public: docs/layouts/_default docs/static/gallery.html
cd docs && hugo --minify

docs-clean:
rm -rf docs/public
rm -rf docs/static/gallery.html

.PHONY: docs
docs: docs/static/gallery.html
cd docs && hugo serve
README.md (71 changed lines)
@@ -46,18 +46,32 @@

**LocalAI** is the free, Open Source OpenAI alternative. LocalAI acts as a drop-in replacement REST API that's compatible with the OpenAI (and Elevenlabs, Anthropic, ...) API specifications for local AI inferencing. It allows you to run LLMs, generate images and audio (and more) locally or on-prem with consumer-grade hardware, supporting multiple model families. It does not require a GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).


|
||||
|
||||
```bash
|
||||
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
|
||||
# Alternative images:
|
||||
# - if you have an Nvidia GPU:
|
||||
# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
|
||||
# - without preconfigured models
|
||||
# docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
|
||||
# - without preconfigured models for Nvidia GPUs
|
||||
# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
|
||||
```
|
||||
|
||||
[💻 Getting started](https://localai.io/basics/getting_started/index.html)
|
||||
|
||||
## 🔥🔥 Hot topics / Roadmap
|
||||
|
||||
[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
|
||||
|
||||
- 🆕 You can now browse the model gallery without LocalAI! Check out https://models.localai.io
- 🔥🔥 Decentralized llama.cpp: https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs https://localai.io/features/distribute/
- 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
- 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
- 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
- Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- Reranker API: https://github.com/mudler/LocalAI/pull/2121
- Gallery WebUI: https://github.com/mudler/LocalAI/pull/2104
- llama3: https://github.com/mudler/LocalAI/discussions/2076
- Parler-TTS: https://github.com/mudler/LocalAI/pull/2027
- Openvino support: https://github.com/mudler/LocalAI/pull/1892
- Vector store: https://github.com/mudler/LocalAI/pull/1795
- All-in-one container image: https://github.com/mudler/LocalAI/issues/1855

Hot topics (looking for contributors):

@@ -70,30 +84,19 @@ Hot topics (looking for contributors):

If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

## 💻 [Getting started](https://localai.io/basics/getting_started/index.html)

For a detailed step-by-step introduction, refer to the [Getting Started](https://localai.io/basics/getting_started/index.html) guide.

For those in a hurry, here's a straightforward one-liner to launch a LocalAI AIO (All-in-one) image using `docker`:

```bash
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
# or, if you have an Nvidia GPU:
# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
```

## 🚀 [Features](https://localai.io/features/)

- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🔥 [OpenAI functions](https://localai.io/features/openai-functions/) 🆕
- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/)
- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
- 🖼️ [Download Models directly from Huggingface](https://localai.io/models/)
- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 🆕 [Reranker API](https://localai.io/features/reranker/)
- 📈 [Reranker API](https://localai.io/features/reranker/)
- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)

## 💻 Usage

@@ -107,6 +110,7 @@ Build and deploy custom containers:
WebUIs:
- https://github.com/Jirubizu/localai-admin
- https://github.com/go-skynet/LocalAI-frontend
- QA-Pilot (an interactive chat project that leverages LocalAI LLMs for rapid understanding and navigation of GitHub code repositories): https://github.com/reid41/QA-Pilot

Model galleries
- https://github.com/go-skynet/model-gallery
@@ -116,16 +120,17 @@ Other:
- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
- Terminal utility https://github.com/djcopley/ShellOracle
- Local Smart assistant https://github.com/mudler/LocalAGI
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
- Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
- Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
- Shell-Pilot (interact with LLMs using LocalAI models via pure shell scripts on your Linux or macOS system): https://github.com/reid41/shell-pilot
- Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
- Examples: https://github.com/mudler/LocalAI/tree/master/examples/


### 🔗 Resources

- 🆕 New! [LLM finetuning guide](https://localai.io/docs/advanced/fine-tuning/)
- [LLM finetuning guide](https://localai.io/docs/advanced/fine-tuning/)
- [How to build locally](https://localai.io/basics/build/index.html)
- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
- [Projects integrating LocalAI](https://localai.io/docs/integrations/)
@@ -133,6 +138,7 @@ Other:

## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

- 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/)
- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
- [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
- [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
@@ -160,17 +166,16 @@ If you utilize this repository, data in a downstream project, please consider ci

Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.

A huge thank you to our generous sponsors who support this project:
A huge thank you to our generous sponsors who support this project by covering CI expenses, and to everyone on our [Sponsor list](https://github.com/sponsors/mudler):

|  |
|:-----------------------------------------------:|
| [Spectro Cloud](https://www.spectrocloud.com/) |
| Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on Lambda Labs! |

And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.

- [Sponsor list](https://github.com/sponsors/mudler)
- JDAM00 (donating HW for the CI)
<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
</a>
<a href="https://www.premai.io/" target="blank">
<img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
</a>
</p>

## 🌟 Star history

@@ -180,7 +185,7 @@ And a huge shout-out to individuals sponsoring the project by donating hardware

LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).

MIT - Author Ettore Di Giacinto
MIT - Author Ettore Di Giacinto <mudler@localai.io>

## 🙇 Acknowledgements

@@ -2,8 +2,63 @@ name: gpt-4
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
context_size: 8192

stopwords:
- "<|im_end|>"
- "<dummy32000>"
- "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"

function:
# disable injecting the "answer" tool
disable_no_action: true

grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Prefix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true

return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""

template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
@@ -22,38 +77,25 @@ template:
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
completion: |
{{.Input}}
function: |-
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

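The replace_function_results rules above are ordered regex rewrites applied to the raw tool-call payload before JSON parsing; the _DQUOTE_/__TEMP_QUOTE__ round-trip turns the model's single-quoted pseudo-JSON into valid JSON. A minimal sketch of a subset of those rules using sed (GNU sed assumed; the real logic lives in LocalAI itself, this is only an illustration):

```bash
raw="<scratchpad>plan</scratchpad>{'arguments': {'city': 'Rome'}, 'name': 'get_weather'}"
# 1. drop everything before the first { or [ (this also removes the scratchpad)
# 2. convert single quotes to double quotes via the _DQUOTE_ round-trip
echo "$raw" | sed -E "s/^[^{[]*//; s/'([^']*)'/_DQUOTE_\1_DQUOTE_/g; s/_DQUOTE_/\"/g"
# -> {"arguments": {"city": "Rome"}, "name": "get_weather"}
```
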
@@ -1,10 +1,66 @@
name: gpt-4
mmap: false
context_size: 8192

f16: false
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf

stopwords:
- "<|im_end|>"
- "<dummy32000>"
- "</tool_call>"
- "<|eot_id|>"
- "<|end_of_text|>"

function:
# disable injecting the "answer" tool
disable_no_action: true

grammar:
# This allows the grammar to also return messages
mixed_mode: true
# Prefix to add to the grammar
#prefix: '<tool_call>\n'
# Force parallel calls in the grammar
# parallel_calls: true

return_name_in_function_response: true
# Without grammar uncomment the lines below
# Warning: this is relying only on the capability of the
# LLM model to generate the correct function call.
json_regex_match:
- "(?s)<tool_call>(.*?)</tool_call>"
- "(?s)<tool_call>(.*?)"
replace_llm_results:
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""
replace_function_results:
# Replace everything that is not JSON array or object
#
- key: '(?s)^[^{\[]*'
value: ""
- key: '(?s)[^}\]]*$'
value: ""
- key: "'([^']*?)'"
value: "_DQUOTE_${1}_DQUOTE_"
- key: '\\"'
value: "__TEMP_QUOTE__"
- key: "\'"
value: "'"
- key: "_DQUOTE_"
value: '"'
- key: "__TEMP_QUOTE__"
value: '"'
# Drop the scratchpad content from responses
- key: "(?s)<scratchpad>.*</scratchpad>"
value: ""

template:
chat: |
{{.Input -}}
<|im_start|>assistant
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
@@ -23,37 +79,25 @@ template:
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
completion: |
{{.Input}}
function: |-
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
You are a function calling AI model.
Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
You should call the tools provided to you sequentially
Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
<scratchpad>
{step-by-step reasoning and plan in bullet points}
</scratchpad>
For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
{"arguments": <args-dict>, "name": <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
stopwords:
- <|im_end|>
- "\n</tool_call>"
- <dummy32000>
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "gpt-4",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

@@ -266,6 +266,7 @@ message TTSRequest {
string model = 2;
string dst = 3;
string voice = 4;
optional string language = 5;
}

message TokenizationResponse {

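The new optional language field reaches the TTS backends through this message (the coqui change below uses it as its language fallback). A hedged sketch of exercising it over LocalAI's /tts endpoint — the model name is a placeholder, and the JSON field names are assumed to mirror the proto:

```bash
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
  "model": "my-tts-model",
  "input": "Buongiorno!",
  "voice": "female-1",
  "language": "it"
}'
```
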
@@ -791,7 +791,7 @@ struct llama_server_context
sampler_names.emplace_back(sampler_name);
}
}
slot->sparams.samplers_sequence = sampler_types_from_names(sampler_names, false);
slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
}
else
{
@@ -1146,7 +1146,7 @@ struct llama_server_context
std::vector<std::string> samplers_sequence;
for (const auto &sampler_type : slot.sparams.samplers_sequence)
{
samplers_sequence.emplace_back(sampler_type_to_name_string(sampler_type));
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
}

return json {
@@ -2217,6 +2217,12 @@ static void params_parse(const backend::ModelOptions* request,
} else {
params.n_parallel = 1;
}

const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
if (llama_grpc_servers != NULL) {
params.rpc_servers = std::string(llama_grpc_servers);
}

// TODO: Add yarn

if (!request->tensorsplit().empty()) {

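With this hunk the llama.cpp backend picks up worker addresses from the environment: if LLAMACPP_GRPC_SERVERS is set, its value lands in params.rpc_servers as a comma-separated list. A usage sketch with placeholder addresses, pairing with the rpc-server workers shown earlier:

```bash
# point the llama-cpp-grpc backend at two remote workers (addresses illustrative)
LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052" ./local-ai
```
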
backend/python/autogptq/requirements-hipblas.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
@@ -1,4 +1,5 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
optimum[openvino]
optimum[openvino]
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,6 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.63.0
grpcio==1.64.0
protobuf
torch
certifi

backend/python/bark/requirements-hipblas.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
torchaudio
@@ -2,4 +2,5 @@
intel-extension-for-pytorch
torch
torchaudio
optimum[openvino]
optimum[openvino]
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,6 +1,6 @@
accelerate
bark==0.1.5
grpcio==1.63.0
grpcio==1.64.0
protobuf
certifi
transformers
backend/python/common/template/requirements-hipblas.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
@@ -1,2 +1,2 @@
grpcio==1.63.0
grpcio==1.64.0
protobuf
@@ -66,7 +66,21 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

    def TTS(self, request, context):
        try:
            self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=COQUI_LANGUAGE, file_path=request.dst)
            # if the model is multilingual, use the language from the request, with the env var as fallback
            lang = request.language or COQUI_LANGUAGE
            if lang == "":
                lang = None
            if self.tts.is_multi_lingual and lang is None:
                return backend_pb2.Result(success=False, message="Model is multi-lingual, but no language was provided")

            # if the model is multi-speaker, use speaker_wav or the speaker_id from request.voice
            if self.tts.is_multi_speaker and self.AudioPath is None and request.voice is None:
                return backend_pb2.Result(success=False, message="Model is multi-speaker, but no speaker was provided")

            if self.tts.is_multi_speaker and request.voice is not None:
                self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst)
            else:
                self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)

backend/python/coqui/requirements-hipblas.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
torchaudio
@@ -2,4 +2,5 @@
intel-extension-for-pytorch
torch
torchaudio
optimum[openvino]
optimum[openvino]
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,6 +1,6 @@
accelerate
TTS==0.22.0
grpcio==1.63.0
grpcio==1.64.0
protobuf
certifi
transformers
@@ -17,7 +17,7 @@ import backend_pb2_grpc

import grpc

from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
from diffusers import StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
from diffusers.pipelines.stable_diffusion import safety_checker
from diffusers.utils import load_image, export_to_video
@@ -225,6 +225,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                    torch_dtype=torchType,
                    use_safetensors=True,
                    variant=variant)
            elif request.PipelineType == "StableDiffusion3Pipeline":
                if fromSingleFile:
                    self.pipe = StableDiffusion3Pipeline.from_single_file(modelFile,
                                                                          torch_dtype=torchType,
                                                                          use_safetensors=True)
                else:
                    self.pipe = StableDiffusion3Pipeline.from_pretrained(
                        request.Model,
                        torch_dtype=torchType,
                        use_safetensors=True,
                        variant=variant)

            if CLIPSKIP and request.CLIPSkip != 0:
                self.clip_skip = request.CLIPSkip

backend/python/diffusers/requirements-hipblas.txt (new file, 3 lines)
@@ -0,0 +1,3 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
torchvision
@@ -2,4 +2,5 @@
intel-extension-for-pytorch
torch
torchvision
optimum[openvino]
optimum[openvino]
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,10 +1,11 @@
accelerate
compel
diffusers
grpcio==1.63.0
grpcio==1.64.0
opencv-python
pillow
protobuf
sentencepiece
torch
transformers
certifi
certifi

@@ -1,4 +1,4 @@
grpcio==1.63.0
grpcio==1.64.0
protobuf
torch
transformers

@@ -1,5 +1,5 @@
accelerate
grpcio==1.63.0
grpcio==1.64.0
protobuf
certifi
torch

@@ -1,6 +1,6 @@
causal-conv1d==1.2.0.post2
mamba-ssm==1.2.0.post1
grpcio==1.63.0
grpcio==1.64.0
protobuf
certifi
transformers
backend/python/openvoice/Makefile (new file, 25 lines)
@@ -0,0 +1,25 @@
.DEFAULT_GOAL := install

.PHONY: install
install: protogen
	bash install.sh

.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py

.PHONY: protogen-clean
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

backend_pb2_grpc.py backend_pb2.py:
	python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto

.PHONY: clean
clean: protogen-clean
	rm -rf venv __pycache__

.PHONY: test
test: protogen
	@echo "Testing openvoice..."
	bash test.sh
	@echo "openvoice tested."
backend/python/openvoice/backend.py (new executable file, 158 lines)
@@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""
Extra gRPC server for OpenVoice models.
"""
from concurrent import futures

import argparse
import signal
import sys
import os
import torch
from openvoice import se_extractor
from openvoice.api import ToneColorConverter
from melo.api import TTS

import time
import backend_pb2
import backend_pb2_grpc

import grpc


_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))

# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """
    A gRPC servicer for the backend service.

    This class implements the gRPC methods for the backend service, including Health, LoadModel, and TTS.
    """
    def Health(self, request, context):
        """
        A gRPC method that returns the health status of the backend service.

        Args:
            request: A HealthRequest object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.

        Returns:
            A Reply object that contains the health status of the backend service.
        """
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """
        A gRPC method that loads a model into memory.

        Args:
            request: A LoadModelRequest object that contains the request parameters.
            context: A grpc.ServicerContext object that provides information about the RPC.

        Returns:
            A Result object that contains the result of the LoadModel operation.
        """
        model_name = request.Model
        try:
            self.clonedVoice = False
            # Assume directory from request.ModelFile,
            # but only if request.AudioPath is not an absolute path
            if request.AudioPath and request.ModelFile != "" and not os.path.isabs(request.AudioPath):
                # get base path of modelFile
                modelFileBase = os.path.dirname(request.ModelFile)
                request.AudioPath = os.path.join(modelFileBase, request.AudioPath)
            if request.AudioPath != "":
                self.clonedVoice = True

            self.modelpath = request.ModelFile
            self.speaker = request.Type
            self.ClonedVoicePath = request.AudioPath

            ckpt_converter = request.Model+'/converter'
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
            self.device = device
            self.tone_color_converter = None
            if self.clonedVoice:
                self.tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
                self.tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def TTS(self, request, context):
        model_name = request.model
        if model_name == "":
            return backend_pb2.Result(success=False, message="request.model is required")
        try:
            # Speed is adjustable
            speed = 1.0
            voice = "EN"
            if request.voice:
                voice = request.voice
            model = TTS(language=voice, device=self.device)
            speaker_ids = model.hps.data.spk2id
            speaker_key = self.speaker
            modelpath = self.modelpath
            for s in speaker_ids.keys():
                print(f"Speaker: {s} - ID: {speaker_ids[s]}")
            speaker_id = speaker_ids[speaker_key]
            speaker_key = speaker_key.lower().replace('_', '-')
            source_se = torch.load(f'{modelpath}/base_speakers/ses/{speaker_key}.pth', map_location=self.device)
            model.tts_to_file(request.text, speaker_id, request.dst, speed=speed)
            if self.clonedVoice:
                reference_speaker = self.ClonedVoicePath
                target_se, audio_name = se_extractor.get_se(reference_speaker, self.tone_color_converter, vad=False)
                # Run the tone color converter
                encode_message = "@MyShell"
                self.tone_color_converter.convert(
                    audio_src_path=request.dst,
                    src_se=source_se,
                    tgt_se=target_se,
                    output_path=request.dst,
                    message=encode_message)

            print("[OpenVoice] TTS generated!", file=sys.stderr)
            print("[OpenVoice] TTS saved to", request.dst, file=sys.stderr)
            print(request, file=sys.stderr)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)

def serve(address):
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("[OpenVoice] Server started. Listening on: " + address, file=sys.stderr)

    # Define the signal handler function
    def signal_handler(sig, frame):
        print("[OpenVoice] Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    # Set the signal handlers for SIGINT and SIGTERM
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run the gRPC server.")
    parser.add_argument(
        "--addr", default="localhost:50051", help="The address to bind the server to."
    )
    args = parser.parse_args()
    print(f"[OpenVoice] startup: {args}", file=sys.stderr)
    serve(args.addr)
backend/python/openvoice/install.sh (new executable file, 16 lines)
@@ -0,0 +1,16 @@
#!/bin/bash
set -e

source $(dirname $0)/../common/libbackend.sh

# This is here because the Intel pip index is broken and returns 200 status codes for every package name; it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the default PyPI index to find optimum[openvino] there.
# The --upgrade flag actually allows us to *downgrade* torch to the version provided in the Intel pip index.
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi

installRequirements

python -m unidic download
backend/python/openvoice/requirements-hipblas.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
backend/python/openvoice/requirements-intel.txt (new file, 23 lines)
@@ -0,0 +1,23 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch
torch
optimum[openvino]
grpcio==1.64.0
protobuf
librosa==0.9.1
faster-whisper==0.9.0
pydub==0.25.1
wavmark==0.0.3
numpy==1.22.0
eng_to_ipa==0.0.2
inflect==7.0.0
unidecode==1.3.7
whisper-timestamped==1.14.2
openai
python-dotenv
pypinyin==0.50.0
cn2an==0.5.22
jieba==0.42.1
gradio==3.48.0
langid==1.1.6
git+https://github.com/myshell-ai/MeloTTS.git
backend/python/openvoice/requirements.txt (new file, 20 lines)
@@ -0,0 +1,20 @@
grpcio==1.64.0
protobuf
librosa==0.9.1
faster-whisper==0.9.0
pydub==0.25.1
wavmark==0.0.3
numpy==1.22.0
eng_to_ipa==0.0.2
inflect==7.0.0
unidecode==1.3.7
whisper-timestamped==1.14.2
openai
python-dotenv
pypinyin==0.50.0
cn2an==0.5.22
jieba==0.42.1
gradio==3.48.0
langid==1.1.6
git+https://github.com/myshell-ai/MeloTTS.git
git+https://github.com/myshell-ai/OpenVoice.git
backend/python/openvoice/run.sh (new executable file, 4 lines)
@@ -0,0 +1,4 @@
|
||||
#!/bin/bash
|
||||
source $(dirname $0)/../common/libbackend.sh
|
||||
|
||||
startBackend $@
|
||||
82	backend/python/openvoice/test.py	(new Normal file)
@@ -0,0 +1,82 @@
"""
A test script to test the gRPC service
"""
import unittest
import subprocess
import time
import backend_pb2
import backend_pb2_grpc

import grpc


class TestBackendServicer(unittest.TestCase):
    """
    TestBackendServicer is the class that tests the gRPC service
    """
    def setUp(self):
        """
        This method sets up the gRPC service by starting the server
        """
        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
        time.sleep(10)

    def tearDown(self) -> None:
        """
        This method tears down the gRPC service by terminating the server
        """
        self.service.terminate()
        self.service.wait()

    def test_server_startup(self):
        """
        This method tests if the server starts up successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.Health(backend_pb2.HealthMessage())
                self.assertEqual(response.message, b'OK')
        except Exception as err:
            print(err)
            self.fail("Server failed to start")
        finally:
            self.tearDown()

    def test_load_model(self):
        """
        This method tests if the model is loaded successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="checkpoints_v2",
                                                                   Type="en-us"))
                self.assertTrue(response.success)
                self.assertEqual(response.message, "Model loaded successfully")
        except Exception as err:
            print(err)
            self.fail("LoadModel service failed")
        finally:
            self.tearDown()

    def test_tts(self):
        """
        This method tests if TTS audio is generated successfully
        """
        try:
            self.setUp()
            with grpc.insecure_channel("localhost:50051") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                response = stub.LoadModel(backend_pb2.ModelOptions(Model="dingzhen"))
                self.assertTrue(response.success)
                tts_request = backend_pb2.TTSRequest(text="80s TV news production music hit for tonight's biggest story", voice="EN")
                tts_response = stub.TTS(tts_request)
                self.assertIsNotNone(tts_response)
        except Exception as err:
            print(err)
            self.fail("TTS service failed")
        finally:
            self.tearDown()
12	backend/python/openvoice/test.sh	(new Executable file)
@@ -0,0 +1,12 @@
#!/bin/bash
set -e

source $(dirname $0)/../common/libbackend.sh

# Download checkpoints if not present
if [ ! -d "checkpoints_v2" ]; then
    wget https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_v2_0417.zip -O checkpoints_v2.zip
    unzip checkpoints_v2.zip
fi

runUnittests
3	backend/python/parler-tts/requirements-hipblas.txt	(new Normal file)
@@ -0,0 +1,3 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
torchaudio
@@ -2,4 +2,5 @@
 intel-extension-for-pytorch
 torch
 torchaudio
-optimum[openvino]
+optimum[openvino]
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,5 +1,5 @@
 accelerate
-grpcio==1.63.0
+grpcio==1.64.0
 protobuf
 torch
 git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
2	backend/python/petals/requirements-hipblas.txt	(new Normal file)
@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
@@ -1,4 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch
 torch
-optimum[openvino]
+optimum[openvino]
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
2	backend/python/rerankers/requirements-hipblas.txt	(new Normal file)
@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
@@ -1,4 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch
 torch
-optimum[openvino]
+optimum[openvino]
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,6 +1,6 @@
 accelerate
 rerankers[transformers]
-grpcio==1.63.0
+grpcio==1.64.0
 protobuf
 certifi
 transformers
@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
@@ -1,4 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch
 torch
-optimum[openvino]
+optimum[openvino]
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,6 +1,6 @@
 accelerate
 sentence-transformers==2.5.1
 transformers
-grpcio==1.63.0
+grpcio==1.64.0
 protobuf
 certifi
@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
@@ -1,4 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch
 torch
-optimum[openvino]
+optimum[openvino]
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,6 +1,6 @@
 accelerate
 transformers
-grpcio==1.63.0
+grpcio==1.64.0
 protobuf
 torch
 scipy==1.13.0
68	backend/python/transformers/backend.py	(Executable file → Normal file)
@@ -21,10 +21,7 @@ import torch.cuda


 XPU=os.environ.get("XPU", "0") == "1"
-if XPU:
-    from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer
-else:
-    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig, TextIteratorStreamer
+from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria


 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -77,11 +74,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         """
         model_name = request.Model

-        compute = "auto"
+        compute = torch.float16
         if request.F16Memory == True:
             compute=torch.bfloat16

-        self.CUDA = request.CUDA
+        self.CUDA = torch.cuda.is_available()
         self.OV=False

         device_map="cpu"
@@ -89,6 +86,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
         quantization = None

         if self.CUDA:
+            from transformers import BitsAndBytesConfig, AutoModelForCausalLM
             if request.MainGPU:
                 device_map=request.MainGPU
             else:
@@ -107,7 +105,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                     bnb_4bit_compute_dtype = None,
                     load_in_8bit=True,
                 )

         try:
             if request.Type == "AutoModelForCausalLM":
                 if XPU:
@@ -189,6 +187,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                                                        device=device_map)
             self.OV = True
         else:
+            print("Automodel", file=sys.stderr)
             self.model = AutoModel.from_pretrained(model_name,
                                                    trust_remote_code=request.TrustRemoteCode,
                                                    use_safetensors=True,
@@ -246,28 +245,28 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

         # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
         sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
         # print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
         # print("Embeddings:", sentence_embeddings, file=sys.stderr)
         return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])

     async def _predict(self, request, context, streaming=False):
         set_seed(request.Seed)
-        if request.TopP == 0:
-            request.TopP = 0.9
+        if request.TopP < 0 or request.TopP > 1:
+            request.TopP = 1

-        if request.TopK == 0:
-            request.TopK = 40
+        if request.TopK <= 0:
+            request.TopK = 50

+        if request.Temperature > 0 :
+            sample=True
+        else:
+            sample=False
+            request.TopP == None
+            request.TopK == None
+            request.Temperature == None

         prompt = request.Prompt
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
             prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)

-        eos_token_id = self.tokenizer.eos_token_id
-        if request.StopPrompts:
-            eos_token_id = []
-            for word in request.StopPrompts:
-                eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
-
         inputs = self.tokenizer(prompt, return_tensors="pt")

         if request.Tokens > 0:
@@ -281,6 +280,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
             inputs = inputs.to("xpu")
             streaming = False

+        criteria=[]
+        if request.StopPrompts:
+            criteria = StoppingCriteriaList(
+                [
+                    StopStringCriteria(tokenizer=self.tokenizer, stop_strings=request.StopPrompts),
+                ]
+            )
+
         if streaming:
             streamer=TextIteratorStreamer(self.tokenizer,
                                           skip_prompt=True,
@@ -290,11 +297,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                            temperature=request.Temperature,
                            top_p=request.TopP,
                            top_k=request.TopK,
-                           do_sample=True,
+                           do_sample=sample,
                            attention_mask=inputs["attention_mask"],
-                           eos_token_id=eos_token_id,
+                           eos_token_id=self.tokenizer.eos_token_id,
                            pad_token_id=self.tokenizer.eos_token_id,
-                           streamer=streamer)
+                           streamer=streamer,
+                           stopping_criteria=criteria,
+                           use_cache=True,
+                           )
             thread=Thread(target=self.model.generate, kwargs=config)
             thread.start()
             generated_text = ""
@@ -311,18 +321,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                                               temperature=request.Temperature,
                                               top_p=request.TopP,
                                               top_k=request.TopK,
-                                              do_sample=True,
+                                              do_sample=sample,
                                               pad_token=self.tokenizer.eos_token_id)
             else:
-                outputs = self.model.generate(inputs["input_ids"],
+                outputs = self.model.generate(**inputs,
                                               max_new_tokens=max_tokens,
                                               temperature=request.Temperature,
                                               top_p=request.TopP,
                                               top_k=request.TopK,
-                                              do_sample=True,
-                                              attention_mask=inputs["attention_mask"],
-                                              eos_token_id=eos_token_id,
-                                              pad_token_id=self.tokenizer.eos_token_id)
+                                              do_sample=sample,
+                                              eos_token_id=self.tokenizer.eos_token_id,
+                                              pad_token_id=self.tokenizer.eos_token_id,
+                                              stopping_criteria=criteria,
+                                              use_cache=True,
+                                              )
             generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]

         if streaming:
2	backend/python/transformers/requirements-hipblas.txt	(new Normal file)
@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,6 +1,9 @@
 accelerate
 transformers
-grpcio==1.63.0
+grpcio==1.64.0
 protobuf
 torch
-certifi
+certifi
+intel-extension-for-transformers
+bitsandbytes
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,4 +1,10 @@
 #!/bin/bash
 source $(dirname $0)/../common/libbackend.sh

+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+fi
+
 startBackend $@
3	backend/python/vall-e-x/requirements-hipblas.txt	(new Normal file)
@@ -0,0 +1,3 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
torchaudio
@@ -2,4 +2,5 @@
 intel-extension-for-pytorch
 torch
 torchaudio
-optimum[openvino]
+optimum[openvino]
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,4 +1,4 @@
 accelerate
-grpcio==1.63.0
+grpcio==1.64.0
 protobuf
 certifi
2	backend/python/vllm/requirements-hipblas.txt	(new Normal file)
@@ -0,0 +1,2 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch
@@ -1,4 +1,5 @@
 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
 intel-extension-for-pytorch
 torch
-optimum[openvino]
+optimum[openvino]
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
@@ -1,6 +1,6 @@
 accelerate
 vllm
-grpcio==1.63.0
+grpcio==1.64.0
 protobuf
 certifi
 transformers
@@ -57,7 +57,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	if _, err := os.Stat(modelFile); os.IsNotExist(err) {
 		utils.ResetDownloadTimers()
 		// if we failed to load the model, we try to download it
-		err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
+		err := gallery.InstallModelFromGallery(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
 		if err != nil {
 			return nil, err
 		}
@@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string {
 	}
 }

-func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
+func ModelTTS(
+	backend,
+	text,
+	modelFile,
+	voice,
+	language string,
+	loader *model.ModelLoader,
+	appConfig *config.ApplicationConfig,
+	backendConfig config.BackendConfig,
+) (string, *proto.Result, error) {
 	bb := backend
 	if bb == "" {
 		bb = model.PiperBackend

@@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
 		Model:    modelPath,
 		Voice:    voice,
 		Dst:      filePath,
+		Language: &language,
 	})

+	// return RPC error if any
+	if !res.Success {
+		return "", nil, fmt.Errorf(res.Message)
+	}
+
 	return filePath, res, err
 }
@@ -1,20 +1,17 @@
 package cli

-import "embed"
-
-type Context struct {
-	Debug    bool    `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
-	LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`
-
-	// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
-	BackendAssets embed.FS `kong:"-"`
-}
+import (
+	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
+	"github.com/go-skynet/LocalAI/core/cli/worker"
+)

 var CLI struct {
-	Context `embed:""`
+	cliContext.Context `embed:""`

 	Run        RunCMD        `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
 	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
 	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
 	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
+	Worker     worker.Worker `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
+	Util       UtilCMD       `cmd:"" help:"Utility commands"`
 }
11	core/cli/context/context.go	(new Normal file)
@@ -0,0 +1,11 @@
package cliContext

import "embed"

type Context struct {
	Debug    bool    `env:"LOCALAI_DEBUG,DEBUG" default:"false" hidden:"" help:"DEPRECATED, use --log-level=debug instead. Enable debug logging"`
	LogLevel *string `env:"LOCALAI_LOG_LEVEL" enum:"error,warn,info,debug,trace" help:"Set the level of logs to output [${enum}]"`

	// This field is not a command line argument/flag, the struct tag excludes it from the parsed CLI
	BackendAssets embed.FS `kong:"-"`
}
@@ -4,13 +4,16 @@ import (
 	"encoding/json"
 	"fmt"

+	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
+
 	"github.com/go-skynet/LocalAI/pkg/gallery"
+	"github.com/go-skynet/LocalAI/pkg/startup"
 	"github.com/rs/zerolog/log"
 	"github.com/schollz/progressbar/v3"
 )

 type ModelsCMDFlags struct {
-	Galleries  string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models"`
+	Galleries  string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
 	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 }

@@ -29,7 +32,7 @@ type ModelsCMD struct {
 	Install ModelsInstall `cmd:"" help:"Install a model from the gallery"`
 }

-func (ml *ModelsList) Run(ctx *Context) error {
+func (ml *ModelsList) Run(ctx *cliContext.Context) error {
 	var galleries []gallery.Gallery
 	if err := json.Unmarshal([]byte(ml.Galleries), &galleries); err != nil {
 		log.Error().Err(err).Msg("unable to load galleries")

@@ -49,30 +52,44 @@ func (ml *ModelsList) Run(ctx *Context) error {
 	return nil
 }

-func (mi *ModelsInstall) Run(ctx *Context) error {
-	modelName := mi.ModelArgs[0]
-
+func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 	var galleries []gallery.Gallery
 	if err := json.Unmarshal([]byte(mi.Galleries), &galleries); err != nil {
 		log.Error().Err(err).Msg("unable to load galleries")
 	}

-	progressBar := progressbar.NewOptions(
-		1000,
-		progressbar.OptionSetDescription(fmt.Sprintf("downloading model %s", modelName)),
-		progressbar.OptionShowBytes(false),
-		progressbar.OptionClearOnFinish(),
-	)
-	progressCallback := func(fileName string, current string, total string, percentage float64) {
-		v := int(percentage * 10)
-		err := progressBar.Set(v)
-		if err != nil {
-			log.Error().Err(err).Str("filename", fileName).Int("value", v).Msg("error while updating progress bar")
+	for _, modelName := range mi.ModelArgs {
+
+		progressBar := progressbar.NewOptions(
+			1000,
+			progressbar.OptionSetDescription(fmt.Sprintf("downloading model %s", modelName)),
+			progressbar.OptionShowBytes(false),
+			progressbar.OptionClearOnFinish(),
+		)
+		progressCallback := func(fileName string, current string, total string, percentage float64) {
+			v := int(percentage * 10)
+			err := progressBar.Set(v)
+			if err != nil {
+				log.Error().Err(err).Str("filename", fileName).Int("value", v).Msg("error while updating progress bar")
+			}
 		}
-	}
-	err := gallery.InstallModelFromGallery(galleries, modelName, mi.ModelsPath, gallery.GalleryModel{}, progressCallback)
-	if err != nil {
-		return err
-	}
+		//startup.InstallModels()
+		models, err := gallery.AvailableGalleryModels(galleries, mi.ModelsPath)
+		if err != nil {
+			return err
+		}
+
+		model := gallery.FindModel(models, modelName, mi.ModelsPath)
+		if model == nil {
+			log.Error().Str("model", modelName).Msg("model not found")
+			return err
+		}
+
+		log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
+		err = startup.InstallModels(galleries, "", mi.ModelsPath, progressCallback, modelName)
+		if err != nil {
+			return err
+		}
+	}
 	return nil
 }
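Since ModelArgs is now iterated in a loop, the install subcommand accepts several models in one invocation; a usage sketch (the model names below are hypothetical):

    # Install more than one gallery model at once
    local-ai models install phi-2 bert-embeddings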
@@ -1,12 +1,15 @@
 package cli

 import (
+	"context"
 	"fmt"
 	"strings"
 	"time"

+	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
 	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/core/http"
+	"github.com/go-skynet/LocalAI/core/p2p"
 	"github.com/go-skynet/LocalAI/core/startup"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"

@@ -34,16 +37,20 @@ type RunCMD struct {
 	PreloadModelsConfig string `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`

 	F16     bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
-	Threads int  `env:"LOCALAI_THREADS,THREADS" short:"t" default:"4" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
+	Threads int  `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
 	ContextSize int `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`

-	Address          string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
-	CORS             bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
-	CORSAllowOrigins string   `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
-	UploadLimit      int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
-	APIKeys          []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
-	DisableWebUI     bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
+	Address          string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
+	CORS             bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
+	CORSAllowOrigins string   `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
+	LibraryPath      string   `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
+	CSRF             bool     `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
+	UploadLimit      int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
+	APIKeys          []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
+	DisableWebUI     bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
+	OpaqueErrors     bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"api"`
+	Peer2Peer        bool     `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
+	Peer2PeerToken   string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
 	ParallelRequests    bool `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
 	SingleActiveBackend bool `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
 	PreloadBackendOnly  bool `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`

@@ -54,7 +61,7 @@ type RunCMD struct {
 	WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
 }

-func (r *RunCMD) Run(ctx *Context) error {
+func (r *RunCMD) Run(ctx *cliContext.Context) error {
 	opts := []config.AppOption{
 		config.WithConfigFile(r.ModelsConfigFile),
 		config.WithJSONStringPreload(r.PreloadModels),

@@ -73,12 +80,40 @@ func (r *RunCMD) Run(ctx *Context) error {
 		config.WithModelLibraryURL(r.RemoteLibrary),
 		config.WithCors(r.CORS),
 		config.WithCorsAllowOrigins(r.CORSAllowOrigins),
+		config.WithCsrf(r.CSRF),
+		config.WithLibPath(r.LibraryPath),
 		config.WithThreads(r.Threads),
 		config.WithBackendAssets(ctx.BackendAssets),
 		config.WithBackendAssetsOutput(r.BackendAssetsPath),
 		config.WithUploadLimitMB(r.UploadLimit),
 		config.WithApiKeys(r.APIKeys),
 		config.WithModelsURL(append(r.Models, r.ModelArgs...)...),
+		config.WithOpaqueErrors(r.OpaqueErrors),
 	}

+	if r.Peer2Peer || r.Peer2PeerToken != "" {
+		log.Info().Msg("P2P mode enabled")
+		token := r.Peer2PeerToken
+		if token == "" {
+			// IF no token is provided, and p2p is enabled,
+			// we generate one and wait for the user to pick up the token (this is for interactive)
+			log.Info().Msg("No token provided, generating one")
+			token = p2p.GenerateToken()
+			log.Info().Msg("Generated Token:")
+			fmt.Println(token)
+
+			log.Info().Msg("To use the token, you can run the following command in another node or terminal:")
+			fmt.Printf("export TOKEN=\"%s\"\nlocal-ai worker p2p-llama-cpp-rpc\n", token)
+
+			// Ask for user confirmation
+			log.Info().Msg("Press a button to proceed")
+			var input string
+			fmt.Scanln(&input)
+		}
+		log.Info().Msg("Starting P2P server discovery...")
+		if err := p2p.LLamaCPPRPCServerDiscoverer(context.Background(), token); err != nil {
+			return err
+		}
+	}
+
 	idleWatchDog := r.EnableWatchdogIdle
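A minimal sketch of starting the API node with the new P2P support; the flag spellings --p2p and --p2ptoken come from the kong name tags above:

    # Start LocalAI with P2P enabled; without --p2ptoken a fresh token is
    # generated, printed, and the process waits for user confirmation
    local-ai run --p2p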
@@ -6,6 +6,7 @@ import (
 	"fmt"

 	"github.com/go-skynet/LocalAI/core/backend"
+	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
 	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"

@@ -22,14 +23,14 @@ type TranscriptCMD struct {
 	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
 }

-func (t *TranscriptCMD) Run(ctx *Context) error {
+func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 	opts := &config.ApplicationConfig{
 		ModelPath:         t.ModelsPath,
 		Context:           context.Background(),
 		AssetsDestination: t.BackendAssetsPath,
 	}

-	cl := config.NewBackendConfigLoader()
+	cl := config.NewBackendConfigLoader(t.ModelsPath)
 	ml := model.NewModelLoader(opts.ModelPath)
 	if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
 		return err
@@ -8,6 +8,7 @@ import (
 	"strings"

 	"github.com/go-skynet/LocalAI/core/backend"
+	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
 	"github.com/go-skynet/LocalAI/core/config"
 	"github.com/go-skynet/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"

@@ -19,12 +20,13 @@ type TTSCMD struct {
 	Backend           string `short:"b" default:"piper" help:"Backend to run the TTS model"`
 	Model             string `short:"m" required:"" help:"Model name to run the TTS"`
 	Voice             string `short:"v" help:"Voice name to run the TTS"`
+	Language          string `short:"l" help:"Language to use with the TTS"`
 	OutputFile        string `short:"o" type:"path" help:"The path to write the output wav file"`
 	ModelsPath        string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
 }

-func (t *TTSCMD) Run(ctx *Context) error {
+func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 	outputFile := t.OutputFile
 	outputDir := t.BackendAssetsPath
 	if outputFile != "" {

@@ -51,7 +53,7 @@ func (t *TTSCMD) Run(ctx *Context) error {
 	options := config.BackendConfig{}
 	options.SetDefaults()

-	filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
+	filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
 	if err != nil {
 		return err
 	}
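A usage sketch for the new language flag; the model and voice names here are hypothetical, and the text is assumed to be passed as the remaining arguments:

    # Forward a language hint to backends that support it (e.g. the openvoice backend)
    local-ai tts -m my-tts-model -v my-voice --language en -o hello.wav "Hello world"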
55	core/cli/util.go	(new Normal file)
@@ -0,0 +1,55 @@
package cli

import (
	"fmt"

	"github.com/rs/zerolog/log"

	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
	gguf "github.com/thxcode/gguf-parser-go"
)

type UtilCMD struct {
	GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
}

type GGUFInfoCMD struct {
	Args   []string `arg:"" optional:"" name:"args" help:"Arguments to pass to the utility command"`
	Header bool     `optional:"" default:"false" name:"header" help:"Show header information"`
}

func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
	if u.Args == nil || len(u.Args) == 0 {
		return fmt.Errorf("no GGUF file provided")
	}
	// We try to guess only if we don't have a template defined already
	f, err := gguf.ParseGGUFFile(u.Args[0])
	if err != nil {
		// Only valid for gguf files
		log.Error().Msgf("guessDefaultsFromFile: %s", "not a GGUF file")
		return err
	}

	log.Info().
		Any("eosTokenID", f.Tokenizer().EOSTokenID).
		Any("bosTokenID", f.Tokenizer().BOSTokenID).
		Any("modelName", f.Model().Name).
		Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0])

	log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer")
	log.Info().Any("architecture", fmt.Sprintf("%+v", f.Architecture())).Msg("Architecture")

	v, exists := f.Header.MetadataKV.Get("tokenizer.chat_template")
	if exists {
		log.Info().Msgf("chat_template: %s", v.ValueString())
	}

	if u.Header {
		for _, metadata := range f.Header.MetadataKV {
			log.Info().Msgf("%s: %+v", metadata.Key, metadata.Value)
		}
		// log.Info().Any("header", fmt.Sprintf("%+v", f.Header)).Msg("Header")
	}

	return nil
}
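Usage sketch for the new utility subcommand (the file path is illustrative):

    # Print tokenizer and architecture details for a GGUF file;
    # --header additionally dumps every metadata key/value pair
    local-ai util gguf-info --header ./models/model.gguf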
10	core/cli/worker/worker.go	(new Normal file)
@@ -0,0 +1,10 @@
package worker

type WorkerFlags struct {
	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
}

type Worker struct {
	P2P      P2P      `cmd:"" name:"p2p-llama-cpp-rpc" help:"Starts a LocalAI llama.cpp worker in P2P mode (requires a token)"`
	LLamaCPP LLamaCPP `cmd:"" name:"llama-cpp-rpc" help:"Starts a llama.cpp worker in standalone mode"`
}
43	core/cli/worker/worker_llamacpp.go	(new Normal file)
@@ -0,0 +1,43 @@
package worker

import (
	"fmt"
	"os"
	"syscall"

	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
	"github.com/go-skynet/LocalAI/pkg/assets"
	"github.com/rs/zerolog/log"
)

type LLamaCPP struct {
	Args        []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
	WorkerFlags `embed:""`
}

func (r *LLamaCPP) Run(ctx *cliContext.Context) error {
	// Extract files from the embedded FS
	err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
	log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath)
	if err != nil {
		log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
	}

	if len(os.Args) < 4 {
		return fmt.Errorf("usage: local-ai worker llama-cpp-rpc -- <llama-rpc-server-args>")
	}

	return syscall.Exec(
		assets.ResolvePath(
			r.BackendAssetsPath,
			"util",
			"llama-cpp-rpc-server",
		),
		append([]string{
			assets.ResolvePath(
				r.BackendAssetsPath,
				"util",
				"llama-cpp-rpc-server",
			)}, os.Args[4:]...),
		os.Environ())
}
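The standalone worker simply execs the bundled llama-cpp-rpc-server binary, forwarding everything after the separator; a sketch (host and port values are illustrative):

    # Arguments after -- are passed verbatim to the extracted llama-cpp-rpc-server
    local-ai worker llama-cpp-rpc -- --host 0.0.0.0 --port 50052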
16	core/cli/worker/worker_nop2p.go	(new Normal file)
@@ -0,0 +1,16 @@
//go:build !p2p
// +build !p2p

package worker

import (
	"fmt"

	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
)

type P2P struct{}

func (r *P2P) Run(ctx *cliContext.Context) error {
	return fmt.Errorf("p2p mode is not enabled in this build")
}
104	core/cli/worker/worker_p2p.go	(new Normal file)
@@ -0,0 +1,104 @@
//go:build p2p
// +build p2p

package worker

import (
	"context"
	"fmt"
	"os"
	"os/exec"
	"time"

	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
	"github.com/go-skynet/LocalAI/core/p2p"
	"github.com/go-skynet/LocalAI/pkg/assets"
	"github.com/phayes/freeport"
	"github.com/rs/zerolog/log"
)

type P2P struct {
	WorkerFlags       `embed:""`
	Token             string   `env:"LOCALAI_TOKEN,TOKEN" help:"JSON list of galleries"`
	NoRunner          bool     `env:"LOCALAI_NO_RUNNER,NO_RUNNER" help:"Do not start the llama-cpp-rpc-server"`
	RunnerAddress     string   `env:"LOCALAI_RUNNER_ADDRESS,RUNNER_ADDRESS" help:"Address of the llama-cpp-rpc-server"`
	RunnerPort        string   `env:"LOCALAI_RUNNER_PORT,RUNNER_PORT" help:"Port of the llama-cpp-rpc-server"`
	ExtraLLamaCPPArgs []string `env:"LOCALAI_EXTRA_LLAMA_CPP_ARGS,EXTRA_LLAMA_CPP_ARGS" help:"Extra arguments to pass to llama-cpp-rpc-server"`
}

func (r *P2P) Run(ctx *cliContext.Context) error {
	// Extract files from the embedded FS
	err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
	log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath)
	if err != nil {
		log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
	}

	// Check if the token is set
	// as we always need it.
	if r.Token == "" {
		return fmt.Errorf("Token is required")
	}

	port, err := freeport.GetFreePort()
	if err != nil {
		return err
	}

	address := "127.0.0.1"

	if r.NoRunner {
		// Let override which port and address to bind if the user
		// configure the llama-cpp service on its own
		p := fmt.Sprint(port)
		if r.RunnerAddress != "" {
			address = r.RunnerAddress
		}
		if r.RunnerPort != "" {
			p = r.RunnerPort
		}

		err = p2p.BindLLamaCPPWorker(context.Background(), address, p, r.Token)
		if err != nil {
			return err
		}
		log.Info().Msgf("You need to start llama-cpp-rpc-server on '%s:%s'", address, p)

		return nil
	}

	// Start llama.cpp directly from the version we have pre-packaged
	go func() {
		for {
			log.Info().Msgf("Starting llama-cpp-rpc-server on '%s:%d'", address, port)
			cmd := exec.Command(
				assets.ResolvePath(
					r.BackendAssetsPath,
					"util",
					"llama-cpp-rpc-server",
				),
				append([]string{"--host", address, "--port", fmt.Sprint(port)}, r.ExtraLLamaCPPArgs...)...,
			)

			cmd.Env = os.Environ()

			cmd.Stderr = os.Stdout
			cmd.Stdout = os.Stdout

			if err := cmd.Start(); err != nil {
				log.Error().Err(err).Msg("Failed to start llama-cpp-rpc-server")
			}

			cmd.Wait()
		}
	}()

	err = p2p.BindLLamaCPPWorker(context.Background(), address, fmt.Sprint(port), r.Token)
	if err != nil {
		return err
	}

	for {
		time.Sleep(1 * time.Second)
	}
}
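Putting the two P2P pieces together, the worker side would look roughly like this; the token value is whatever the main node printed at startup:

    # On a worker node: reuse the token generated by the main instance, then
    # let LocalAI spawn and register a local llama-cpp-rpc-server over P2P
    export TOKEN="<token printed by the main node>"
    local-ai worker p2p-llama-cpp-rpc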
@@ -7,6 +7,7 @@ import (
 	"time"

 	"github.com/go-skynet/LocalAI/pkg/gallery"
+	"github.com/go-skynet/LocalAI/pkg/xsysinfo"
 	"github.com/rs/zerolog/log"
 )

@@ -14,6 +15,7 @@ type ApplicationConfig struct {
 	Context    context.Context
 	ConfigFile string
 	ModelPath  string
+	LibPath    string
 	UploadLimitMB, Threads, ContextSize int
 	DisableWebUI bool
 	F16          bool

@@ -25,10 +27,12 @@ type ApplicationConfig struct {
 	DynamicConfigsDir             string
 	DynamicConfigsDirPollInterval time.Duration
 	CORS                          bool
+	CSRF                          bool
 	PreloadJSONModels             string
 	PreloadModelsFromPath         string
 	CORSAllowOrigins              string
 	ApiKeys                       []string
+	OpaqueErrors                  bool

 	ModelLibraryURL string

@@ -59,7 +63,6 @@ func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
 	opt := &ApplicationConfig{
 		Context:       context.Background(),
 		UploadLimitMB: 15,
-		Threads:       1,
 		ContextSize:   512,
 		Debug:         true,
 	}

@@ -87,12 +90,24 @@ func WithCors(b bool) AppOption {
 	}
 }

+func WithCsrf(b bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.CSRF = b
+	}
+}
+
 func WithModelLibraryURL(url string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.ModelLibraryURL = url
 	}
 }

+func WithLibPath(path string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.LibPath = path
+	}
+}
+
 var EnableWatchDog = func(o *ApplicationConfig) {
 	o.WatchDog = true
 }

@@ -213,6 +228,9 @@ func WithUploadLimitMB(limit int) AppOption {

 func WithThreads(threads int) AppOption {
 	return func(o *ApplicationConfig) {
+		if threads == 0 { // 0 is not allowed
+			threads = xsysinfo.CPUPhysicalCores()
+		}
 		o.Threads = threads
 	}
 }

@@ -277,6 +295,12 @@ func WithApiKeys(apiKeys []string) AppOption {
 	}
 }

+func WithOpaqueErrors(opaque bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.OpaqueErrors = opaque
+	}
+}
+
 // ToConfigLoaderOptions returns a slice of ConfigLoader Option.
 // Some options defined at the application level are going to be passed as defaults for
 // all the configuration for the models.

@@ -289,6 +313,7 @@ func (o *ApplicationConfig) ToConfigLoaderOptions() []ConfigLoaderOption {
 		LoadOptionDebug(o.Debug),
 		LoadOptionF16(o.F16),
 		LoadOptionThreads(o.Threads),
+		ModelPath(o.ModelPath),
 	}
 }
@@ -2,6 +2,8 @@ package config

 import (
+	"os"
+	"regexp"
 	"strings"

 	"github.com/go-skynet/LocalAI/core/schema"
 	"github.com/go-skynet/LocalAI/pkg/downloader"

@@ -13,6 +15,15 @@ const (
 	RAND_SEED = -1
 )

+type TTSConfig struct {
+
+	// Voice wav path or id
+	Voice string `yaml:"voice"`
+
+	// Vall-e-x
+	VallE VallE `yaml:"vall-e"`
+}
+
 type BackendConfig struct {
 	schema.PredictionOptions `yaml:"parameters"`
 	Name                     string `yaml:"name"`

@@ -25,9 +36,11 @@ type BackendConfig struct {
 	Backend        string         `yaml:"backend"`
 	TemplateConfig TemplateConfig `yaml:"template"`

-	PromptStrings, InputStrings                []string `yaml:"-"`
-	InputToken                                 [][]int  `yaml:"-"`
-	functionCallString, functionCallNameString string   `yaml:"-"`
+	PromptStrings, InputStrings                []string               `yaml:"-"`
+	InputToken                                 [][]int                `yaml:"-"`
+	functionCallString, functionCallNameString string                 `yaml:"-"`
+	ResponseFormat                             string                 `yaml:"-"`
+	ResponseFormatMap                          map[string]interface{} `yaml:"-"`

 	FunctionsConfig functions.FunctionsConfig `yaml:"function"`

@@ -45,8 +58,8 @@ type BackendConfig struct {
 	// GRPC Options
 	GRPC GRPC `yaml:"grpc"`

-	// Vall-e-x
-	VallE VallE `yaml:"vall-e"`
+	// TTS specifics
+	TTSConfig `yaml:"tts"`

 	// CUDA
 	// Explicitly enable CUDA or not (some backends might need it)

@@ -93,6 +106,8 @@ type Diffusers struct {
 	ControlNet string `yaml:"control_net"`
 }

+// LLMConfig is a struct that holds the configuration that are
+// generic for most of the LLM backends.
 type LLMConfig struct {
 	SystemPrompt string `yaml:"system_prompt"`
 	TensorSplit  string `yaml:"tensor_split"`

@@ -144,6 +159,7 @@ type LLMConfig struct {
 	YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
 }

+// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName    string `yaml:"model_base_name"`
 	Device           string `yaml:"device"`

@@ -151,13 +167,31 @@ type AutoGPTQ struct {
 	UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
 }

+// TemplateConfig is a struct that holds the configuration of the templating system
 type TemplateConfig struct {
-	Chat                 string `yaml:"chat"`
-	ChatMessage          string `yaml:"chat_message"`
-	Completion           string `yaml:"completion"`
-	Edit                 string `yaml:"edit"`
-	Functions            string `yaml:"function"`
-	UseTokenizerTemplate bool   `yaml:"use_tokenizer_template"`
+	// Chat is the template used in the chat completion endpoint
+	Chat string `yaml:"chat"`
+
+	// ChatMessage is the template used for chat messages
+	ChatMessage string `yaml:"chat_message"`
+
+	// Completion is the template used for completion requests
+	Completion string `yaml:"completion"`
+
+	// Edit is the template used for edit completion requests
+	Edit string `yaml:"edit"`
+
+	// Functions is the template used when tools are present in the client requests
+	Functions string `yaml:"function"`
+
+	// UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used.
+	// Note: this is mostly consumed for backends such as vllm and transformers
+	// that can use the tokenizers specified in the JSON config files of the models
+	UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`
+
+	// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
+	// It defaults to \n
+	JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
 }

 func (c *BackendConfig) SetFunctionCallString(s string) {

@@ -334,4 +368,41 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	if debug {
 		cfg.Debug = &trueV
 	}
+
+	guessDefaultsFromFile(cfg, lo.modelPath)
 }

+func (c *BackendConfig) Validate() bool {
+	downloadedFileNames := []string{}
+	for _, f := range c.DownloadFiles {
+		downloadedFileNames = append(downloadedFileNames, f.Filename)
+	}
+	validationTargets := []string{c.Backend, c.Model, c.MMProj}
+	validationTargets = append(validationTargets, downloadedFileNames...)
+	// Simple validation to make sure the model can be correctly loaded
+	for _, n := range validationTargets {
+		if n == "" {
+			continue
+		}
+		if strings.HasPrefix(n, string(os.PathSeparator)) ||
+			strings.Contains(n, "..") {
+			return false
+		}
+	}
+
+	if c.Name == "" {
+		return false
+	}
+
+	if c.Backend != "" {
+		// a regex that checks that is a string name with no special characters, except '-' and '_'
+		re := regexp.MustCompile(`^[a-zA-Z0-9-_]+$`)
+		return re.MatchString(c.Backend)
+	}
+
+	return true
+}
+
+func (c *BackendConfig) HasTemplate() bool {
+	return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != ""
+}
@@ -19,11 +19,20 @@ import (
|
||||
)
|
||||
|
||||
type BackendConfigLoader struct {
|
||||
configs map[string]BackendConfig
|
||||
configs map[string]BackendConfig
|
||||
modelPath string
|
||||
sync.Mutex
|
||||
}
|
||||
|
||||
func NewBackendConfigLoader(modelPath string) *BackendConfigLoader {
|
||||
return &BackendConfigLoader{
|
||||
configs: make(map[string]BackendConfig),
|
||||
modelPath: modelPath,
|
||||
}
|
||||
}
|
||||
|
||||
type LoadOptions struct {
|
||||
modelPath string
|
||||
debug bool
|
||||
threads, ctxSize int
|
||||
f16 bool
|
||||
@@ -47,6 +56,12 @@ func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
|
||||
}
|
||||
}
|
||||
|
||||
func ModelPath(modelPath string) ConfigLoaderOption {
|
||||
return func(o *LoadOptions) {
|
||||
o.modelPath = modelPath
|
||||
}
|
||||
}
|
||||
|
||||
func LoadOptionF16(f16 bool) ConfigLoaderOption {
|
||||
return func(o *LoadOptions) {
|
||||
o.f16 = f16
|
||||
@@ -61,46 +76,8 @@ func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
|
||||
}
|
||||
}
|
||||
|
||||
// Load a config file for a model
|
||||
func (cl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
|
||||
|
||||
// Load a config file if present after the model name
|
||||
cfg := &BackendConfig{
|
||||
PredictionOptions: schema.PredictionOptions{
|
||||
Model: modelName,
|
||||
},
|
||||
}
|
||||
|
||||
cfgExisting, exists := cl.GetBackendConfig(modelName)
|
||||
if exists {
|
||||
cfg = &cfgExisting
|
||||
} else {
|
||||
// Try loading a model config file
|
||||
modelConfig := filepath.Join(modelPath, modelName+".yaml")
|
||||
if _, err := os.Stat(modelConfig); err == nil {
|
||||
if err := cl.LoadBackendConfig(
|
||||
modelConfig, opts...,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
|
||||
}
|
||||
cfgExisting, exists = cl.GetBackendConfig(modelName)
|
||||
if exists {
|
||||
cfg = &cfgExisting
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cfg.SetDefaults(opts...)
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
func NewBackendConfigLoader() *BackendConfigLoader {
|
||||
return &BackendConfigLoader{
|
||||
configs: make(map[string]BackendConfig),
|
||||
}
|
||||
}
|
||||
func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
|
||||
// TODO: either in the next PR or the next commit, I want to merge these down into a single function that looks at the first few characters of the file to determine if we need to deserialize to []BackendConfig or BackendConfig
|
||||
func readMultipleBackendConfigsFromFile(file string, opts ...ConfigLoaderOption) ([]*BackendConfig, error) {
|
||||
c := &[]*BackendConfig{}
|
||||
f, err := os.ReadFile(file)
|
||||
if err != nil {
|
||||
@@ -117,7 +94,7 @@ func ReadBackendConfigFile(file string, opts ...ConfigLoaderOption) ([]*BackendC
|
||||
return *c, nil
|
||||
}
|
||||
|
||||
func ReadBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
|
||||
func readBackendConfigFromFile(file string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
|
||||
lo := &LoadOptions{}
|
||||
lo.Apply(opts...)
|
||||
|
||||
@@ -134,44 +111,86 @@ func ReadBackendConfig(file string, opts ...ConfigLoaderOption) (*BackendConfig,
|
||||
return c, nil
|
||||
}
|
||||
|
||||
func (cm *BackendConfigLoader) LoadBackendConfigFile(file string, opts ...ConfigLoaderOption) error {
|
||||
cm.Lock()
|
||||
defer cm.Unlock()
|
||||
c, err := ReadBackendConfigFile(file, opts...)
|
||||
// Load a config file for a model
|
||||
func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath string, opts ...ConfigLoaderOption) (*BackendConfig, error) {
|
||||
|
||||
// Load a config file if present after the model name
|
||||
cfg := &BackendConfig{
|
||||
PredictionOptions: schema.PredictionOptions{
|
||||
Model: modelName,
|
||||
},
|
||||
}
|
||||
|
||||
cfgExisting, exists := bcl.GetBackendConfig(modelName)
|
||||
if exists {
|
||||
cfg = &cfgExisting
|
||||
} else {
|
||||
// Try loading a model config file
|
||||
modelConfig := filepath.Join(modelPath, modelName+".yaml")
|
||||
if _, err := os.Stat(modelConfig); err == nil {
|
||||
if err := bcl.LoadBackendConfig(
|
||||
modelConfig, opts...,
|
||||
); err != nil {
|
||||
return nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
|
||||
}
|
||||
cfgExisting, exists = bcl.GetBackendConfig(modelName)
|
||||
if exists {
|
||||
cfg = &cfgExisting
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cfg.SetDefaults(opts...)
|
||||
|
||||
return cfg, nil
|
||||
}
|
||||
|
||||
// This format is currently only used when reading a single file at startup, passed in via ApplicationConfig.ConfigFile
|
||||
func (bcl *BackendConfigLoader) LoadMultipleBackendConfigsSingleFile(file string, opts ...ConfigLoaderOption) error {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
c, err := readMultipleBackendConfigsFromFile(file, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot load config file: %w", err)
|
||||
}
|
||||
|
||||
for _, cc := range c {
|
||||
cm.configs[cc.Name] = *cc
|
||||
if cc.Validate() {
|
||||
bcl.configs[cc.Name] = *cc
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (cl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
|
||||
cl.Lock()
|
||||
defer cl.Unlock()
|
||||
c, err := ReadBackendConfig(file, opts...)
|
||||
func (bcl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoaderOption) error {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
c, err := readBackendConfigFromFile(file, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("cannot read config file: %w", err)
|
||||
}
|
||||
|
||||
cl.configs[c.Name] = *c
|
||||
if c.Validate() {
|
||||
bcl.configs[c.Name] = *c
|
||||
} else {
|
 		return fmt.Errorf("config is not valid")
 	}

 	return nil
 }

-func (cl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
-	cl.Lock()
-	defer cl.Unlock()
-	v, exists := cl.configs[m]
+func (bcl *BackendConfigLoader) GetBackendConfig(m string) (BackendConfig, bool) {
+	bcl.Lock()
+	defer bcl.Unlock()
+	v, exists := bcl.configs[m]
 	return v, exists
 }

-func (cl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
-	cl.Lock()
-	defer cl.Unlock()
+func (bcl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
+	bcl.Lock()
+	defer bcl.Unlock()
 	var res []BackendConfig
-	for _, v := range cl.configs {
+	for _, v := range bcl.configs {
 		res = append(res, v)
 	}

@@ -182,26 +201,16 @@ func (cl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
 	return res
 }

-func (cl *BackendConfigLoader) RemoveBackendConfig(m string) {
-	cl.Lock()
-	defer cl.Unlock()
-	delete(cl.configs, m)
-}
-
-func (cl *BackendConfigLoader) ListBackendConfigs() []string {
-	cl.Lock()
-	defer cl.Unlock()
-	var res []string
-	for k := range cl.configs {
-		res = append(res, k)
-	}
-	return res
+func (bcl *BackendConfigLoader) RemoveBackendConfig(m string) {
+	bcl.Lock()
+	defer bcl.Unlock()
+	delete(bcl.configs, m)
 }

 // Preload prepare models if they are not local but url or huggingface repositories
-func (cl *BackendConfigLoader) Preload(modelPath string) error {
-	cl.Lock()
-	defer cl.Unlock()
+func (bcl *BackendConfigLoader) Preload(modelPath string) error {
+	bcl.Lock()
+	defer bcl.Unlock()

 	status := func(fileName, current, total string, percent float64) {
 		utils.DisplayDownloadFunction(fileName, current, total, percent)
@@ -223,7 +232,7 @@ func (cl *BackendConfigLoader) Preload(modelPath string) error {
 		}
 	}

-	for i, config := range cl.configs {
+	for i, config := range bcl.configs {

 		// Download files and verify their SHA
 		for i, file := range config.DownloadFiles {
@@ -252,10 +261,10 @@ func (cl *BackendConfigLoader) Preload(modelPath string) error {
 			}
 		}

-			cc := cl.configs[i]
+			cc := bcl.configs[i]
 			c := &cc
 			c.PredictionOptions.Model = modelFileName
-			cl.configs[i] = *c
+			bcl.configs[i] = *c
 		}

 		if config.IsMMProjURL() {
@@ -269,22 +278,22 @@ func (cl *BackendConfigLoader) Preload(modelPath string) error {
 			}
 		}

-			cc := cl.configs[i]
+			cc := bcl.configs[i]
 			c := &cc
 			c.MMProj = modelFileName
-			cl.configs[i] = *c
+			bcl.configs[i] = *c
 		}

-		if cl.configs[i].Name != "" {
-			glamText(fmt.Sprintf("**Model name**: _%s_", cl.configs[i].Name))
+		if bcl.configs[i].Name != "" {
+			glamText(fmt.Sprintf("**Model name**: _%s_", bcl.configs[i].Name))
 		}
-		if cl.configs[i].Description != "" {
+		if bcl.configs[i].Description != "" {
 			//glamText("**Description**")
-			glamText(cl.configs[i].Description)
+			glamText(bcl.configs[i].Description)
 		}
-		if cl.configs[i].Usage != "" {
+		if bcl.configs[i].Usage != "" {
 			//glamText("**Usage**")
-			glamText(cl.configs[i].Usage)
+			glamText(bcl.configs[i].Usage)
 		}
 	}
 	return nil
@@ -292,12 +301,12 @@ func (cl *BackendConfigLoader) Preload(modelPath string) error {

 // LoadBackendConfigsFromPath reads all the configurations of the models from a path
 // (non-recursive)
-func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
-	cm.Lock()
-	defer cm.Unlock()
+func (bcl *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
+	bcl.Lock()
+	defer bcl.Unlock()
 	entries, err := os.ReadDir(path)
 	if err != nil {
-		return err
+		return fmt.Errorf("cannot read directory '%s': %w", path, err)
 	}
 	files := make([]fs.FileInfo, 0, len(entries))
 	for _, entry := range entries {
@@ -313,9 +322,15 @@ func (cm *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...C
 			strings.HasPrefix(file.Name(), ".") {
 			continue
 		}
-		c, err := ReadBackendConfig(filepath.Join(path, file.Name()), opts...)
-		if err == nil {
-			cm.configs[c.Name] = *c
+		c, err := readBackendConfigFromFile(filepath.Join(path, file.Name()), opts...)
+		if err != nil {
+			log.Error().Err(err).Msgf("cannot read config file: %s", file.Name())
+			continue
+		}
+		if c.Validate() {
+			bcl.configs[c.Name] = *c
+		} else {
+			log.Error().Err(err).Msgf("config is not valid")
 		}
 	}
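Aside (not part of the diff): taken together with the new tests below, the loader behavior after this change can be sketched as a minimal, hypothetical usage example. It assumes only the signatures exercised by the new tests (NewBackendConfigLoader taking the models path, LoadBackendConfigsFromPath, GetAllBackendConfigs); the ./models path is a placeholder.

package main

import (
	"fmt"

	"github.com/go-skynet/LocalAI/core/config"
)

func main() {
	// Load every model YAML under ./models. Per the diff above, files that
	// fail to parse or whose Validate() returns false are now logged and
	// skipped, so one bad YAML no longer poisons the whole directory load.
	bcl := config.NewBackendConfigLoader("./models")
	if err := bcl.LoadBackendConfigsFromPath("./models"); err != nil {
		panic(err)
	}
	for _, cfg := range bcl.GetAllBackendConfigs() {
		fmt.Println(cfg.Name)
	}
}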
core/config/backend_config_test.go (new file, 63 lines)
@@ -0,0 +1,63 @@
package config

import (
	"io"
	"net/http"
	"os"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

var _ = Describe("Test cases for config related functions", func() {
	Context("Test Read configuration functions", func() {
		It("Test Validate", func() {
			tmp, err := os.CreateTemp("", "config.yaml")
			Expect(err).To(BeNil())
			defer os.Remove(tmp.Name())
			_, err = tmp.WriteString(
				`backend: "foo-bar"
parameters:
  model: "foo-bar"`)
			Expect(err).ToNot(HaveOccurred())
			config, err := readBackendConfigFromFile(tmp.Name())
			Expect(err).To(BeNil())
			Expect(config).ToNot(BeNil())
			Expect(config.Validate()).To(BeFalse())
		})
		It("Test Validate", func() {
			tmp, err := os.CreateTemp("", "config.yaml")
			Expect(err).To(BeNil())
			defer os.Remove(tmp.Name())
			_, err = tmp.WriteString(
				`name: bar-baz
backend: "foo-bar"
parameters:
  model: "foo-bar"`)
			Expect(err).ToNot(HaveOccurred())
			config, err := readBackendConfigFromFile(tmp.Name())
			Expect(err).To(BeNil())
			Expect(config).ToNot(BeNil())
			// two configs in config.yaml
			Expect(config.Name).To(Equal("bar-baz"))
			Expect(config.Validate()).To(BeTrue())

			// download https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml
			httpClient := http.Client{}
			resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml")
			Expect(err).To(BeNil())
			defer resp.Body.Close()
			tmp, err = os.CreateTemp("", "config.yaml")
			Expect(err).To(BeNil())
			defer os.Remove(tmp.Name())
			_, err = io.Copy(tmp, resp.Body)
			Expect(err).To(BeNil())
			config, err = readBackendConfigFromFile(tmp.Name())
			Expect(err).To(BeNil())
			Expect(config).ToNot(BeNil())
			// two configs in config.yaml
			Expect(config.Name).To(Equal("hermes-2-pro-mistral"))
			Expect(config.Validate()).To(BeTrue())
		})
	})
})
core/config/config_suite_test.go (new file, 13 lines)
@@ -0,0 +1,13 @@
package config_test

import (
	"testing"

	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

func TestConfig(t *testing.T) {
	RegisterFailHandler(Fail)
	RunSpecs(t, "Config test suite")
}
@@ -1,10 +1,8 @@
-package config_test
+package config

 import (
 	"os"

-	. "github.com/go-skynet/LocalAI/core/config"
-
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
@@ -17,8 +15,8 @@ var _ = Describe("Test cases for config related functions", func() {

 	Context("Test Read configuration functions", func() {
 		configFile = os.Getenv("CONFIG_FILE")
-		It("Test ReadConfigFile", func() {
-			config, err := ReadBackendConfigFile(configFile)
+		It("Test readConfigFile", func() {
+			config, err := readMultipleBackendConfigsFromFile(configFile)
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			// two configs in config.yaml
@@ -27,26 +25,28 @@ var _ = Describe("Test cases for config related functions", func() {
 		})

 		It("Test LoadConfigs", func() {
-			cm := NewBackendConfigLoader()
-			opts := NewApplicationConfig()
-			err := cm.LoadBackendConfigsFromPath(opts.ModelPath)
-
+			bcl := NewBackendConfigLoader(os.Getenv("MODELS_PATH"))
+			err := bcl.LoadBackendConfigsFromPath(os.Getenv("MODELS_PATH"))
+
 			Expect(err).To(BeNil())
-			Expect(cm.ListBackendConfigs()).ToNot(BeNil())
+			configs := bcl.GetAllBackendConfigs()
+			loadedModelNames := []string{}
+			for _, v := range configs {
+				loadedModelNames = append(loadedModelNames, v.Name)
+			}
+			Expect(configs).ToNot(BeNil())

-			// config should includes gpt4all models's api.config
-			Expect(cm.ListBackendConfigs()).To(ContainElements("gpt4all"))
-
-			// config should includes gpt2 models's api.config
-			Expect(cm.ListBackendConfigs()).To(ContainElements("gpt4all-2"))
+			Expect(loadedModelNames).To(ContainElements("code-search-ada-code-001"))

 			// config should includes text-embedding-ada-002 models's api.config
-			Expect(cm.ListBackendConfigs()).To(ContainElements("text-embedding-ada-002"))
+			Expect(loadedModelNames).To(ContainElements("text-embedding-ada-002"))

 			// config should includes rwkv_test models's api.config
-			Expect(cm.ListBackendConfigs()).To(ContainElements("rwkv_test"))
+			Expect(loadedModelNames).To(ContainElements("rwkv_test"))

 			// config should includes whisper-1 models's api.config
-			Expect(cm.ListBackendConfigs()).To(ContainElements("whisper-1"))
+			Expect(loadedModelNames).To(ContainElements("whisper-1"))
 		})
 	})
 })
core/config/guesser.go (new file, 226 lines)
@@ -0,0 +1,226 @@
package config

import (
	"os"
	"path/filepath"
	"strings"

	"github.com/rs/zerolog/log"

	gguf "github.com/thxcode/gguf-parser-go"
)

type familyType uint8

const (
	Unknown familyType = iota
	LLaMa3
	CommandR
	Phi3
	ChatML
	Mistral03
	Gemma
)

type settingsConfig struct {
	StopWords      []string
	TemplateConfig TemplateConfig
}

// default settings to adopt with a given model family
var defaultsSettings map[familyType]settingsConfig = map[familyType]settingsConfig{
	Gemma: {
		StopWords: []string{"<|im_end|>", "<end_of_turn>", "<start_of_turn>"},
		TemplateConfig: TemplateConfig{
			Chat:        "{{.Input }}\n<|start_of_turn|>model\n",
			ChatMessage: "<|start_of_turn|>{{if eq .RoleName \"assistant\" }}model{{else}}{{ .RoleName }}{{end}}\n{{ if .Content -}}\n{{.Content -}}\n{{ end -}}<|end_of_turn|>",
			Completion:  "{{.Input}}",
		},
	},
	LLaMa3: {
		StopWords: []string{"<|eot_id|>"},
		TemplateConfig: TemplateConfig{
			Chat:        "<|begin_of_text|>{{.Input }}\n<|start_header_id|>assistant<|end_header_id|>",
			ChatMessage: "<|start_header_id|>{{ .RoleName }}<|end_header_id|>\n\n{{.Content }}<|eot_id|>",
		},
	},
	CommandR: {
		TemplateConfig: TemplateConfig{
			Chat: "{{.Input -}}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
			Functions: `<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
You are a function calling AI model, you can call the following functions:
## Available Tools
{{range .Functions}}
- {"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}
{{end}}
When using a tool, reply with JSON, for instance {"name": "tool_name", "arguments": {"param1": "value1", "param2": "value2"}}
<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Input -}}`,
			ChatMessage: `{{if eq .RoleName "user" -}}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "system" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "assistant" -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if eq .RoleName "tool" -}}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{.Content}}<|END_OF_TURN_TOKEN|>
{{- else if .FunctionCall -}}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{toJson .FunctionCall}}}<|END_OF_TURN_TOKEN|>
{{- end -}}`,
		},
		StopWords: []string{"<|END_OF_TURN_TOKEN|>"},
	},
	Phi3: {
		TemplateConfig: TemplateConfig{
			Chat:        "{{.Input}}\n<|assistant|>",
			ChatMessage: "<|{{ .RoleName }}|>\n{{.Content}}<|end|>",
			Completion:  "{{.Input}}",
		},
		StopWords: []string{"<|end|>", "<|endoftext|>"},
	},
	ChatML: {
		TemplateConfig: TemplateConfig{
			Chat: "{{.Input -}}\n<|im_start|>assistant",
			Functions: `<|im_start|>system
You are a function calling AI model. You are provided with functions to execute. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
For each function call return a json object with function name and arguments
<|im_end|>
{{.Input -}}
<|im_start|>assistant`,
			ChatMessage: `<|im_start|>{{ .RoleName }}
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content }}
{{ end -}}
{{ if .FunctionCall -}}
{{toJson .FunctionCall}}
{{ end -}}<|im_end|>`,
		},
		StopWords: []string{"<|im_end|>", "<dummy32000>", "</s>"},
	},
	Mistral03: {
		TemplateConfig: TemplateConfig{
			Chat:      "{{.Input -}}",
			Functions: `[AVAILABLE_TOOLS] [{{range .Functions}}{"type": "function", "function": {"name": "{{.Name}}", "description": "{{.Description}}", "parameters": {{toJson .Parameters}} }}{{end}} ] [/AVAILABLE_TOOLS]{{.Input }}`,
			ChatMessage: `{{if eq .RoleName "user" -}}
[INST] {{.Content }} [/INST]
{{- else if .FunctionCall -}}
[TOOL_CALLS] {{toJson .FunctionCall}} [/TOOL_CALLS]
{{- else if eq .RoleName "tool" -}}
[TOOL_RESULTS] {{.Content}} [/TOOL_RESULTS]
{{- else -}}
{{ .Content -}}
{{ end -}}`,
		},
		StopWords: []string{"<|im_end|>", "<dummy32000>", "</tool_call>", "<|eot_id|>", "<|end_of_text|>", "</s>", "[/TOOL_CALLS]", "[/ACTIONS]"},
	},
}

// this maps well known template used in HF to model families defined above
var knownTemplates = map[string]familyType{
	`{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}`: ChatML,
	`{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}`: Mistral03,
}

func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {

	if os.Getenv("LOCALAI_DISABLE_GUESSING") == "true" {
		log.Debug().Msgf("guessDefaultsFromFile: %s", "guessing disabled with LOCALAI_DISABLE_GUESSING")
		return
	}

	if modelPath == "" {
		log.Debug().Msgf("guessDefaultsFromFile: %s", "modelPath is empty")
		return
	}

	if cfg.HasTemplate() {
		// nothing to guess here
		log.Debug().Any("name", cfg.Name).Msgf("guessDefaultsFromFile: %s", "template already set")
		return
	}

	// We try to guess only if we don't have a template defined already
	f, err := gguf.ParseGGUFFile(filepath.Join(modelPath, cfg.ModelFileName()))
	if err != nil {
		// Only valid for gguf files
		log.Debug().Msgf("guessDefaultsFromFile: %s", "not a GGUF file")
		return
	}

	log.Debug().
		Any("eosTokenID", f.Tokenizer().EOSTokenID).
		Any("bosTokenID", f.Tokenizer().BOSTokenID).
		Any("modelName", f.Model().Name).
		Any("architecture", f.Architecture().Architecture).Msgf("Model file loaded: %s", cfg.ModelFileName())

	// guess the name
	if cfg.Name == "" {
		cfg.Name = f.Model().Name
	}

	family := identifyFamily(f)

	if family == Unknown {
		log.Debug().Msgf("guessDefaultsFromFile: %s", "family not identified")
		return
	}

	// identify template
	settings, ok := defaultsSettings[family]
	if ok {
		cfg.TemplateConfig = settings.TemplateConfig
		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: guessed template %+v", cfg.TemplateConfig)
		if len(cfg.StopWords) == 0 {
			cfg.StopWords = settings.StopWords
		}
	} else {
		log.Debug().Any("family", family).Msgf("guessDefaultsFromFile: no template found for family")
	}
}

func identifyFamily(f *gguf.GGUFFile) familyType {

	// identify from well known templates first
	chatTemplate, found := f.Header.MetadataKV.Get("tokenizer.chat_template")
	if found && chatTemplate.ValueString() != "" {
		if family, ok := knownTemplates[chatTemplate.ValueString()]; ok {
			return family
		}
	}

	// otherwise try to identify from the model properties
	arch := f.Architecture().Architecture
	eosTokenID := f.Tokenizer().EOSTokenID
	bosTokenID := f.Tokenizer().BOSTokenID

	isYI := arch == "llama" && bosTokenID == 1 && eosTokenID == 2
	// WTF! Mistral0.3 and isYi have same bosTokenID and eosTokenID

	llama3 := arch == "llama" && eosTokenID == 128009
	commandR := arch == "command-r" && eosTokenID == 255001
	qwen2 := arch == "qwen2"
	phi3 := arch == "phi-3"
	gemma := strings.HasPrefix(f.Model().Name, "gemma")

	switch {
	case gemma:
		return Gemma
	case llama3:
		return LLaMa3
	case commandR:
		return CommandR
	case phi3:
		return Phi3
	case qwen2, isYI:
		return ChatML
	default:
		return Unknown
	}
}
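Aside (not part of the diff): the guesser above keys off a handful of GGUF metadata fields. A small, hypothetical sketch that dumps exactly those fields, using only the gguf-parser-go accessors that appear in guesser.go; the model path is a placeholder.

package main

import (
	"fmt"

	gguf "github.com/thxcode/gguf-parser-go"
)

func main() {
	// Inspect the metadata the family identification relies on:
	// architecture, model name, BOS/EOS token ids, and the embedded chat template.
	f, err := gguf.ParseGGUFFile("./models/example.gguf") // placeholder path
	if err != nil {
		panic(err) // only valid for GGUF files
	}
	fmt.Println("architecture:", f.Architecture().Architecture)
	fmt.Println("model name:", f.Model().Name)
	fmt.Println("bos/eos ids:", f.Tokenizer().BOSTokenID, f.Tokenizer().EOSTokenID)
	if tmpl, found := f.Header.MetadataKV.Get("tokenizer.chat_template"); found {
		fmt.Println("chat template:", tmpl.ValueString())
	}
}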
core/dependencies_manager/manager.go (new file, 46 lines)
@@ -0,0 +1,46 @@
package main

import (
	"fmt"
	"os"
	"path/filepath"

	"github.com/go-skynet/LocalAI/pkg/downloader"
	"github.com/go-skynet/LocalAI/pkg/utils"
	"gopkg.in/yaml.v3"
)

type Asset struct {
	FileName string `yaml:"filename"`
	URL      string `yaml:"url"`
	SHA      string `yaml:"sha"`
}

func main() {

	// read the YAML file which contains a list of assets
	// and download them in the asset path
	assets := []Asset{}

	assetFile := os.Args[1]
	destPath := os.Args[2]

	// read the YAML file
	f, err := os.ReadFile(assetFile)
	if err != nil {
		panic(err)
	}
	// unmarshal the YAML data into a struct
	if err := yaml.Unmarshal(f, &assets); err != nil {
		panic(err)
	}

	// download the assets
	for _, asset := range assets {
		if err := downloader.DownloadFile(asset.URL, filepath.Join(destPath, asset.FileName), asset.SHA, 1, 1, utils.DisplayDownloadFunction); err != nil {
			panic(err)
		}
	}

	fmt.Println("Finished downloading assets")
}
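Aside (not part of the diff): the asset file this manager reads is a YAML list matching the Asset struct tags above, so something along these lines, with placeholder values:

- filename: libfoo.so
  url: https://example.com/libfoo.so
  sha: 0123456789abcdef

Invocation would then look like `go run ./core/dependencies_manager assets.yaml ./backend-assets`, since the program takes the asset file as os.Args[1] and the destination directory as os.Args[2]; both paths here are illustrative, not taken from the repository's build scripts.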
@@ -20,6 +20,7 @@ import (
 	"github.com/gofiber/contrib/fiberzerolog"
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/cors"
+	"github.com/gofiber/fiber/v2/middleware/csrf"
 	"github.com/gofiber/fiber/v2/middleware/favicon"
 	"github.com/gofiber/fiber/v2/middleware/filesystem"
 	"github.com/gofiber/fiber/v2/middleware/recover"
@@ -65,15 +66,19 @@ var embedDirStatic embed.FS
 // @name Authorization

 func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (*fiber.App, error) {
-	// Return errors as JSON responses
-	app := fiber.New(fiber.Config{
+	fiberCfg := fiber.Config{
 		Views:     renderEngine(),
 		BodyLimit: appConfig.UploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
 		// We disable the Fiber startup message as it does not conform to structured logging.
 		// We register a startup log line with connection information in the OnListen hook to keep things user friendly though
 		DisableStartupMessage: true,
-		// Override default error handler
-		ErrorHandler: func(ctx *fiber.Ctx, err error) error {
+	}
+
+	if !appConfig.OpaqueErrors {
+		// Normally, return errors as JSON responses
+		fiberCfg.ErrorHandler = func(ctx *fiber.Ctx, err error) error {
 			// Status code defaults to 500
 			code := fiber.StatusInternalServerError

@@ -89,8 +94,15 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
 					Error: &schema.APIError{Message: err.Error(), Code: code},
 				},
 			)
-		},
-	})
+		}
+	} else {
+		// If OpaqueErrors are required, replace everything with a blank 500.
+		fiberCfg.ErrorHandler = func(ctx *fiber.Ctx, _ error) error {
+			return ctx.Status(500).SendString("")
+		}
+	}
+
+	app := fiber.New(fiberCfg)

 	app.Hooks().OnListen(func(listenData fiber.ListenData) error {
 		scheme := "http"
@@ -167,12 +179,17 @@ func App(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *confi
 		app.Use(c)
 	}

+	if appConfig.CSRF {
+		log.Debug().Msg("Enabling CSRF middleware. Tokens are now required for state-modifying requests")
+		app.Use(csrf.New())
+	}
+
 	// Load config jsons
 	utils.LoadConfig(appConfig.UploadDir, openai.UploadedFilesFile, &openai.UploadedFiles)
 	utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsConfigFile, &openai.Assistants)
 	utils.LoadConfig(appConfig.ConfigsDir, openai.AssistantsFileConfigFile, &openai.AssistantFiles)

-	galleryService := services.NewGalleryService(appConfig.ModelPath)
+	galleryService := services.NewGalleryService(appConfig)
 	galleryService.Start(appConfig.Context, cl)

 	routes.RegisterElevenLabsRoutes(app, cl, ml, appConfig, auth)
@@ -73,7 +73,8 @@ func getModelStatus(url string) (response map[string]interface{}) {
 }

 func getModels(url string) (response []gallery.GalleryModel) {
-	downloader.GetURI(url, func(url string, i []byte) error {
+	// TODO: No tests currently seem to exercise file:// urls. Fix?
+	downloader.GetURI(url, "", func(url string, i []byte) error {
 		// Unmarshal YAML data into a struct
 		return json.Unmarshal(i, &response)
 	})
@@ -221,6 +222,8 @@ var _ = Describe("API test", func() {
 			Expect(err).ToNot(HaveOccurred())

 			modelDir = filepath.Join(tmpdir, "models")
+			err = os.Mkdir(modelDir, 0750)
+			Expect(err).ToNot(HaveOccurred())
 			backendAssetsDir := filepath.Join(tmpdir, "backend-assets")
 			err = os.Mkdir(backendAssetsDir, 0750)
 			Expect(err).ToNot(HaveOccurred())
@@ -241,13 +244,13 @@ var _ = Describe("API test", func() {
 			}
 			out, err := yaml.Marshal(g)
 			Expect(err).ToNot(HaveOccurred())
-			err = os.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0600)
+			err = os.WriteFile(filepath.Join(modelDir, "gallery_simple.yaml"), out, 0600)
 			Expect(err).ToNot(HaveOccurred())

 			galleries := []gallery.Gallery{
 				{
 					Name: "test",
-					URL:  "file://" + filepath.Join(tmpdir, "gallery_simple.yaml"),
+					URL:  "file://" + filepath.Join(modelDir, "gallery_simple.yaml"),
 				},
 			}
@@ -243,13 +243,13 @@ func ListModels(models []*gallery.GalleryModel, processing *xsync.SyncedMap[stri
 			},
 			elem.H5(
 				attrs.Props{
-					"class": "mb-2 text-xl font-medium leading-tight",
+					"class": "mb-2 text-xl font-bold leading-tight",
 				},
 				elem.Text(m.Name),
 			),
 			elem.P(
 				attrs.Props{
-					"class": "mb-4 text-base",
+					"class": "mb-4 text-sm [&:not(:hover)]:truncate text-base",
 				},
 				elem.Text(m.Description),
 			),
@@ -52,7 +52,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 	}
 	log.Debug().Msgf("Request for model: %s", modelFile)

-	filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, voiceID, ml, appConfig, *cfg)
+	filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)
 	if err != nil {
 		return err
 	}
@@ -12,10 +12,13 @@ import (
 )

 // TTSEndpoint is the OpenAI Speech API endpoint https://platform.openai.com/docs/api-reference/audio/createSpeech
-// @Summary Generates audio from the input text.
-// @Param request body schema.TTSRequest true "query params"
-// @Success 200 {string} binary "Response"
-// @Router /v1/audio/speech [post]
+// @Summary Generates audio from the input text.
+// @Accept json
+// @Produce audio/x-wav
+// @Param request body schema.TTSRequest true "query params"
+// @Success 200 {string} binary "generated audio/wav file"
+// @Router /v1/audio/speech [post]
+// @Router /tts [post]
 func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {

@@ -40,6 +43,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 	)

 	if err != nil {
+		log.Err(err)
 		modelFile = input.Model
 		log.Warn().Msgf("Model not found in context: %s", input.Model)
 	} else {
@@ -51,7 +55,15 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 		cfg.Backend = input.Backend
 	}

-	filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, input.Voice, ml, appConfig, *cfg)
+	if input.Language != "" {
+		cfg.Language = input.Language
+	}
+
+	if input.Voice != "" {
+		cfg.Voice = input.Voice
+	}
+
+	filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
 	if err != nil {
 		return err
 	}
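Aside (not part of the diff): per the hunk above, /v1/audio/speech now honors per-request voice and language overrides. A hedged client sketch follows; the JSON field names are inferred from the schema.TTSRequest fields referenced in the diff (Model, Input, Voice, Language) and may not match the wire format exactly, and the model name is a placeholder.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"os"
)

func main() {
	// Assumed wire format for schema.TTSRequest; field names are a guess.
	body := []byte(`{"model": "tts-model", "input": "Hello from LocalAI", "voice": "some-voice", "language": "en"}`)
	resp, err := http.Post("http://localhost:8080/v1/audio/speech", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, err := os.Create("speech.wav") // endpoint declares @Produce audio/x-wav
	if err != nil {
		panic(err)
	}
	defer out.Close()
	n, _ := io.Copy(out, resp.Body)
	fmt.Println("wrote", n, "bytes")
}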
@@ -25,7 +25,7 @@ import (
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/chat/completions [post]
 func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startupOptions *config.ApplicationConfig) func(c *fiber.Ctx) error {
-	emptyMessage := ""
+	textContentToReturn := ""
 	id := uuid.New().String()
 	created := int(time.Now().Unix())

@@ -34,7 +34,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			ID:      id,
 			Created: created,
 			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
+			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &textContentToReturn}}},
 			Object:  "chat.completion.chunk",
 		}
 		responses <- initialMessage
@@ -67,7 +67,10 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			return true
 		})

+		textContentToReturn = functions.ParseTextContent(result, config.FunctionsConfig)
+		result = functions.CleanupLLMResult(result, config.FunctionsConfig)
 		results := functions.ParseFunctionCall(result, config.FunctionsConfig)
+		log.Debug().Msgf("Text content to return: %s", textContentToReturn)
 		noActionToRun := len(results) > 0 && results[0].Name == noAction || len(results) == 0

 		switch {
@@ -76,7 +79,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 				ID:      id,
 				Created: created,
 				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
+				Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &textContentToReturn}}},
 				Object:  "chat.completion.chunk",
 			}
 			responses <- initialMessage
@@ -134,7 +137,8 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 					Model: req.Model, // we have to return what the user sent here, due to OpenAI spec.
 					Choices: []schema.Choice{{
 						Delta: &schema.Message{
-							Role: "assistant",
+							Role:    "assistant",
+							Content: &textContentToReturn,
 							ToolCalls: []schema.ToolCall{
 								{
 									Index: i,
@@ -181,8 +185,13 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
 		}

-		if input.ResponseFormat.Type == "json_object" {
-			input.Grammar = functions.JSONBNF
+		if config.ResponseFormatMap != nil {
+			d := schema.ChatCompletionResponseFormat{}
+			dat, _ := json.Marshal(config.ResponseFormatMap)
+			_ = json.Unmarshal(dat, &d)
+			if d.Type == "json_object" {
+				input.Grammar = functions.JSONBNF
+			}
 		}

 		config.Grammar = input.Grammar
@@ -192,7 +201,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 		}

 		switch {
-		case !config.FunctionsConfig.NoGrammar && shouldUseFn:
+		case !config.FunctionsConfig.GrammarConfig.NoGrammar && shouldUseFn:
 			noActionGrammar := functions.Function{
 				Name:        noActionName,
 				Description: noActionDescription,
@@ -219,15 +228,15 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			// Handle if we should return "name" instead of "functions"
 			if config.FunctionsConfig.FunctionName {
 				jsStruct := funcs.ToJSONNameStructure()
-				config.Grammar = jsStruct.Grammar("", config.FunctionsConfig.ParallelCalls)
+				config.Grammar = jsStruct.Grammar(config.FunctionsConfig.GrammarConfig.Options()...)
 			} else {
 				jsStruct := funcs.ToJSONFunctionStructure()
-				config.Grammar = jsStruct.Grammar("", config.FunctionsConfig.ParallelCalls)
+				config.Grammar = jsStruct.Grammar(config.FunctionsConfig.GrammarConfig.Options()...)
 			}
 		case input.JSONFunctionGrammarObject != nil:
-			config.Grammar = input.JSONFunctionGrammarObject.Grammar("", config.FunctionsConfig.ParallelCalls)
+			config.Grammar = input.JSONFunctionGrammarObject.Grammar(config.FunctionsConfig.GrammarConfig.Options()...)
 		case input.JSONFunctionGrammarObjectName != nil:
-			config.Grammar = input.JSONFunctionGrammarObjectName.Grammar("", config.FunctionsConfig.ParallelCalls)
+			config.Grammar = input.JSONFunctionGrammarObjectName.Grammar(config.FunctionsConfig.GrammarConfig.Options()...)
 		default:
 			// Force picking one of the functions by the request
 			if config.FunctionToCall() != "" {
@@ -349,7 +358,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			mess = append(mess, content)
 		}

-		predInput = strings.Join(mess, "\n")
+		joinCharacter := "\n"
+		if config.TemplateConfig.JoinChatMessagesByCharacter != nil {
+			joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter
+		}
+
+		predInput = strings.Join(mess, joinCharacter)
 		log.Debug().Msgf("Prompt (before templating): %s", predInput)

 		templateFile := ""
@@ -423,7 +437,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 				if err != nil {
 					log.Debug().Msgf("Sending chunk failed: %v", err)
 					input.Cancel()
-					break
 				}
 				w.Flush()
 			}
@@ -443,7 +456,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			{
 				FinishReason: finishReason,
 				Index:        0,
-				Delta:        &schema.Message{Content: &emptyMessage},
+				Delta:        &schema.Message{Content: &textContentToReturn},
 			}},
 			Object: "chat.completion.chunk",
 			Usage:  *usage,
@@ -465,7 +478,10 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			return
 		}

+		textContentToReturn = functions.ParseTextContent(s, config.FunctionsConfig)
+		s = functions.CleanupLLMResult(s, config.FunctionsConfig)
 		results := functions.ParseFunctionCall(s, config.FunctionsConfig)
+		log.Debug().Msgf("Text content to return: %s", textContentToReturn)
 		noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0

 		switch {
@@ -493,6 +509,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 			if len(input.Tools) > 0 {
 				// If we are using tools, we condense the function calls into
 				// a single response choice with all the tools
+				toolChoice.Message.Content = textContentToReturn
 				toolChoice.Message.ToolCalls = append(toolChoice.Message.ToolCalls,
 					schema.ToolCall{
 						ID: id,
@@ -508,7 +525,8 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 				*c = append(*c, schema.Choice{
 					FinishReason: "function_call",
 					Message: &schema.Message{
-						Role: "assistant",
+						Role:         "assistant",
+						Content:      &textContentToReturn,
 						FunctionCall: map[string]interface{}{
 							"name":      name,
 							"arguments": args,
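Aside (not part of the diff): the json_object path above is what OpenAI-style JSON mode maps to; when the resolved response format has type "json_object", the handler swaps the grammar for functions.JSONBNF, constraining output to valid JSON. A hedged request sketch, assuming the request's response_format is what populates config.ResponseFormatMap (the mapping itself is not shown in this diff) and using a placeholder model name:

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// OpenAI-shaped chat completion request asking for JSON mode.
	body := []byte(`{
		"model": "hermes-2-pro-mistral",
		"response_format": {"type": "json_object"},
		"messages": [{"role": "user", "content": "Give me a JSON object with a greeting"}]
	}`)
	resp, err := http.Post("http://localhost:8080/v1/chat/completions", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}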
Some files were not shown because too many files have changed in this diff.