fix(ci-gmock): try again by injecting gmock in CMakefiles

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
models(gallery): add hathor_tahsin-l3-8b-v0.85 (#2762 )
2026-05-24 16:51:44 -04:00 · 2024-07-10 23:04:19 +02:00 · 2024-07-10 15:44:56 +02:00 · 2024-07-10 15:28:39 +02:00 · 2024-07-10 15:23:23 +02:00 · 2024-07-10 13:18:32 +02:00
253 changed files with 8892 additions and 2068 deletions
--- a/.github/check_and_update.py
+++ b/.github/check_and_update.py
@@ -0,0 +1,79 @@
+import hashlib
+from huggingface_hub import hf_hub_download, get_paths_info
+import requests
+import sys
+import os
+
+uri = sys.argv[0]
+file_name = uri.split('/')[-1]
+
+# Function to parse the URI and determine download method
+def parse_uri(uri):
+    if uri.startswith('huggingface://'):
+        repo_id = uri.split('://')[1]
+        return 'huggingface', repo_id.rsplit('/', 1)[0]
+    elif 'huggingface.co' in uri:
+        parts = uri.split('/resolve/')
+        if len(parts) > 1:
+            repo_path = parts[0].split('https://huggingface.co/')[-1]
+            return 'huggingface', repo_path
+    return 'direct', uri
+
+def calculate_sha256(file_path):
+    sha256_hash = hashlib.sha256()
+    with open(file_path, 'rb') as f:
+        for byte_block in iter(lambda: f.read(4096), b''):
+            sha256_hash.update(byte_block)
+    return sha256_hash.hexdigest()
+
+def manual_safety_check_hf(repo_id):
+    scanResponse = requests.get('https://huggingface.co/api/models/' + repo_id + "/scan")
+    scan = scanResponse.json()
+    if scan['hasUnsafeFile']:
+        return scan
+    return None
+
+download_type, repo_id_or_url = parse_uri(uri)
+
+new_checksum =  None
+
+# Decide download method based on URI type
+if download_type == 'huggingface':
+    # Check if the repo is flagged as dangerous by HF
+    hazard = manual_safety_check_hf(repo_id_or_url)
+    if hazard != None:
+        print(f'Error: HuggingFace has detected security problems for {repo_id_or_url}: {str(hazard)}', filename=file_name)
+        sys.exit(5)
+    # Use HF API to pull sha
+    for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
+        try:
+            new_checksum = file.lfs.sha256
+            break
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+    if new_checksum is None:
+        try:
+            file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+else:
+    response = requests.get(repo_id_or_url)
+    if response.status_code == 200:
+        with open(file_name, 'wb') as f:
+            f.write(response.content)
+        file_path = file_name
+    elif response.status_code == 404:
+        print(f'File not found: {response.status_code}', file=sys.stderr)
+        sys.exit(2)
+    else:
+        print(f'Error downloading file: {response.status_code}', file=sys.stderr)
+        sys.exit(1)
+
+if new_checksum is None:
+    new_checksum = calculate_sha256(file_path)
+    print(new_checksum)
+    os.remove(file_path)
+else:
+    print(new_checksum)
--- a/.github/checksum_checker.sh
+++ b/.github/checksum_checker.sh
@@ -14,77 +14,14 @@ function check_and_update_checksum() {
    idx="$5"

    # Download the file and calculate new checksum using Python
-    new_checksum=$(python3 -c "
-import hashlib
-from huggingface_hub import hf_hub_download, get_paths_info
-import requests
-import sys
-import os
+    new_checksum=$(python3 ./check_and_update.py $uri)
+    result=$?

-uri = '$uri'
-file_name = uri.split('/')[-1]
-
-# Function to parse the URI and determine download method
-# Function to parse the URI and determine download method
-def parse_uri(uri):
-    if uri.startswith('huggingface://'):
-        repo_id = uri.split('://')[1]
-        return 'huggingface', repo_id.rsplit('/', 1)[0]
-    elif 'huggingface.co' in uri:
-        parts = uri.split('/resolve/')
-        if len(parts) > 1:
-            repo_path = parts[0].split('https://huggingface.co/')[-1]
-            return 'huggingface', repo_path
-    return 'direct', uri
-
-def calculate_sha256(file_path):
-    sha256_hash = hashlib.sha256()
-    with open(file_path, 'rb') as f:
-        for byte_block in iter(lambda: f.read(4096), b''):
-            sha256_hash.update(byte_block)
-    return sha256_hash.hexdigest()
-
-download_type, repo_id_or_url = parse_uri(uri)
-
-new_checksum =  None
-
-# Decide download method based on URI type
-if download_type == 'huggingface':
-    # Use HF API to pull sha
-    for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
-        try:
-            new_checksum = file.lfs.sha256
-            break
-        except Exception as e:
-            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
-            sys.exit(2)
-    if new_checksum is None:
-        try:
-            file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
-        except Exception as e:
-            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
-            sys.exit(2)
-else:
-    response = requests.get(repo_id_or_url)
-    if response.status_code == 200:
-        with open(file_name, 'wb') as f:
-            f.write(response.content)
-        file_path = file_name
-    elif response.status_code == 404:
-        print(f'File not found: {response.status_code}', file=sys.stderr)
-        sys.exit(2)
-    else:
-        print(f'Error downloading file: {response.status_code}', file=sys.stderr)
-        sys.exit(1)
-
-if new_checksum is None:
-    new_checksum = calculate_sha256(file_path)
-    print(new_checksum)
-    os.remove(file_path)
-else:
-    print(new_checksum)
-
-")
+    if [[ result -eq 5]]; then
+        echo "Contaminated entry detected, deleting entry for $model_name..."
+        yq eval -i "del([$idx])" "$input_yaml"
+        return
+    fi

    if [[ "$new_checksum" == "" ]]; then
        echo "Error calculating checksum for $file_name. Skipping..."
@@ -94,7 +31,7 @@ else:
    echo "Checksum for $file_name: $new_checksum"

    # Compare and update the YAML file if checksums do not match
-    result=$?
+    
    if [[ $result -eq 2 ]]; then
        echo "File not found, deleting entry for $file_name..."
        # yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\"))" "$input_yaml"
--- a/.github/ci/modelslist.go
+++ b/.github/ci/modelslist.go
@@ -0,0 +1,297 @@
+package main
+
+import (
+	"fmt"
+	"html/template"
+	"io/ioutil"
+	"os"
+
+	"gopkg.in/yaml.v3"
+)
+
+var modelPageTemplate string = `
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>LocalAI models</title>
+    <link href="https://cdnjs.cloudflare.com/ajax/libs/flowbite/2.3.0/flowbite.min.css" rel="stylesheet" />
+    <script src="https://cdn.jsdelivr.net/npm/vanilla-lazyload@19.1.3/dist/lazyload.min.js"></script>
+
+    <link
+    rel="stylesheet"
+    href="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.8.0/build/styles/default.min.css"
+  />
+    <script
+    defer
+    src="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.8.0/build/highlight.min.js"
+  ></script>
+    <script
+    defer
+    src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"
+  ></script>
+  <script
+    defer
+    src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"
+  ></script>
+  <script
+    defer
+    src="https://cdn.jsdelivr.net/npm/dompurify@3.0.6/dist/purify.min.js"
+  ></script>
+
+  <link href="/static/general.css" rel="stylesheet" />
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Roboto:wght@400;500&display=swap" rel="stylesheet">
+    <link
+    href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700,900&display=swap"
+    rel="stylesheet" />
+  <link
+    rel="stylesheet"
+    href="https://cdn.jsdelivr.net/npm/tw-elements/css/tw-elements.min.css" />
+  <script src="https://cdn.tailwindcss.com/3.3.0"></script>
+  <script>
+    tailwind.config = {
+      darkMode: "class",
+      theme: {
+        fontFamily: {
+          sans: ["Roboto", "sans-serif"],
+          body: ["Roboto", "sans-serif"],
+          mono: ["ui-monospace", "monospace"],
+        },
+      },
+      corePlugins: {
+        preflight: false,
+      },
+    };
+  </script>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.1.1/css/all.min.css">
+    <script src="https://unpkg.com/htmx.org@1.9.12" integrity="sha384-ujb1lZYygJmzgSwoxRggbCHcjc0rB2XoQrxeTUQyRjrOnlCoYta87iKBWq3EsdM2" crossorigin="anonymous"></script>
+</head>
+
+<body class="bg-gray-900 text-gray-200">
+<div class="flex flex-col min-h-screen">
+
+<nav class="bg-gray-800 shadow-lg">
+    <div class="container mx-auto px-4 py-4">
+        <div class="flex items-center justify-between">
+            <div class="flex items-center">
+                <a href="/" class="text-white text-xl font-bold"><img src="https://github.com/mudler/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
+                <a href="/" class="text-white text-xl font-bold">LocalAI</a>
+            </div>
+            <!-- Menu button for small screens -->
+            <div class="lg:hidden">
+                <button id="menu-toggle" class="text-gray-400 hover:text-white focus:outline-none">
+                    <i class="fas fa-bars fa-lg"></i>
+                </button>
+            </div>
+            <!-- Navigation links -->
+            <div class="hidden lg:flex lg:items-center lg:justify-end lg:flex-1 lg:w-0">
+                <a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
+            </div>
+        </div>
+        <!-- Collapsible menu for small screens -->
+        <div class="hidden lg:hidden" id="mobile-menu">
+            <div class="pt-4 pb-3 border-t border-gray-700">
+
+                <a href="https://localai.io" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
+
+            </div>
+        </div>
+    </div>
+</nav>
+
+<style>
+  .is-hidden {
+	display: none;
+	  }
+</style>
+
+<div class="container mx-auto px-4 flex-grow">
+
+<div class="models mt-12">
+	<h2 class="text-center text-3xl font-semibold text-gray-100">
+	LocalAI model gallery list </h2><br>
+
+	<h2 class="text-center text-3xl font-semibold text-gray-100">
+
+	 🖼️ Available {{.AvailableModels}} models</i> <a href="https://localai.io/models/" target="_blank" >
+			<i class="fas fa-circle-info pr-2"></i>
+		</a></h2>
+
+	<h3>
+	Refer to the Model gallery <a href="https://localai.io/models/" target="_blank" ><i class="fas fa-circle-info pr-2"></i></a> for more information on how to use the models with LocalAI.<br>
+
+	You can install models with the CLI command <code>local-ai models install <model-name></code>. or by using the WebUI.
+	</h3>
+
+	<input class="form-control appearance-none block w-full mt-5 px-3 py-2 text-base font-normal text-gray-300 pb-2 mb-5 bg-gray-800 bg-clip-padding border border-solid border-gray-600 rounded transition ease-in-out m-0 focus:text-gray-300 focus:bg-gray-900 focus:border-blue-500 focus:outline-none" type="search"
+	id="searchbox" placeholder="Live search keyword..">
+	  <div class="dark grid grid-cols-1 grid-rows-1 md:grid-cols-3 block rounded-lg shadow-secondary-1 dark:bg-surface-dark">
+		{{ range $_, $model := .Models }}
+		<div class="box me-4 mb-2 block rounded-lg bg-white shadow-secondary-1  dark:bg-gray-800 dark:bg-surface-dark dark:text-white text-surface pb-2">
+		<div>
+		    {{ $icon := "https://upload.wikimedia.org/wikipedia/commons/6/65/No-Image-Placeholder.svg" }}
+			{{ if $model.Icon }}
+	  		{{ $icon = $model.Icon }}
+	  		{{ end }}
+			<div class="flex justify-center items-center">
+				<img data-src="{{ $icon }}" alt="{{$model.Name}}" class="rounded-t-lg max-h-48 max-w-96 object-cover mt-3 lazy">
+			</div>
+	  		<div class="p-6 text-surface dark:text-white">
+				<h5 class="mb-2 text-xl font-medium leading-tight">{{$model.Name}}</h5>
+
+
+				<p class="mb-4 text-base truncate">{{ $model.Description }}</p>
+
+			</div>
+			<div class="px-6 pt-4 pb-2">
+
+      <!-- Modal toggle -->
+      <button data-modal-target="{{ $model.Name}}-modal" data-modal-toggle="{{ $model.Name }}-modal" class="block text-white bg-blue-700 hover:bg-blue-800 focus:ring-4 focus:outline-none focus:ring-blue-300 font-medium rounded-lg text-sm px-5 py-2.5 text-center dark:bg-blue-600 dark:hover:bg-blue-700 dark:focus:ring-blue-800" type="button">
+        More info
+      </button>
+
+    <!-- Main modal -->
+    <div id="{{ $model.Name}}-modal" tabindex="-1" aria-hidden="true" class="hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full">
+        <div class="relative p-4 w-full max-w-2xl max-h-full">
+            <!-- Modal content -->
+            <div class="relative bg-white rounded-lg shadow dark:bg-gray-700">
+                <!-- Modal header -->
+                <div class="flex items-center justify-between p-4 md:p-5 border-b rounded-t dark:border-gray-600">
+                    <h3 class="text-xl font-semibold text-gray-900 dark:text-white">
+                        {{ $model.Name}}
+                    </h3>
+                    <button type="button" class="text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white" data-modal-hide="{{$model.Name}}-modal">
+                        <svg class="w-3 h-3" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 14 14">
+                            <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="m1 1 6 6m0 0 6 6M7 7l6-6M7 7l-6 6"/>
+                        </svg>
+                        <span class="sr-only">Close modal</span>
+                    </button>
+                </div>
+                <!-- Modal body -->
+                <div class="p-4 md:p-5 space-y-4">
+                    <div class="flex justify-center items-center">
+                    <img data-src="{{ $icon }}" alt="{{$model.Name}}" class="lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3">
+                  </div>
+
+                    <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
+                    {{ $model.Description }}
+
+                    </p>
+
+                    <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
+                    To install the model with the CLI, run: <br>
+                    <code> local-ai models install {{$model.Name}} </code> <br>
+
+                    <hr>
+                    See also <a href="https://localai.io/models/" target="_blank" >
+                    Installation <i class="fas fa-circle-info pr-2"></i>
+                    </a> to see how to install models with the REST API.
+                    </p>
+
+                    <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
+                    <ul>
+                    {{ range $_, $u := $model.URLs }}
+                    <li><a href="{{ $u }}" target=_blank><i class="fa-solid fa-link"></i> {{ $u }}</a></li>
+                    {{ end }}
+                    </ul>
+                    </p>
+                </div>
+                <!-- Modal footer -->
+                <div class="flex items-center p-4 md:p-5 border-t border-gray-200 rounded-b dark:border-gray-600">
+                    <button data-modal-hide="{{ $model.Name}}-modal" type="button" class="py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700">Close</button>
+                </div>
+            </div>
+        </div>
+    </div>
+
+
+			</div>
+		</div>
+		</div>
+		{{ end }}
+
+		</div>
+  </div>
+</div>
+
+<script>
+var lazyLoadInstance = new LazyLoad({
+  // Your custom settings go here
+});
+
+let cards = document.querySelectorAll('.box')
+
+function liveSearch() {
+    let search_query = document.getElementById("searchbox").value;
+
+    //Use innerText if all contents are visible
+    //Use textContent for including hidden elements
+    for (var i = 0; i < cards.length; i++) {
+        if(cards[i].textContent.toLowerCase()
+                .includes(search_query.toLowerCase())) {
+            cards[i].classList.remove("is-hidden");
+        } else {
+            cards[i].classList.add("is-hidden");
+        }
+    }
+}
+
+//A little delay
+let typingTimer;
+let typeInterval = 500;
+let searchInput = document.getElementById('searchbox');
+
+searchInput.addEventListener('keyup', () => {
+    clearTimeout(typingTimer);
+    typingTimer = setTimeout(liveSearch, typeInterval);
+});
+</script>
+
+</div>
+
+<script src="https://cdnjs.cloudflare.com/ajax/libs/flowbite/2.3.0/flowbite.min.js"></script>
+</body>
+</html>
+`
+
+type GalleryModel struct {
+	Name        string   `json:"name" yaml:"name"`
+	URLs        []string `json:"urls" yaml:"urls"`
+	Icon        string   `json:"icon" yaml:"icon"`
+	Description string   `json:"description" yaml:"description"`
+}
+
+func main() {
+	// read the YAML file which contains the models
+
+	f, err := ioutil.ReadFile(os.Args[1])
+	if err != nil {
+		fmt.Println("Error reading file:", err)
+		return
+	}
+
+	models := []*GalleryModel{}
+	err = yaml.Unmarshal(f, &models)
+	if err != nil {
+		// write to stderr
+		os.Stderr.WriteString("Error unmarshaling YAML: " + err.Error() + "\n")
+		return
+	}
+
+	// render the template
+	data := struct {
+		Models          []*GalleryModel
+		AvailableModels int
+	}{
+		Models:          models,
+		AvailableModels: len(models),
+	}
+	tmpl := template.Must(template.New("modelPage").Parse(modelPageTemplate))
+
+	err = tmpl.Execute(os.Stdout, data)
+	if err != nil {
+		fmt.Println("Error executing template:", err)
+		return
+	}
+}
--- a/.github/release.yml
+++ b/.github/release.yml
@@ -13,6 +13,9 @@ changelog:
      labels:
        - bug
        - regression
+    - title: "🖧 P2P area"
+      labels:
+         - area/p2p
    - title: Exciting New Features 🎉
      labels:
        - Semver-Minor
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -17,7 +17,7 @@ jobs:
        include:
          - grpc-base-image: ubuntu:22.04
            runs-on: 'ubuntu-latest'
-            platforms: 'linux/amd64'
+            platforms: 'linux/amd64,linux/arm64'
    runs-on: ${{matrix.runs-on}}
    steps:
      - name: Release space from worker
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -32,7 +32,7 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 4 || 8 }}
      matrix:
        include:
          - build-type: ''
@@ -46,7 +46,7 @@ jobs:
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "1"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-ffmpeg'
@@ -119,7 +119,7 @@ jobs:
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "1"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-ffmpeg-core'
@@ -127,4 +127,13 @@ jobs:
            image-type: 'core'
            runs-on: 'ubuntu-latest'
            base-image: "ubuntu:22.04"
-            makeflags: "--jobs=4 --output-sync=target"
+            makeflags: "--jobs=4 --output-sync=target"
+          - build-type: 'vulkan'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-vulkan-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -39,7 +39,7 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
      matrix:
        include:
          # Extra images
@@ -75,7 +75,7 @@ jobs:
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "1"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12'
@@ -100,7 +100,7 @@ jobs:
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "1"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-cublas-cuda12-ffmpeg'
@@ -232,7 +232,7 @@ jobs:
            grpc-base-image: "ubuntu:22.04"
            runs-on: 'arc-runner-set'
            makeflags: "--jobs=3 --output-sync=target"
-  
+
  core-image-build:
    uses: ./.github/workflows/image_build.yml
    with:
@@ -257,16 +257,17 @@ jobs:
      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
    strategy:
+      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
      matrix:
        include:
          - build-type: ''
-            platforms: 'linux/amd64'
+            platforms: 'linux/amd64,linux/arm64'
            tag-latest: 'auto'
            tag-suffix: '-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            base-image: "ubuntu:22.04"
-            runs-on: 'ubuntu-latest'
+            runs-on: 'arc-runner-set'
            aio: "-aio-cpu"
            latest-image: 'latest-cpu'
            latest-image-aio: 'latest-aio-cpu'
@@ -280,18 +281,18 @@ jobs:
            ffmpeg: ''
            image-type: 'core'
            base-image: "ubuntu:22.04"
-            runs-on: 'ubuntu-latest'
+            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "1"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-core'
            ffmpeg: ''
            image-type: 'core'
            base-image: "ubuntu:22.04"
-            runs-on: 'ubuntu-latest'
+            runs-on: 'arc-runner-set'
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "11"
@@ -301,17 +302,27 @@ jobs:
            tag-suffix: '-cublas-cuda11-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
-            runs-on: 'ubuntu-latest'
+            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "1"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
-            runs-on: 'ubuntu-latest'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
+          - build-type: 'vulkan'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-vulkan-ffmpeg-core'
+            latest-image: 'latest-vulkan-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -19,11 +19,11 @@ on:
        type: string
      cuda-major-version:
        description: 'CUDA major version'
-        default: "11"
+        default: "12"
        type: string
      cuda-minor-version:
        description: 'CUDA minor version'
-        default: "7"
+        default: "4"
        type: string
      platforms:
        description: 'Platforms'
@@ -136,6 +136,7 @@ jobs:

      - name: Docker meta
        id: meta
+        if: github.event_name != 'pull_request'
        uses: docker/metadata-action@v5
        with:
          images: |
@@ -148,7 +149,20 @@ jobs:
          flavor: |
            latest=${{ inputs.tag-latest }}
            suffix=${{ inputs.tag-suffix }}
-
+      - name: Docker meta for PR
+        id: meta_pull_request
+        if: github.event_name == 'pull_request'
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            ttl.sh/localai-ci-pr-${{ github.event.number }}
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.tag-suffix }}
      - name: Docker meta AIO (quay.io)
        if: inputs.aio != ''
        id: meta_aio
@@ -202,6 +216,7 @@ jobs:

      - name: Build and push
        uses: docker/build-push-action@v5
+        if: github.event_name != 'pull_request'
        with:
          builder: ${{ steps.buildx.outputs.name }}
          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
@@ -226,7 +241,39 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
-
+### Start testing image
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        if: github.event_name == 'pull_request'
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          # This is why some build args like GRPC_VERSION and MAKEFLAGS are hardcoded
+          build-args: |
+            BUILD_TYPE=${{ inputs.build-type }}
+            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
+            FFMPEG=${{ inputs.ffmpeg }}
+            IMAGE_TYPE=${{ inputs.image-type }}
+            BASE_IMAGE=${{ inputs.base-image }}
+            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.64.0
+            MAKEFLAGS=${{ inputs.makeflags }}
+          context: .
+          file: ./Dockerfile
+          cache-from: type=gha
+          platforms: ${{ inputs.platforms }}
+          push: true
+          tags: ${{ steps.meta_pull_request.outputs.tags }}
+          labels: ${{ steps.meta_pull_request.outputs.labels }}
+      - name: Testing image
+        if: github.event_name == 'pull_request'
+        run: |
+          echo "Image is available at ttl.sh/localai-ci-pr-${{ github.event.number }}:${{ steps.meta_pull_request.outputs.version }}" >> $GITHUB_STEP_SUMMARY
+## End testing image
      - name: Build and push AIO image
        if: inputs.aio != ''
        uses: docker/build-push-action@v5
@@ -277,7 +324,7 @@ jobs:
          docker pull quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }}
          docker tag quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
          docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
-  
+
      - name: job summary
        run: |
          echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -0,0 +1,170 @@
+name: Notifications for new models
+on:
+  pull_request:
+     types:
+       - closed
+
+jobs:
+  notify-discord:
+    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
+    env:
+        MODEL_NAME: hermes-2-theta-llama-3-8b
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0 # needed to checkout all branches for this Action to work
+    - name: Start LocalAI
+      run: |
+        echo "Starting LocalAI..."
+        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
+        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
+      # Check the PR diff using the current branch and the base branch of the PR
+    - uses: GrantBirki/git-diff-action@v2.7.0
+      id: git-diff-action
+      with:
+            json_diff_file_output: diff.json
+            raw_diff_file_output: diff.txt
+            file_output_only: "true"
+    - name: Summarize
+      env:
+        DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
+      id: summarize
+      run: |
+            input="$(cat $DIFF)"
+
+            # Define the LocalAI API endpoint
+            API_URL="http://localhost:8080/chat/completions"
+
+            # Create a JSON payload using jq to handle special characters
+            json_payload=$(jq -n --arg input "$input" '{
+            model: "'$MODEL_NAME'",
+            messages: [
+                {
+                role: "system",
+                content: "You are LocalAI-bot. Write a discord message to notify everyone about the new model from the git diff. Make it informal. An example can include: the URL of the model, the name, and a brief description of the model if exists. Also add an hint on how to install it in LocalAI and that can be browsed over https://models.localai.io. For example: local-ai run model_name_here"
+                },
+                {
+                role: "user",
+                content: $input
+                }
+            ]
+            }')
+
+            # Send the request to LocalAI
+            response=$(curl -s -X POST $API_URL \
+            -H "Content-Type: application/json" \
+            -d "$json_payload")
+
+            # Extract the summary from the response
+            summary="$(echo $response | jq -r '.choices[0].message.content')"
+
+            # Print the summary
+            #  -H "Authorization: Bearer $API_KEY" \
+            echo "Summary:"
+            echo "$summary"
+            echo "payload sent"
+            echo "$json_payload"
+            {
+                echo 'message<<EOF'
+                echo "$summary"
+                echo EOF
+              } >> "$GITHUB_OUTPUT"
+            docker logs --tail 10 local-ai
+    - name: Discord notification
+      env:
+        DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK_URL }}
+        DISCORD_USERNAME: "LocalAI-Bot"
+        DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
+      uses: Ilshidur/action-discord@master
+      with:
+        args: ${{ steps.summarize.outputs.message }}
+    - name: Setup tmate session if fails
+      if: ${{ failure() }}
+      uses: mxschmitt/action-tmate@v3.18
+      with:
+        detached: true
+        connect-timeout-seconds: 180
+        limit-access-to-actor: true
+  notify-twitter:
+    if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
+    env:
+        MODEL_NAME: hermes-2-theta-llama-3-8b
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 0 # needed to checkout all branches for this Action to work
+    - name: Start LocalAI
+      run: |
+        echo "Starting LocalAI..."
+        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
+        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
+      # Check the PR diff using the current branch and the base branch of the PR
+    - uses: GrantBirki/git-diff-action@v2.7.0
+      id: git-diff-action
+      with:
+            json_diff_file_output: diff.json
+            raw_diff_file_output: diff.txt
+            file_output_only: "true"
+    - name: Summarize
+      env:
+        DIFF: ${{ steps.git-diff-action.outputs.raw-diff-path }}
+      id: summarize
+      run: |
+            input="$(cat $DIFF)"
+
+            # Define the LocalAI API endpoint
+            API_URL="http://localhost:8080/chat/completions"
+
+            # Create a JSON payload using jq to handle special characters
+            json_payload=$(jq -n --arg input "$input" '{
+            model: "'$MODEL_NAME'",
+            messages: [
+                {
+                role: "system",
+                content: "You are LocalAI-bot. Write a twitter message to notify everyone about the new model from the git diff. Make it informal and really short. An example can include: the name, and a brief description of the model if exists. Also add an hint on how to install it in LocalAI. For example: local-ai run model_name_here"
+                },
+                {
+                role: "user",
+                content: $input
+                }
+            ]
+            }')
+
+            # Send the request to LocalAI
+            response=$(curl -s -X POST $API_URL \
+            -H "Content-Type: application/json" \
+            -d "$json_payload")
+
+            # Extract the summary from the response
+            summary="$(echo $response | jq -r '.choices[0].message.content')"
+
+            # Print the summary
+            #  -H "Authorization: Bearer $API_KEY" \
+            echo "Summary:"
+            echo "$summary"
+            echo "payload sent"
+            echo "$json_payload"
+            {
+                echo 'message<<EOF'
+                echo "$summary"
+                echo EOF
+              } >> "$GITHUB_OUTPUT"
+            docker logs --tail 10 local-ai
+    - uses: Eomm/why-don-t-you-tweet@v2
+      with:
+        tweet-message: ${{ steps.summarize.outputs.message }}
+      env:
+        # Get your tokens from https://developer.twitter.com/apps
+        TWITTER_CONSUMER_API_KEY: ${{ secrets.TWITTER_APP_KEY }}
+        TWITTER_CONSUMER_API_SECRET: ${{ secrets.TWITTER_APP_SECRET }}
+        TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }}
+        TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }}
+    - name: Setup tmate session if fails
+      if: ${{ failure() }}
+      uses: mxschmitt/action-tmate@v3.18
+      with:
+        detached: true
+        connect-timeout-seconds: 180
+        limit-access-to-actor: true
--- a/.github/workflows/notify-releases.yaml
+++ b/.github/workflows/notify-releases.yaml
@@ -0,0 +1,65 @@
+name: Release notifications
+on:
+  release:
+    types:
+      - published
+
+jobs:
+  notify-discord:
+    runs-on: ubuntu-latest
+    env:
+        RELEASE_BODY: ${{ github.event.release.body }}
+        RELEASE_TITLE: ${{ github.event.release.name }}
+        RELEASE_TAG_NAME: ${{ github.event.release.tag_name }}
+    steps:
+    - name: Start LocalAI
+      run: |
+        echo "Starting LocalAI..."
+        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
+        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
+    - name: Summarize
+      id: summarize
+      run: |
+            input="$RELEASE_TITLE\b$RELEASE_BODY"
+
+            # Define the LocalAI API endpoint
+            API_URL="http://localhost:8080/chat/completions"
+
+            # Create a JSON payload using jq to handle special characters
+            json_payload=$(jq -n --arg input "$input" '{
+            model: "'$MODEL_NAME'",
+            messages: [
+                {
+                role: "system",
+                content: "Write a discord message with a bullet point summary of the release notes."
+                },
+                {
+                role: "user",
+                content: $input
+                }
+            ]
+            }')
+
+            # Send the request to LocalAI API
+            response=$(curl -s -X POST $API_URL \
+            -H "Content-Type: application/json" \
+            -d "$json_payload")
+
+            # Extract the summary from the response
+            summary=$(echo $response | jq -r '.choices[0].message.content')
+
+            # Print the summary
+            #  -H "Authorization: Bearer $API_KEY" \
+            {
+                echo 'message<<EOF'
+                echo "$summary"
+                echo EOF
+              } >> "$GITHUB_OUTPUT"
+    - name: Discord notification
+      env:
+        DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK_URL_RELEASE }}
+        DISCORD_USERNAME: "LocalAI-Bot"
+        DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
+      uses: Ilshidur/action-discord@master
+      with:
+        args: ${{ steps.summarize.outputs.message }}
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -15,7 +15,8 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  build-linux:
+
+  build-linux-arm:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
@@ -26,10 +27,146 @@ jobs:
        with:
          go-version: '1.21.x'
          cache: false
+
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache gawk
+          sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
+      - name: Install CUDA Dependencies
+        run: |
+          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-cross-aarch64 cuda-nvcc-cross-aarch64-${CUDA_VERSION} libcublas-cross-aarch64-${CUDA_VERSION}
+        env:
+          CUDA_VERSION: 12-4
+      - name: Cache grpc
+        id: cache-grpc
+        uses: actions/cache@v4
+        with:
+          path: grpc
+          key: ${{ runner.os }}-arm-grpc-${{ env.GRPC_VERSION }}
+      - name: Build grpc
+        if: steps.cache-grpc.outputs.cache-hit != 'true'
+        run: |
+
+          git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc
+          GRPC_DIR=$PWD/grpc/
+          echo "include(FetchContent)" >> $GRPC_DIR/CMakeLists.txt
+          echo "FetchContent_Declare(" >> $GRPC_DIR/CMakeLists.txt
+          echo "  googletest" >> $GRPC_DIR/CMakeLists.txt
+          echo "  URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip" >> $GRPC_DIR/CMakeLists.txt
+          echo ")" >> $GRPC_DIR/CMakeLists.txt
+          echo "FetchContent_MakeAvailable(googletest)" >> $GRPC_DIR/CMakeLists.txt
+          GRPC_DIR=$PWD/grpc/third_party/abseil-cpp
+          echo "include(FetchContent)" >> $GRPC_DIR/CMakeLists.txt
+          echo "FetchContent_Declare(" >> $GRPC_DIR/CMakeLists.txt
+          echo "  googletest" >> $GRPC_DIR/CMakeLists.txt
+          echo "  URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip" >> $GRPC_DIR/CMakeLists.txt
+          echo ")" >> $GRPC_DIR/CMakeLists.txt
+          echo "FetchContent_MakeAvailable(googletest)" >> $GRPC_DIR/CMakeLists.txt
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make --jobs 5 --output-sync=target
+      - name: Install gRPC
+        run: |
+          GNU_HOST=aarch64-linux-gnu
+          C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
+          CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
+
+          CROSS_TOOLCHAIN=/usr/$GNU_HOST
+          CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
+          CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
+
+          # https://cmake.org/cmake/help/v3.13/manual/cmake-toolchains.7.html#cross-compiling-for-linux
+          echo "set(CMAKE_SYSTEM_NAME Linux)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_SYSTEM_PROCESSOR arm)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_STAGING_PREFIX $CROSS_STAGING_PREFIX)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_SYSROOT ${CROSS_TOOLCHAIN}/sysroot)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_C_COMPILER /usr/bin/$C_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_CXX_COMPILER /usr/bin/$CXX_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN
+          GRPC_DIR=$PWD/grpc
+          cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install && \
+          GRPC_CROSS_BUILD_DIR=$GRPC_DIR/cmake/cross_build && \
+          mkdir -p $GRPC_CROSS_BUILD_DIR && \
+          cd $GRPC_CROSS_BUILD_DIR && \
+          cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_INSTALL_PREFIX=$CROSS_TOOLCHAIN/grpc_install \
+            ../.. && \
+          sudo make -j`nproc` install
+      - name: Build
+        id: build
+        run: |
+          GNU_HOST=aarch64-linux-gnu
+          C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
+          CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
+
+          CROSS_TOOLCHAIN=/usr/$GNU_HOST
+          CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
+          CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+          export PATH=$PATH:$GOPATH/bin
+          export PATH=/usr/local/cuda/bin:$PATH
+          sudo rm -rf /usr/aarch64-linux-gnu/lib/libstdc++.so.6
+          sudo cp -rf /usr/aarch64-linux-gnu/lib/libstdc++.so* /usr/aarch64-linux-gnu/lib/libstdc++.so.6
+          sudo cp /usr/aarch64-linux-gnu/lib/ld-linux-aarch64.so.1 ld.so
+          BACKEND_LIBS="./grpc/cmake/cross_build/third_party/re2/libre2.a ./grpc/cmake/cross_build/libgrpc.a ./grpc/cmake/cross_build/libgrpc++.a ./grpc/cmake/cross_build/third_party/protobuf/libprotobuf.a /usr/aarch64-linux-gnu/lib/libc.so.6 /usr/aarch64-linux-gnu/lib/libstdc++.so.6 /usr/aarch64-linux-gnu/lib/libgomp.so.1 /usr/aarch64-linux-gnu/lib/libm.so.6 /usr/aarch64-linux-gnu/lib/libgcc_s.so.1 /usr/aarch64-linux-gnu/lib/libdl.so.2 /usr/aarch64-linux-gnu/lib/libpthread.so.0 ./ld.so" \
+          GOOS=linux \
+          GOARCH=arm64 \
+          CMAKE_ARGS="-DProtobuf_INCLUDE_DIRS=$CROSS_STAGING_PREFIX/include -DProtobuf_DIR=$CROSS_STAGING_PREFIX/lib/cmake/protobuf -DgRPC_DIR=$CROSS_STAGING_PREFIX/lib/cmake/grpc -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++" make dist-cross-linux-arm64
+      - uses: actions/upload-artifact@v4
+        with:
+          name: LocalAI-linux-arm64
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
+  build-linux:
+    runs-on: arc-runner-set
+    steps:
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
+          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache gawk cmake libgmock-dev
+      - name: Intel Dependencies
+        run: |
+          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt update
+          sudo apt install -y intel-basekit
      - name: Install CUDA Dependencies
        run: |
          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -38,6 +175,31 @@ jobs:
          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
        env:
          CUDA_VERSION: 12-3
+      - name: "Install Hipblas"
+        env:
+          ROCM_VERSION: "6.1"
+          AMDGPU_VERSION: "6.1"
+        run: |
+            set -ex
+
+            sudo apt-get update
+            sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg
+
+            curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add -
+
+            printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | sudo tee /etc/apt/sources.list.d/rocm.list
+
+            printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
+            printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
+            sudo apt-get update
+
+            sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \
+                hipblas-dev rocm-dev \
+                rocblas-dev
+
+            sudo apt-get clean
+            sudo rm -rf /var/lib/apt/lists/*
+            sudo ldconfig
      - name: Cache grpc
        id: cache-grpc
        uses: actions/cache@v4
@@ -54,14 +216,20 @@ jobs:
      - name: Install gRPC
        run: |
          cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
+      # BACKEND_LIBS needed for gpu-workload: /opt/intel/oneapi/*/lib/libiomp5.so /opt/intel/oneapi/*/lib/libmkl_core.so /opt/intel/oneapi/*/lib/libmkl_core.so.2 /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so /opt/intel/oneapi/*/lib/libmkl_intel_ilp64.so.2 /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so /opt/intel/oneapi/*/lib/libmkl_sycl_blas.so.4 /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so /opt/intel/oneapi/*/lib/libmkl_tbb_thread.so.2 /opt/intel/oneapi/*/lib/libsycl.so /opt/intel/oneapi/*/lib/libsycl.so.7 /opt/intel/oneapi/*/lib/libsycl.so.7.1.0 /opt/rocm-*/lib/libamdhip64.so /opt/rocm-*/lib/libamdhip64.so.5 /opt/rocm-*/lib/libamdhip64.so.6 /opt/rocm-*/lib/libamdhip64.so.6.1.60100 /opt/rocm-*/lib/libhipblas.so /opt/rocm-*/lib/libhipblas.so.2 /opt/rocm-*/lib/libhipblas.so.2.1.60100 /opt/rocm-*/lib/librocblas.so /opt/rocm-*/lib/librocblas.so.4 /opt/rocm-*/lib/librocblas.so.4.1.60100 /usr/lib/x86_64-linux-gnu/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/x86_64-linux-gnu/libm.so.6 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1 /usr/lib/x86_64-linux-gnu/libc.so.6 /usr/lib/x86_64-linux-gnu/librt.so.1 /usr/local/cuda-*/targets/x86_64-linux/lib/libcublas.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcublasLt.so /usr/local/cuda-*/targets/x86_64-linux/lib/libcudart.so /usr/local/cuda-*/targets/x86_64-linux/lib/stubs/libcuda.so
      - name: Build
        id: build
        run: |
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
          export PATH=$PATH:$GOPATH/bin
          export PATH=/usr/local/cuda/bin:$PATH
-          GO_TAGS=p2p make dist
+          export PATH=/opt/rocm/bin:$PATH
+          source /opt/intel/oneapi/setvars.sh
+          GRPC_DIR=$PWD/backend/cpp/llama
+          sudo cp /lib64/ld-linux-x86-64.so.2 ld.so
+          BACKEND_LIBS="./ld.so ./sources/go-piper/piper/build/fi/lib/libfmt.a ./sources/go-piper/piper-phonemize/pi/lib/libonnxruntime.so.1.14.1 ./sources/go-piper/piper-phonemize/pi/src/libespeak-ng/libespeak-ng.so /usr/lib/x86_64-linux-gnu/libdl.so.2 /usr/lib/x86_64-linux-gnu/librt.so.1 /usr/lib/x86_64-linux-gnu/libpthread.so.0 ./sources/go-piper/piper-phonemize/pi/lib/libpiper_phonemize.so.1 ./sources/go-piper/piper/build/si/lib/libspdlog.a ./sources/go-piper/espeak/ei/lib/libucd.so" \
+          make -j4 dist
      - uses: actions/upload-artifact@v4
        with:
          name: LocalAI-linux
@@ -72,7 +240,13 @@ jobs:
        with:
          files: |
            release/*
-
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
  build-stablediffusion:
    runs-on: ubuntu-latest
    steps:
@@ -86,9 +260,10 @@ jobs:
          cache: false
      - name: Dependencies
        run: |
+          sudo apt-get update
          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
      - name: Build stablediffusion
        run: |
          export PATH=$PATH:$GOPATH/bin
@@ -100,6 +275,54 @@ jobs:
        with:
          name: stablediffusion
          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+
+  build-macOS-x86_64:
+    runs-on: macos-13
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
+      - name: Dependencies
+        run: |
+          brew install protobuf grpc
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+      - name: Build
+        id: build
+        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
+          export PATH=$PATH:$GOPATH/bin
+
+          make dist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: LocalAI-MacOS-x86_64
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true

  build-macOS-arm64:
    runs-on: macos-14
@@ -115,15 +338,16 @@ jobs:
      - name: Dependencies
        run: |
          brew install protobuf grpc
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
-          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
      - name: Build
        id: build
        run: |
          export C_INCLUDE_PATH=/usr/local/include
          export CPLUS_INCLUDE_PATH=/usr/local/include
          export PATH=$PATH:$GOPATH/bin
-          GO_TAGS=p2p make dist
+
+          make dist
      - uses: actions/upload-artifact@v4
        with:
          name: LocalAI-MacOS-arm64
@@ -134,3 +358,10 @@ jobs:
        with:
          files: |
            release/*
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -71,6 +71,7 @@ jobs:
        run: |
          sudo apt-get update
          sudo apt-get install build-essential curl ffmpeg
+          sudo apt-get install -y libgmock-dev
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
@@ -93,8 +94,8 @@ jobs:
          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
          export CUDACXX=/usr/local/cuda/bin/nvcc

-          go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
-          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b

          # The python3-grpc-tools package in 22.04 is too old
          pip install --user grpcio-tools
@@ -178,7 +179,7 @@ jobs:
          submodules: true
      - name: Build images
        run: |
-          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=core --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
+          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
      - name: Test
        run: |
@@ -220,7 +221,7 @@ jobs:
          export CPLUS_INCLUDE_PATH=/usr/local/include
          # Used to run the newer GNUMake version from brew that supports --output-sync
          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
-          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
+          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.18
--- a/.gitignore
+++ b/.gitignore
@@ -42,6 +42,7 @@ backend-assets/*
 !backend-assets/.keep
 prepare
 /ggml-metal.metal
+docs/static/gallery.html

 # Protobuf generated files
 *.pb.go
--- a/94
+++ b/94
@@ -8,7 +8,7 @@ FROM ${BASE_IMAGE} AS requirements-core

 USER root

-ARG GO_VERSION=1.21.7
+ARG GO_VERSION=1.22.4
 ARG TARGETARCH
 ARG TARGETVARIANT

@@ -24,23 +24,17 @@ RUN apt-get update && \
        cmake \
        curl \
        git \
-        python3-pip \
-        python-is-python3 \
        unzip && \
    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* && \
-    pip install --upgrade pip
+    rm -rf /var/lib/apt/lists/*

 # Install Go
 RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
 ENV PATH $PATH:/root/go/bin:/usr/local/go/bin

 # Install grpc compilers
-RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@latest && \
-    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
-
-# Install grpcio-tools (the version in 22.04 is too old)
-RUN pip install --user grpcio-tools
+RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2 && \
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af

 COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
 RUN update-ca-certificates
@@ -85,10 +79,16 @@ RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        espeak-ng \
        espeak \
+        python3-pip \
+        python-is-python3 \
        python3-dev \
        python3-venv && \
    apt-get clean && \
-    rm -rf /var/lib/apt/lists/*
+    rm -rf /var/lib/apt/lists/* && \
+    pip install --upgrade pip
+
+# Install grpcio-tools (the version in 22.04 is too old)
+RUN pip install --user grpcio-tools

 ###################################
 ###################################
@@ -98,22 +98,65 @@ RUN apt-get update && \
 FROM requirements-${IMAGE_TYPE} AS requirements-drivers

 ARG BUILD_TYPE
-ARG CUDA_MAJOR_VERSION=11
-ARG CUDA_MINOR_VERSION=7
+ARG CUDA_MAJOR_VERSION=12
+ARG CUDA_MINOR_VERSION=4

 ENV BUILD_TYPE=${BUILD_TYPE}

+# Vulkan requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "vulkan" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+                        software-properties-common pciutils wget gpg-agent && \
+        wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+        wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+        apt-get update && \
+            apt-get install -y \
+            vulkan-sdk && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
 # CuBLAS requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+                        software-properties-common pciutils
+        if [ "amd64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+            fi
+        if [ "arm64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
+        fi
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+            rm -f cuda-keyring_1.1-1_all.deb && \
+            apt-get update && \
+            apt-get install -y --no-install-recommends \
+                cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+            apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
 RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
        apt-get update && \
        apt-get install -y  --no-install-recommends \
-            software-properties-common && \
+            software-properties-common pciutils && \
        curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
        dpkg -i cuda-keyring_1.1-1_all.deb && \
        rm -f cuda-keyring_1.1-1_all.deb && \
        apt-get update && \
        apt-get install -y --no-install-recommends \
            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
@@ -163,7 +206,7 @@ FROM ${GRPC_BASE_IMAGE} AS grpc

 # This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
 ARG GRPC_MAKEFLAGS="-j4 -Otarget"
-ARG GRPC_VERSION=v1.58.0
+ARG GRPC_VERSION=v1.64.2

 ENV MAKEFLAGS=${GRPC_MAKEFLAGS}

@@ -218,9 +261,18 @@ RUN make prepare
 # We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
 # but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
 # here so that we can generate the grpc code for the stablediffusion build
-RUN curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
-    unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
-    rm protoc.zip
+RUN <<EOT bash
+    if [ "amd64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+    if [ "arm64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+EOT

 # stablediffusion does not tolerate a newer version of abseil, build it first
 RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
@@ -230,6 +282,8 @@ COPY --from=grpc /opt/grpc /usr/local

 # Rebuild with defaults backends
 WORKDIR /build
+
+## Build the binary
 RUN make build

 RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
@@ -256,7 +310,7 @@ ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
 ENV MAKEFLAGS=${MAKEFLAGS}

-ARG CUDA_MAJOR_VERSION=11
+ARG CUDA_MAJOR_VERSION=12
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
@@ -355,7 +409,7 @@ RUN mkdir -p /build/models
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
  CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1
-  
+
 VOLUME /build/models
 EXPOSE 8080
 ENTRYPOINT [ "/build/entrypoint.sh" ]
--- a/170
+++ b/170
@@ -3,9 +3,11 @@ GOTEST=$(GOCMD) test
 GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai

+DETECT_LIBS?=true
+
 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=74f33adf5f8b20b08fc5a6aa17ce081abe86ef2f
+CPPLLAMA_VERSION?=fd560fe680c72fd0a0af2bc8881add20ad919071

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -16,7 +18,7 @@ RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
 RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

 # whisper.cpp version
-WHISPER_CPP_VERSION?=22d46b7ba4620e2db1281e210d0186863cffcec0
+WHISPER_CPP_VERSION?=1c31f9d4a8936aec550e6c4dc9ca5cae3b4f304a

 # bert.cpp version
 BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
@@ -33,9 +35,11 @@ TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
 export BUILD_TYPE?=
 export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
 export CMAKE_ARGS?=
+export BACKEND_LIBS?=

 CGO_LDFLAGS?=
 CGO_LDFLAGS_WHISPER?=
+CGO_LDFLAGS_WHISPER+=-lggml
 CUDA_LIBPATH?=/usr/local/cuda/lib64/
 GO_TAGS?=
 BUILD_ID?=
@@ -49,12 +53,12 @@ RANDOM := $(shell bash -c 'echo $$RANDOM')
 VERSION?=$(shell git describe --always --tags || echo "dev" )
 # go tool nm ./local-ai | grep Commit
 LD_FLAGS?=
-override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Version=$(VERSION)"
-override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"
+override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Version=$(VERSION)"
+override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"

 OPTIONAL_TARGETS?=

-OS := $(shell uname -s)
+export OS := $(shell uname -s)
 ARCH := $(shell uname -m)
 GREEN  := $(shell tput -Txterm setaf 2)
 YELLOW := $(shell tput -Txterm setaf 3)
@@ -80,27 +84,40 @@ ifeq ($(OS),Darwin)
 		BUILD_TYPE=metal
 	# disable metal if on Darwin and any other value is explicitly passed.
 	else ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DLLAMA_METAL=OFF
-		export LLAMA_NO_ACCELERATE=1
+		CMAKE_ARGS+=-DGGML_METAL=OFF
+		export GGML_NO_ACCELERATE=1
+		export GGML_NO_METAL=1
 	endif

 	ifeq ($(BUILD_TYPE),metal)
 #			-lcblas 	removed: it seems to always be listed as a duplicate flag.
 		CGO_LDFLAGS += -framework Accelerate
 	endif
+else
+CGO_LDFLAGS_WHISPER+=-lgomp
 endif

 ifeq ($(BUILD_TYPE),openblas)
 	CGO_LDFLAGS+=-lopenblas
-	export WHISPER_OPENBLAS=1
+	export GGML_OPENBLAS=1
 endif

-
 ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
-	export LLAMA_CUBLAS=1
-	export WHISPER_CUDA=1
-	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda
+	export GGML_CUDA=1
+	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
+endif
+
+ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DGGML_VULKAN=1
+endif
+
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+	export GGML_SYCL=1
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f16)
+	export GGML_SYCL_F16=1
 endif

 ifeq ($(BUILD_TYPE),hipblas)
@@ -111,27 +128,26 @@ ifeq ($(BUILD_TYPE),hipblas)
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
-	export WHISPER_HIPBLAS=1
-	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
+	export GGML_HIPBLAS=1
+	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif

 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-	export LLAMA_METAL=1
-	export WHISPER_METAL=1
+	export GGML_METAL=1
 endif

 ifeq ($(BUILD_TYPE),clblas)
 	CGO_LDFLAGS+=-lOpenCL -lclblast
-	export WHISPER_CLBLAST=1
+	export GGML_OPENBLAS=1
 endif

 # glibc-static or glibc-devel-static required
 ifeq ($(STATIC),true)
-	LD_FLAGS=-linkmode external -extldflags -static
+	LD_FLAGS+=-linkmode external -extldflags -static
 endif

 ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
@@ -165,6 +181,8 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
 ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
 ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
+# Use filter-out to remove the specified backends
+ALL_GRPC_BACKENDS := $(filter-out $(SKIP_GRPC_BACKEND),$(ALL_GRPC_BACKENDS))

 GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
 TEST_PATHS?=./api/... ./pkg/... ./core/...
@@ -244,7 +262,7 @@ sources/whisper.cpp:
 	cd sources/whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1

 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
-	cd sources/whisper.cpp && $(MAKE) libwhisper.a
+	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a

 get-sources: sources/go-llama.cpp sources/gpt4all sources/go-piper sources/go-rwkv.cpp sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream

@@ -313,6 +331,10 @@ build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
 	$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
+ifneq ($(BACKEND_LIBS),)
+	$(MAKE) backend-assets/lib
+	cp -f $(BACKEND_LIBS) backend-assets/lib/
+endif
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

 build-minimal:
@@ -321,14 +343,27 @@ build-minimal:
 build-api:
 	BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build

+backend-assets/lib:
+	mkdir -p backend-assets/lib
+
 dist:
-	STATIC=true $(MAKE) backend-assets/grpc/llama-cpp-avx2
+	$(MAKE) backend-assets/grpc/llama-cpp-avx2
+ifeq ($(DETECT_LIBS),true)
+	scripts/prepare-libs.sh backend-assets/grpc/llama-cpp-avx2
+endif
 ifeq ($(OS),Darwin)
-	$(info ${GREEN}I Skip CUDA build on MacOS${RESET})
+	$(info ${GREEN}I Skip CUDA/hipblas build on MacOS${RESET})
 else
 	$(MAKE) backend-assets/grpc/llama-cpp-cuda
+	$(MAKE) backend-assets/grpc/llama-cpp-hipblas
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f16
+	$(MAKE) backend-assets/grpc/llama-cpp-sycl_f32
 endif
-	$(MAKE) build
+	GO_TAGS="tts p2p" $(MAKE) build
+ifeq ($(DETECT_LIBS),true)
+	scripts/prepare-libs.sh backend-assets/grpc/piper
+endif
+	GO_TAGS="tts p2p" STATIC=true $(MAKE) build
 	mkdir -p release
 # if BUILD_ID is empty, then we don't append it to the binary name
 ifeq ($(BUILD_ID),)
@@ -339,6 +374,19 @@ else
 	shasum -a 256 release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH) > release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH).sha256
 endif

+dist-cross-linux-arm64:
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
+	STATIC=true $(MAKE) build
+	mkdir -p release
+# if BUILD_ID is empty, then we don't append it to the binary name
+ifeq ($(BUILD_ID),)
+	cp $(BINARY_NAME) release/$(BINARY_NAME)-$(OS)-arm64
+	shasum -a 256 release/$(BINARY_NAME)-$(OS)-arm64 > release/$(BINARY_NAME)-$(OS)-arm64.sha256
+else
+	cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64
+	shasum -a 256 release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64 > release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-arm64.sha256
+endif
+
 osx-signed: build
 	codesign --deep --force --sign "$(OSX_SIGNING_IDENTITY)" --entitlements "./Entitlements.plist" "./$(BINARY_NAME)"

@@ -377,7 +425,7 @@ prepare-e2e:
 	mkdir -p $(TEST_DIR)
 	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
 	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
-	docker build --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
+	docker build --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=4 --build-arg FFMPEG=true -t localai-tests .

 run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
@@ -447,7 +495,7 @@ protogen-clean: protogen-go-clean protogen-python-clean
 .PHONY: protogen-go
 protogen-go:
 	mkdir -p pkg/grpc/proto
-	protoc -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
+	protoc --experimental_allow_proto3_optional -Ibackend/ --go_out=pkg/grpc/proto/ --go_opt=paths=source_relative --go-grpc_out=pkg/grpc/proto/ --go-grpc_opt=paths=source_relative \
    backend/backend.proto

 .PHONY: protogen-go-clean
@@ -672,25 +720,33 @@ else
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
 endif

+# This target is for manually building a variant with-auto detected flags
+backend-assets/grpc/llama-cpp: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-cpp
+	$(MAKE) -C backend/cpp/llama-cpp purge
+	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
+	$(MAKE) VARIANT="llama-cpp" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-cpp/grpc-server backend-assets/grpc/llama-cpp
+
 backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-avx2
 	$(MAKE) -C backend/cpp/llama-avx2 purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2

 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
 	$(info ${GREEN}I llama-cpp build info:avx${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx

 backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-fallback
 	$(MAKE) -C backend/cpp/llama-fallback purge
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
@@ -701,14 +757,35 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-cuda
 	$(MAKE) -C backend/cpp/llama-cuda purge
 	$(info ${GREEN}I llama-cpp build info:cuda${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda

+backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-hipblas
+	$(MAKE) -C backend/cpp/llama-hipblas purge
+	$(info ${GREEN}I llama-cpp build info:hipblas${RESET})
+	BUILD_TYPE="hipblas" $(MAKE) VARIANT="llama-hipblas" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-hipblas/grpc-server backend-assets/grpc/llama-cpp-hipblas
+
+backend-assets/grpc/llama-cpp-sycl_f16: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f16
+	$(MAKE) -C backend/cpp/llama-sycl_f16 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f16${RESET})
+	BUILD_TYPE="sycl_f16" $(MAKE) VARIANT="llama-sycl_f16" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f16/grpc-server backend-assets/grpc/llama-cpp-sycl_f16
+
+backend-assets/grpc/llama-cpp-sycl_f32: backend-assets/grpc
+	cp -rf backend/cpp/llama backend/cpp/llama-sycl_f32
+	$(MAKE) -C backend/cpp/llama-sycl_f32 purge
+	$(info ${GREEN}I llama-cpp build info:sycl_f32${RESET})
+	BUILD_TYPE="sycl_f32" $(MAKE) VARIANT="llama-sycl_f32" build-llama-cpp-grpc-server
+	cp -rfv backend/cpp/llama-sycl_f32/grpc-server backend-assets/grpc/llama-cpp-sycl_f32
+
 backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc

 backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
@@ -736,7 +813,7 @@ backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libti
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream

 backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
-	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH=$(CURDIR)/sources/whisper.cpp LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
+	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/

 backend-assets/grpc/local-store: backend-assets/grpc
@@ -758,6 +835,17 @@ docker:
 		--build-arg BUILD_TYPE=$(BUILD_TYPE) \
 		-t $(DOCKER_IMAGE) .

+docker-cuda11:
+	docker build \
+		--build-arg CUDA_MAJOR_VERSION=11 \
+		--build-arg CUDA_MINOR_VERSION=8 \
+		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
+		--build-arg IMAGE_TYPE=$(IMAGE_TYPE) \
+		--build-arg GO_TAGS="$(GO_TAGS)" \
+		--build-arg MAKEFLAGS="$(DOCKER_MAKEFLAGS)" \
+		--build-arg BUILD_TYPE=$(BUILD_TYPE) \
+		-t $(DOCKER_IMAGE)-cuda11 .
+
 docker-aio:
 	@echo "Building AIO image with base $(BASE_IMAGE) as $(DOCKER_AIO_IMAGE)"
 	docker build \
@@ -791,4 +879,22 @@ swagger:

 .PHONY: gen-assets
 gen-assets:
-	$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
+	$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
+
+## Documentation
+docs/layouts/_default:
+	mkdir -p docs/layouts/_default
+
+docs/static/gallery.html: docs/layouts/_default
+	$(GOCMD) run ./.github/ci/modelslist.go ./gallery/index.yaml > docs/static/gallery.html
+
+docs/public: docs/layouts/_default docs/static/gallery.html
+	cd docs && hugo --minify
+
+docs-clean:
+	rm -rf docs/public
+	rm -rf docs/static/gallery.html
+
+.PHONY: docs
+docs: docs/static/gallery.html
+	cd docs && hugo serve
--- a/README.md
+++ b/README.md
@@ -48,6 +48,13 @@

 ![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)

+Run the installer script:
+
+```bash
+curl https://localai.io/install.sh | sh
+```
+
+Or run with docker:
 ```bash
 docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
 # Alternative images:
@@ -65,7 +72,9 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

 [Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

- 🔥🔥 Decentralized llama.cpp:  https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!)
+- 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723
+- 🆕 You can browse now the model gallery without LocalAI! Check out https://models.localai.io
+- 🔥🔥 Decentralized llama.cpp:  https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs  https://localai.io/features/distribute/
 - 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
 - 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
 - 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
@@ -89,12 +98,14 @@ If you want to help and contribute, issues up for grabs: https://github.com/mudl
 - 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
 - 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
 - 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
- 🔥 [OpenAI functions](https://localai.io/features/openai-functions/) 🆕
+- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/) 
 - 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
 - ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
 - 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
 - 🥽 [Vision API](https://localai.io/features/gpt-vision/)
- 🆕 [Reranker API](https://localai.io/features/reranker/)
+- 📈 [Reranker API](https://localai.io/features/reranker/)
+- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)
+- 🌍 Integrated WebUI!

 ## 💻 Usage

@@ -108,6 +119,7 @@ Build and deploy custom containers:
 WebUIs:
 - https://github.com/Jirubizu/localai-admin
 - https://github.com/go-skynet/LocalAI-frontend
+- QA-Pilot(An interactive chat project that leverages LocalAI LLMs for rapid understanding and navigation of GitHub code repository) https://github.com/reid41/QA-Pilot

 Model galleries
 - https://github.com/go-skynet/model-gallery
@@ -120,13 +132,14 @@ Other:
 - Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
 - Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
 - Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
+- Shell-Pilot(Interact with LLM using LocalAI models via pure shell scripts on your Linux or MacOS system) https://github.com/reid41/shell-pilot
 - Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
 - Examples: https://github.com/mudler/LocalAI/tree/master/examples/
  

 ### 🔗 Resources

- 🆕 New! [LLM finetuning guide](https://localai.io/docs/advanced/fine-tuning/)
+- [LLM finetuning guide](https://localai.io/docs/advanced/fine-tuning/)
 - [How to build locally](https://localai.io/basics/build/index.html)
 - [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
 - [Projects integrating LocalAI](https://localai.io/docs/integrations/)
@@ -134,6 +147,7 @@ Other:

 ## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

+- 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/)
 - [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
 - [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
 - [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
@@ -161,17 +175,16 @@ If you utilize this repository, data in a downstream project, please consider ci

 Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.

-A huge thank you to our generous sponsors who support this project:
+A huge thank you to our generous sponsors who support this project covering CI expenses, and our [Sponsor list](https://github.com/sponsors/mudler):

-| ![Spectro Cloud logo_600x600px_transparent bg](https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512) |
-|:-----------------------------------------------:|
-|  [Spectro Cloud](https://www.spectrocloud.com/)  |
-|  Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on lamdalabs!  |
-
-And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project.
-
- [Sponsor list](https://github.com/sponsors/mudler)
- JDAM00 (donating HW for the CI)
+<p align="center">
+  <a href="https://www.spectrocloud.com/" target="blank">
+    <img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
+  </a>
+  <a href="https://www.premai.io/" target="blank">
+    <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
+  </a>
+</p>

 ## 🌟 Star history

@@ -181,7 +194,7 @@ And a huge shout-out to individuals sponsoring the project by donating hardware

 LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).

-MIT - Author Ettore Di Giacinto
+MIT - Author Ettore Di Giacinto <mudler@localai.io>

 ## 🙇 Acknowledgements

--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -230,6 +230,7 @@ message TranscriptRequest {
  string dst = 2;
  string language = 3;
  uint32 threads = 4;
+  bool translate = 5;
 }

 message TranscriptResult {
@@ -266,6 +267,7 @@ message TTSRequest {
  string model = 2;
  string dst = 3;
  string voice = 4;
+  optional string language = 5;
 }

 message TokenizationResponse {
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -22,6 +22,15 @@ set(TARGET grpc-server)
 set(_PROTOBUF_LIBPROTOBUF libprotobuf)
 set(_REFLECTION grpc++_reflection)

+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
+)
+# For Windows: Prevent overriding the parent project's compiler/linker settings
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+FetchContent_MakeAvailable(googletest)
+
 if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
    # Set correct Homebrew install folder for Apple Silicon and Intel Macs
    if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -4,34 +4,44 @@ LLAMA_VERSION?=
 CMAKE_ARGS?=
 BUILD_TYPE?=
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
+TARGET?=--target grpc-server

-# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
+# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
+CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
+
+# If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
-# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+	CMAKE_ARGS+=-DGGML_CUDA=ON
+# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-# If build type is clblas (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
-	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
-# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
-else ifeq ($(OS),darwin)
+else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DLLAMA_METAL=OFF
+		CMAKE_ARGS+=-DGGML_METAL=OFF
+	else
+		CMAKE_ARGS+=-DGGML_METAL=ON
+# Until this is tested properly, we disable embedded metal file
+# as we already embed it as part of the LocalAI assets
+		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
+		TARGET+=--target ggml-metal
 	endif
 endif

 ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 endif

 ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 endif

 llama.cpp:
@@ -61,9 +71,9 @@ clean: purge
 grpc-server: llama.cpp llama.cpp/examples/grpc-server
 	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
-	bash -c "source $(ONEAPI_VARS); \
-	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)"
+	+bash -c "source $(ONEAPI_VARS); \
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
 else
-	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)
+	+cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
 endif
 	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -886,6 +886,8 @@ struct llama_server_context
            {"task_id", slot->task_id},
        });

+        LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
+
        return true;
    }

--- a/backend/go/image/stablediffusion/main.go
+++ b/backend/go/image/stablediffusion/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/image/stablediffusion/stablediffusion.go
+++ b/backend/go/image/stablediffusion/stablediffusion.go
@@ -3,9 +3,9 @@ package main
 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/stablediffusion"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/stablediffusion"
 )

 type Image struct {
--- a/backend/go/image/tinydream/main.go
+++ b/backend/go/image/tinydream/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/image/tinydream/tinydream.go
+++ b/backend/go/image/tinydream/tinydream.go
@@ -3,9 +3,9 @@ package main
 // This is a wrapper to statisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/tinydream"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/tinydream"
 )

 type Image struct {
--- a/backend/go/llm/bert/bert.go
+++ b/backend/go/llm/bert/bert.go
@@ -5,8 +5,8 @@ package main
 import (
 	bert "github.com/go-skynet/go-bert.cpp"

-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )

 type Embeddings struct {
--- a/backend/go/llm/bert/main.go
+++ b/backend/go/llm/bert/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/llm/gpt4all/gpt4all.go
+++ b/backend/go/llm/gpt4all/gpt4all.go
@@ -5,8 +5,8 @@ package main
 import (
 	"fmt"

-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
 )

--- a/backend/go/llm/gpt4all/main.go
+++ b/backend/go/llm/gpt4all/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/llm/langchain/langchain.go
+++ b/backend/go/llm/langchain/langchain.go
@@ -6,9 +6,9 @@ import (
 	"fmt"
 	"os"

-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/langchain"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/langchain"
 )

 type LLM struct {
--- a/backend/go/llm/langchain/main.go
+++ b/backend/go/llm/langchain/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/llm/llama-ggml/llama.go
+++ b/backend/go/llm/llama-ggml/llama.go
@@ -5,9 +5,9 @@ package main
 import (
 	"fmt"

-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	"github.com/go-skynet/go-llama.cpp"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )

 type LLM struct {
--- a/backend/go/llm/llama-ggml/main.go
+++ b/backend/go/llm/llama-ggml/main.go
@@ -3,7 +3,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/llm/llama/llama.go
+++ b/backend/go/llm/llama/llama.go
@@ -6,9 +6,9 @@ import (
 	"fmt"
 	"path/filepath"

-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
 	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
 	"github.com/go-skynet/go-llama.cpp"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
 )

 type LLM struct {
--- a/backend/go/llm/llama/main.go
+++ b/backend/go/llm/llama/main.go
@@ -7,7 +7,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/llm/rwkv/main.go
+++ b/backend/go/llm/rwkv/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/llm/rwkv/rwkv.go
+++ b/backend/go/llm/rwkv/rwkv.go
@@ -7,8 +7,8 @@ import (
 	"path/filepath"

 	"github.com/donomii/go-rwkv.cpp"
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )

 const tokenizerSuffix = ".tokenizer.json"
@@ -31,7 +31,7 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	model := rwkv.LoadFiles(opts.ModelFile, tokenizerPath, uint32(opts.GetThreads()))

 	if model == nil {
-		return fmt.Errorf("could not load model")
+		return fmt.Errorf("rwkv could not load model")
 	}
 	llm.rwkv = model
 	return nil
--- a/backend/go/stores/main.go
+++ b/backend/go/stores/main.go
@@ -6,7 +6,7 @@ import (
 	"flag"
 	"os"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 )
--- a/backend/go/stores/store.go
+++ b/backend/go/stores/store.go
@@ -8,8 +8,8 @@ import (
 	"math"
 	"slices"

-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"

 	"github.com/rs/zerolog/log"
 )
--- a/backend/go/transcribe/main.go
+++ b/backend/go/transcribe/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/transcribe/transcript.go
+++ b/backend/go/transcribe/transcript.go
@@ -8,7 +8,7 @@ import (

 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	"github.com/go-audio/wav"
-	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/schema"
 )

 func ffmpegCommand(args []string) (string, error) {
@@ -29,7 +29,7 @@ func audioToWav(src, dst string) error {
 	return nil
 }

-func Transcript(model whisper.Model, audiopath, language string, threads uint) (schema.TranscriptionResult, error) {
+func Transcript(model whisper.Model, audiopath, language string, translate bool, threads uint) (schema.TranscriptionResult, error) {
 	res := schema.TranscriptionResult{}

 	dir, err := os.MkdirTemp("", "whisper")
@@ -75,6 +75,10 @@ func Transcript(model whisper.Model, audiopath, language string, threads uint) (
 		context.SetLanguage("auto")
 	}

+	if translate {
+		context.SetTranslate(true)
+	}
+
 	if err := context.Process(data, nil, nil); err != nil {
 		return res, err
 	}
--- a/backend/go/transcribe/whisper.go
+++ b/backend/go/transcribe/whisper.go
@@ -4,9 +4,9 @@ package main
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
-	"github.com/go-skynet/LocalAI/core/schema"
-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )

 type Whisper struct {
@@ -22,5 +22,5 @@ func (sd *Whisper) Load(opts *pb.ModelOptions) error {
 }

 func (sd *Whisper) AudioTranscription(opts *pb.TranscriptRequest) (schema.TranscriptionResult, error) {
-	return Transcript(sd.whisper, opts.Dst, opts.Language, uint(opts.Threads))
+	return Transcript(sd.whisper, opts.Dst, opts.Language, opts.Translate, uint(opts.Threads))
 }
--- a/backend/go/tts/main.go
+++ b/backend/go/tts/main.go
@@ -5,7 +5,7 @@ package main
 import (
 	"flag"

-	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )

 var (
--- a/backend/go/tts/piper.go
+++ b/backend/go/tts/piper.go
@@ -7,8 +7,8 @@ import (
 	"os"
 	"path/filepath"

-	"github.com/go-skynet/LocalAI/pkg/grpc/base"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 	piper "github.com/mudler/go-piper"
 )

--- a/backend/python/autogptq/requirements-hipblas.txt
+++ b/backend/python/autogptq/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
--- a/backend/python/autogptq/requirements-intel.txt
+++ b/backend/python/autogptq/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/bark/requirements-hipblas.txt
+++ b/backend/python/bark/requirements-hipblas.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
+torchaudio
--- a/backend/python/bark/requirements-intel.txt
+++ b/backend/python/bark/requirements-intel.txt
@@ -3,4 +3,4 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/common/libbackend.sh
+++ b/backend/python/common/libbackend.sh
@@ -148,13 +148,13 @@ function startBackend() {
    ensureVenv

    if [ ! -z ${BACKEND_FILE} ]; then
-        python ${BACKEND_FILE} $@
+        exec python ${BACKEND_FILE} $@
    elif [ -e "${MY_DIR}/server.py" ]; then
-        python ${MY_DIR}/server.py $@
+        exec python ${MY_DIR}/server.py $@
    elif [ -e "${MY_DIR}/backend.py" ]; then
-        python ${MY_DIR}/backend.py $@
+        exec python ${MY_DIR}/backend.py $@
    elif [ -e "${MY_DIR}/${BACKEND_NAME}.py" ]; then
-        python ${MY_DIR}/${BACKEND_NAME}.py $@
+        exec python ${MY_DIR}/${BACKEND_NAME}.py $@
    fi
 }

@@ -210,4 +210,4 @@ function checkTargets() {
    echo false
 }

-init
+init
--- a/backend/python/common/template/requirements-hipblas.txt
+++ b/backend/python/common/template/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
--- a/backend/python/coqui/backend.py
+++ b/backend/python/coqui/backend.py
@@ -66,7 +66,21 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

    def TTS(self, request, context):
        try:
-            self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=COQUI_LANGUAGE, file_path=request.dst)
+            # if model is multilangual add language from request or env as fallback
+            lang = request.language or COQUI_LANGUAGE
+            if lang == "":
+                lang = None
+            if self.tts.is_multi_lingual and lang is None:
+               return backend_pb2.Result(success=False, message=f"Model is multi-lingual, but no language was provided")
+
+            # if model is multi-speaker, use speaker_wav or the speaker_id from request.voice
+            if self.tts.is_multi_speaker and self.AudioPath is None and request.voice is None:
+                return backend_pb2.Result(success=False, message=f"Model is multi-speaker, but no speaker was provided")
+
+            if self.tts.is_multi_speaker and request.voice is not None:
+               self.tts.tts_to_file(text=request.text, speaker=request.voice, language=lang, file_path=request.dst)
+            else:
+                self.tts.tts_to_file(text=request.text, speaker_wav=self.AudioPath, language=lang, file_path=request.dst)
        except Exception as err:
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
        return backend_pb2.Result(success=True)
--- a/backend/python/coqui/requirements-hipblas.txt
+++ b/backend/python/coqui/requirements-hipblas.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
+torchaudio
--- a/backend/python/coqui/requirements-intel.txt
+++ b/backend/python/coqui/requirements-intel.txt
@@ -3,4 +3,4 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -17,7 +17,7 @@ import backend_pb2_grpc

 import grpc

-from diffusers import StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
+from diffusers import StableDiffusion3Pipeline, StableDiffusionXLPipeline, StableDiffusionDepth2ImgPipeline, DPMSolverMultistepScheduler, StableDiffusionPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
 from diffusers import StableDiffusionImg2ImgPipeline, AutoPipelineForText2Image, ControlNetModel, StableVideoDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import safety_checker
 from diffusers.utils import load_image,export_to_video
@@ -225,6 +225,17 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        torch_dtype=torchType, 
                        use_safetensors=True, 
                        variant=variant)
+            elif request.PipelineType == "StableDiffusion3Pipeline":
+                if fromSingleFile:
+                    self.pipe = StableDiffusion3Pipeline.from_single_file(modelFile,
+                                                               torch_dtype=torchType,
+                                                               use_safetensors=True)
+                else:
+                    self.pipe = StableDiffusion3Pipeline.from_pretrained(
+                        request.Model, 
+                        torch_dtype=torchType, 
+                        use_safetensors=True, 
+                        variant=variant)

            if CLIPSKIP and request.CLIPSkip != 0:
                self.clip_skip = request.CLIPSkip
--- a/backend/python/diffusers/requirements-hipblas.txt
+++ b/backend/python/diffusers/requirements-hipblas.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
+torchvision
--- a/backend/python/diffusers/requirements-intel.txt
+++ b/backend/python/diffusers/requirements-intel.txt
@@ -3,4 +3,4 @@ intel-extension-for-pytorch
 torch
 torchvision
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -5,6 +5,7 @@ grpcio==1.64.0
 opencv-python
 pillow
 protobuf
+sentencepiece
 torch
 transformers
-certifi
+certifi
--- a/backend/python/openvoice/requirements-hipblas.txt
+++ b/backend/python/openvoice/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
--- a/backend/python/parler-tts/requirements-hipblas.txt
+++ b/backend/python/parler-tts/requirements-hipblas.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
+torchaudio
--- a/backend/python/parler-tts/requirements-intel.txt
+++ b/backend/python/parler-tts/requirements-intel.txt
@@ -3,4 +3,4 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/petals/requirements-hipblas.txt
+++ b/backend/python/petals/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
--- a/backend/python/petals/requirements-intel.txt
+++ b/backend/python/petals/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/rerankers/requirements-hipblas.txt
+++ b/backend/python/rerankers/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
--- a/backend/python/rerankers/requirements-intel.txt
+++ b/backend/python/rerankers/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/sentencetransformers/requirements-hipblas.txt
+++ b/backend/python/sentencetransformers/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
--- a/backend/python/sentencetransformers/requirements-intel.txt
+++ b/backend/python/sentencetransformers/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers-musicgen/requirements-hipblas.txt
+++ b/backend/python/transformers-musicgen/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
--- a/backend/python/transformers-musicgen/requirements-intel.txt
+++ b/backend/python/transformers-musicgen/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers/backend.py
+++ b/backend/python/transformers/backend.py
@@ -21,10 +21,7 @@ import torch.cuda


 XPU=os.environ.get("XPU", "0") == "1"
-if XPU:
-    from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer
-else:
-    from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, set_seed, BitsAndBytesConfig, TextIteratorStreamer
+from transformers import AutoTokenizer, AutoModel, set_seed, TextIteratorStreamer, StoppingCriteriaList, StopStringCriteria


 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
@@ -77,11 +74,11 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        """
        model_name = request.Model

-        compute = "auto"
+        compute = torch.float16
        if request.F16Memory == True:
            compute=torch.bfloat16

-        self.CUDA = request.CUDA
+        self.CUDA = torch.cuda.is_available()
        self.OV=False

        device_map="cpu"
@@ -89,6 +86,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        quantization = None

        if self.CUDA:
+            from transformers import BitsAndBytesConfig, AutoModelForCausalLM
            if request.MainGPU:
                device_map=request.MainGPU
            else:
@@ -107,7 +105,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                    bnb_4bit_compute_dtype = None,
                    load_in_8bit=True,                                   
                )
-                                               
+
        try:
            if request.Type == "AutoModelForCausalLM":
                if XPU:
@@ -189,6 +187,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                                                                device=device_map)
                self.OV = True
            else:
+                print("Automodel", file=sys.stderr)
                self.model = AutoModel.from_pretrained(model_name, 
                                                       trust_remote_code=request.TrustRemoteCode,  
                                                       use_safetensors=True,  
@@ -246,28 +245,28 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):

        # Pool to get sentence embeddings; i.e. generate one 1024 vector for the entire sentence
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-#        print("Calculated embeddings for: " + request.Embeddings, file=sys.stderr)
-#        print("Embeddings:", sentence_embeddings, file=sys.stderr)
        return backend_pb2.EmbeddingResult(embeddings=sentence_embeddings[0])

    async def _predict(self, request, context, streaming=False): 
        set_seed(request.Seed)
-        if request.TopP == 0:
-            request.TopP = 0.9
+        if request.TopP < 0 or request.TopP > 1:
+            request.TopP = 1
        
-        if request.TopK == 0:
-            request.TopK = 40
+        if request.TopK <= 0:
+            request.TopK = 50
+
+        if request.Temperature > 0 :
+            sample=True
+        else:
+            sample=False
+            request.TopP == None
+            request.TopK == None
+            request.Temperature == None

        prompt = request.Prompt
        if not request.Prompt and request.UseTokenizerTemplate and request.Messages:    
            prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)

-        eos_token_id = self.tokenizer.eos_token_id
-        if request.StopPrompts:
-            eos_token_id = []
-            for word in request.StopPrompts:
-                eos_token_id.append(self.tokenizer.convert_tokens_to_ids(word))
-
        inputs = self.tokenizer(prompt, return_tensors="pt")

        if request.Tokens > 0:
@@ -281,6 +280,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
            inputs = inputs.to("xpu")
            streaming = False

+        criteria=[]
+        if request.StopPrompts:
+            criteria = StoppingCriteriaList(
+                [
+                    StopStringCriteria(tokenizer=self.tokenizer, stop_strings=request.StopPrompts),
+                ]
+            )
+
        if streaming:
            streamer=TextIteratorStreamer(self.tokenizer,
                                        skip_prompt=True,
@@ -290,11 +297,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                        temperature=request.Temperature, 
                        top_p=request.TopP,
                        top_k=request.TopK, 
-                        do_sample=True,
+                        do_sample=sample,
                        attention_mask=inputs["attention_mask"],
-                        eos_token_id=eos_token_id,
+                        eos_token_id=self.tokenizer.eos_token_id,
                        pad_token_id=self.tokenizer.eos_token_id,
-                        streamer=streamer)
+                        streamer=streamer,
+                        stopping_criteria=criteria,
+                        use_cache=True,
+                        )
            thread=Thread(target=self.model.generate, kwargs=config)
            thread.start()
            generated_text = ""
@@ -311,18 +321,20 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                                    temperature=request.Temperature, 
                                    top_p=request.TopP,
                                    top_k=request.TopK, 
-                                    do_sample=True,
+                                    do_sample=sample,
                                    pad_token=self.tokenizer.eos_token_id)
            else:
-                outputs = self.model.generate(inputs["input_ids"],
+                outputs = self.model.generate(**inputs,
                        max_new_tokens=max_tokens, 
                        temperature=request.Temperature, 
                        top_p=request.TopP,
                        top_k=request.TopK, 
-                        do_sample=True,
-                        attention_mask=inputs["attention_mask"],
-                        eos_token_id=eos_token_id,
-                        pad_token_id=self.tokenizer.eos_token_id)
+                        do_sample=sample,
+                        eos_token_id=self.tokenizer.eos_token_id,
+                        pad_token_id=self.tokenizer.eos_token_id,
+                        stopping_criteria=criteria,
+                        use_cache=True,
+                        )
            generated_text = self.tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0]

        if streaming:
--- a/backend/python/transformers/requirements-hipblas.txt
+++ b/backend/python/transformers/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
--- a/backend/python/transformers/requirements-intel.txt
+++ b/backend/python/transformers/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -3,4 +3,7 @@ transformers
 grpcio==1.64.0
 protobuf
 torch
-certifi
+certifi
+intel-extension-for-transformers
+bitsandbytes
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers/run.sh
+++ b/backend/python/transformers/run.sh
@@ -1,4 +1,10 @@
 #!/bin/bash
 source $(dirname $0)/../common/libbackend.sh

+if [ -d "/opt/intel" ]; then
+    # Assumes we are using the Intel oneAPI container image
+    # https://github.com/intel/intel-extension-for-pytorch/issues/538
+    export XPU=1
+fi
+
 startBackend $@
--- a/backend/python/vall-e-x/requirements-hipblas.txt
+++ b/backend/python/vall-e-x/requirements-hipblas.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
+torchaudio
--- a/backend/python/vall-e-x/requirements-intel.txt
+++ b/backend/python/vall-e-x/requirements-intel.txt
@@ -3,4 +3,4 @@ intel-extension-for-pytorch
 torch
 torchaudio
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vllm/requirements-hipblas.txt
+++ b/backend/python/vllm/requirements-hipblas.txt
@@ -0,0 +1,2 @@
+--extra-index-url https://download.pytorch.org/whl/rocm6.0
+torch
--- a/backend/python/vllm/requirements-intel.txt
+++ b/backend/python/vllm/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools
+setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
--- a/core/application.go
+++ b/core/application.go
@@ -1,9 +1,9 @@
 package core

 import (
-	"github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/services"
-	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/services"
+	"github.com/mudler/LocalAI/pkg/model"
 )

 // The purpose of this structure is to hold pointers to all initialized services, to make plumbing easy
@@ -28,7 +28,6 @@ type Application struct {
 	// LocalAI System Services
 	BackendMonitorService *services.BackendMonitorService
 	GalleryService        *services.GalleryService
-	ListModelsService     *services.ListModelsService
 	LocalAIMetricsService *services.LocalAIMetricsService
 	// OpenAIService         *services.OpenAIService
 }
--- a/core/backend/embeddings.go
+++ b/core/backend/embeddings.go
@@ -3,10 +3,10 @@ package backend
 import (
 	"fmt"

-	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/config"

-	"github.com/go-skynet/LocalAI/pkg/grpc"
-	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/grpc"
+	model "github.com/mudler/LocalAI/pkg/model"
 )

 func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
--- a/core/backend/image.go
+++ b/core/backend/image.go
@@ -1,10 +1,10 @@
 package backend

 import (
-	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/config"

-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	model "github.com/mudler/LocalAI/pkg/model"
 )

 func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -9,14 +9,14 @@ import (
 	"sync"
 	"unicode/utf8"

-	"github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"

-	"github.com/go-skynet/LocalAI/pkg/gallery"
-	"github.com/go-skynet/LocalAI/pkg/grpc"
-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/go-skynet/LocalAI/pkg/utils"
+	"github.com/mudler/LocalAI/core/gallery"
+	"github.com/mudler/LocalAI/pkg/grpc"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	model "github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/utils"
 )

 type LLMResponse struct {
@@ -57,7 +57,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
 			utils.ResetDownloadTimers()
 			// if we failed to load the model, we try to download it
-			err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
+			err := gallery.InstallModelFromGallery(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction, o.EnforcePredownloadScans)
 			if err != nil {
 				return nil, err
 			}
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -5,9 +5,9 @@ import (
 	"os"
 	"path/filepath"

-	"github.com/go-skynet/LocalAI/core/config"
-	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/core/config"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )

@@ -142,12 +142,14 @@ func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOption
 		MirostatTAU:         float32(*c.LLMConfig.MirostatTAU),
 		Debug:               *c.Debug,
 		StopPrompts:         c.StopWords,
-		Repeat:              int32(c.RepeatPenalty),
+		Repeat:              int32(c.RepeatLastN),
+		FrequencyPenalty:    float32(c.FrequencyPenalty),
+		PresencePenalty:     float32(c.PresencePenalty),
+		Penalty:             float32(c.RepeatPenalty),
 		NKeep:               int32(c.Keep),
 		Batch:               int32(c.Batch),
 		IgnoreEOS:           c.IgnoreEOS,
 		Seed:                getSeed(c),
-		FrequencyPenalty:    float32(c.FrequencyPenalty),
 		MLock:               *c.MMlock,
 		MMap:                *c.MMap,
 		MainGPU:             c.MainGPU,
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -4,9 +4,9 @@ import (
 	"context"
 	"fmt"

-	"github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	model "github.com/mudler/LocalAI/pkg/model"
 )

 func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
--- a/core/backend/stores.go
+++ b/core/backend/stores.go
@@ -1,10 +1,10 @@
 package backend

 import (
-	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/config"

-	"github.com/go-skynet/LocalAI/pkg/grpc"
-	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/grpc"
+	"github.com/mudler/LocalAI/pkg/model"
 )

 func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string) (grpc.Backend, error) {
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -4,14 +4,14 @@ import (
 	"context"
 	"fmt"

-	"github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"

-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	model "github.com/mudler/LocalAI/pkg/model"
 )

-func ModelTranscription(audio, language string, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {
+func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {

 	opts := modelOpts(backendConfig, appConfig, []model.Option{
 		model.WithBackendString(model.WhisperBackend),
@@ -31,8 +31,9 @@ func ModelTranscription(audio, language string, ml *model.ModelLoader, backendCo
 	}

 	return whisperModel.AudioTranscription(context.Background(), &proto.TranscriptRequest{
-		Dst:      audio,
-		Language: language,
-		Threads:  uint32(*backendConfig.Threads),
+		Dst:       audio,
+		Language:  language,
+		Translate: translate,
+		Threads:   uint32(*backendConfig.Threads),
 	})
 }
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -6,11 +6,11 @@ import (
 	"os"
 	"path/filepath"

-	"github.com/go-skynet/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/config"

-	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
-	model "github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/go-skynet/LocalAI/pkg/utils"
+	"github.com/mudler/LocalAI/pkg/grpc/proto"
+	model "github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/utils"
 )

 func generateUniqueFileName(dir, baseName, ext string) string {
@@ -29,7 +29,16 @@ func generateUniqueFileName(dir, baseName, ext string) string {
 	}
 }

-func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (string, *proto.Result, error) {
+func ModelTTS(
+	backend,
+	text,
+	modelFile,
+	voice ,
+	language string,
+	loader *model.ModelLoader,
+	appConfig *config.ApplicationConfig,
+	backendConfig config.BackendConfig,
+) (string, *proto.Result, error) {
 	bb := backend
 	if bb == "" {
 		bb = model.PiperBackend
@@ -83,7 +92,13 @@ func ModelTTS(backend, text, modelFile, voice string, loader *model.ModelLoader,
 		Model: modelPath,
 		Voice: voice,
 		Dst:   filePath,
+		Language: &language,
 	})

+	// return RPC error if any
+	if !res.Success {
+		return "", nil, fmt.Errorf(res.Message)
+	}
+
 	return filePath, res, err
 }
--- a/core/cli/cli.go
+++ b/core/cli/cli.go
@@ -1,16 +1,18 @@
 package cli

 import (
-	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
-	"github.com/go-skynet/LocalAI/core/cli/worker"
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/cli/worker"
 )

 var CLI struct {
 	cliContext.Context `embed:""`

 	Run        RunCMD        `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
+	Federated  FederatedCLI  `cmd:"" help:"Run LocalAI in federated mode"`
 	Models     ModelsCMD     `cmd:"" help:"Manage LocalAI models and definitions"`
 	TTS        TTSCMD        `cmd:"" help:"Convert text to speech"`
 	Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
 	Worker     worker.Worker `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
+	Util       UtilCMD       `cmd:"" help:"Utility commands"`
 }
--- a/core/cli/federated.go
+++ b/core/cli/federated.go
@@ -0,0 +1,130 @@
+package cli
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"net"
+	"time"
+
+	"math/rand/v2"
+
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/p2p"
+	"github.com/mudler/edgevpn/pkg/node"
+	"github.com/mudler/edgevpn/pkg/protocol"
+	"github.com/mudler/edgevpn/pkg/types"
+	"github.com/rs/zerolog/log"
+)
+
+type FederatedCLI struct {
+	Address        string `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
+	Peer2PeerToken string `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
+}
+
+func (f *FederatedCLI) Run(ctx *cliContext.Context) error {
+
+	n, err := p2p.NewNode(f.Peer2PeerToken)
+	if err != nil {
+		return fmt.Errorf("creating a new node: %w", err)
+	}
+	err = n.Start(context.Background())
+	if err != nil {
+		return fmt.Errorf("creating a new node: %w", err)
+	}
+
+	if err := p2p.ServiceDiscoverer(context.Background(), n, f.Peer2PeerToken, p2p.FederatedID, nil); err != nil {
+		return err
+	}
+
+	return Proxy(context.Background(), n, f.Address, p2p.FederatedID)
+}
+
+func Proxy(ctx context.Context, node *node.Node, listenAddr, service string) error {
+
+	log.Info().Msgf("Allocating service '%s' on: %s", service, listenAddr)
+	// Open local port for listening
+	l, err := net.Listen("tcp", listenAddr)
+	if err != nil {
+		log.Error().Err(err).Msg("Error listening")
+		return err
+	}
+	//	ll.Info("Binding local port on", srcaddr)
+
+	ledger, _ := node.Ledger()
+
+	// Announce ourselves so nodes accepts our connection
+	ledger.Announce(
+		ctx,
+		10*time.Second,
+		func() {
+			// Retrieve current ID for ip in the blockchain
+			//_, found := ledger.GetKey(protocol.UsersLedgerKey, node.Host().ID().String())
+			// If mismatch, update the blockchain
+			//if !found {
+			updatedMap := map[string]interface{}{}
+			updatedMap[node.Host().ID().String()] = &types.User{
+				PeerID:    node.Host().ID().String(),
+				Timestamp: time.Now().String(),
+			}
+			ledger.Add(protocol.UsersLedgerKey, updatedMap)
+			//	}
+		},
+	)
+
+	defer l.Close()
+	for {
+		select {
+		case <-ctx.Done():
+			return errors.New("context canceled")
+		default:
+			log.Debug().Msg("New for connection")
+			// Listen for an incoming connection.
+			conn, err := l.Accept()
+			if err != nil {
+				fmt.Println("Error accepting: ", err.Error())
+				continue
+			}
+
+			// Handle connections in a new goroutine, forwarding to the p2p service
+			go func() {
+				var tunnelAddresses []string
+				for _, v := range p2p.GetAvailableNodes(p2p.FederatedID) {
+					if v.IsOnline() {
+						tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
+					} else {
+						log.Info().Msgf("Node %s is offline", v.ID)
+					}
+				}
+
+				// open a TCP stream to one of the tunnels
+				// chosen randomly
+				// TODO: optimize this and track usage
+				tunnelAddr := tunnelAddresses[rand.IntN(len(tunnelAddresses))]
+
+				tunnelConn, err := net.Dial("tcp", tunnelAddr)
+				if err != nil {
+					log.Error().Err(err).Msg("Error connecting to tunnel")
+					return
+				}
+
+				log.Info().Msgf("Redirecting %s to %s", conn.LocalAddr().String(), tunnelConn.RemoteAddr().String())
+				closer := make(chan struct{}, 2)
+				go copyStream(closer, tunnelConn, conn)
+				go copyStream(closer, conn, tunnelConn)
+				<-closer
+
+				tunnelConn.Close()
+				conn.Close()
+				//	ll.Infof("(service %s) Done handling %s", serviceID, l.Addr().String())
+			}()
+		}
+	}
+
+}
+
+func copyStream(closer chan struct{}, dst io.Writer, src io.Reader) {
+	defer func() { closer <- struct{}{} }() // connection is closed, send signal to stop proxy
+	io.Copy(dst, src)
+}
--- a/core/cli/models.go
+++ b/core/cli/models.go
@@ -2,17 +2,21 @@ package cli

 import (
 	"encoding/json"
+	"errors"
 	"fmt"

-	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/config"

-	"github.com/go-skynet/LocalAI/pkg/gallery"
+	"github.com/mudler/LocalAI/core/gallery"
+	"github.com/mudler/LocalAI/pkg/downloader"
+	"github.com/mudler/LocalAI/pkg/startup"
 	"github.com/rs/zerolog/log"
 	"github.com/schollz/progressbar/v3"
 )

 type ModelsCMDFlags struct {
-	Galleries  string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models"`
+	Galleries  string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
 	ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 }

@@ -21,7 +25,8 @@ type ModelsList struct {
 }

 type ModelsInstall struct {
-	ModelArgs []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`
+	DisablePredownloadScan bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
+	ModelArgs              []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`

 	ModelsCMDFlags `embed:""`
 }
@@ -32,7 +37,7 @@ type ModelsCMD struct {
 }

 func (ml *ModelsList) Run(ctx *cliContext.Context) error {
-	var galleries []gallery.Gallery
+	var galleries []config.Gallery
 	if err := json.Unmarshal([]byte(ml.Galleries), &galleries); err != nil {
 		log.Error().Err(err).Msg("unable to load galleries")
 	}
@@ -52,29 +57,51 @@ func (ml *ModelsList) Run(ctx *cliContext.Context) error {
 }

 func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
-	modelName := mi.ModelArgs[0]
-
-	var galleries []gallery.Gallery
+	var galleries []config.Gallery
 	if err := json.Unmarshal([]byte(mi.Galleries), &galleries); err != nil {
 		log.Error().Err(err).Msg("unable to load galleries")
 	}

-	progressBar := progressbar.NewOptions(
-		1000,
-		progressbar.OptionSetDescription(fmt.Sprintf("downloading model %s", modelName)),
-		progressbar.OptionShowBytes(false),
-		progressbar.OptionClearOnFinish(),
-	)
-	progressCallback := func(fileName string, current string, total string, percentage float64) {
-		v := int(percentage * 10)
-		err := progressBar.Set(v)
-		if err != nil {
-			log.Error().Err(err).Str("filename", fileName).Int("value", v).Msg("error while updating progress bar")
+	for _, modelName := range mi.ModelArgs {
+
+		progressBar := progressbar.NewOptions(
+			1000,
+			progressbar.OptionSetDescription(fmt.Sprintf("downloading model %s", modelName)),
+			progressbar.OptionShowBytes(false),
+			progressbar.OptionClearOnFinish(),
+		)
+		progressCallback := func(fileName string, current string, total string, percentage float64) {
+			v := int(percentage * 10)
+			err := progressBar.Set(v)
+			if err != nil {
+				log.Error().Err(err).Str("filename", fileName).Int("value", v).Msg("error while updating progress bar")
+			}
+		}
+		//startup.InstallModels()
+		models, err := gallery.AvailableGalleryModels(galleries, mi.ModelsPath)
+		if err != nil {
+			return err
+		}
+
+		if !downloader.LooksLikeOCI(modelName) {
+			model := gallery.FindModel(models, modelName, mi.ModelsPath)
+			if model == nil {
+				log.Error().Str("model", modelName).Msg("model not found")
+				return err
+			}
+
+			err = gallery.SafetyScanGalleryModel(model)
+			if err != nil && !errors.Is(err, downloader.ErrNonHuggingFaceFile) {
+				return err
+			}
+
+			log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
+		}
+
+		err = startup.InstallModels(galleries, "", mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
+		if err != nil {
+			return err
 		}
-	}
-	err := gallery.InstallModelFromGallery(galleries, modelName, mi.ModelsPath, gallery.GalleryModel{}, progressCallback)
-	if err != nil {
-		return err
 	}
 	return nil
 }
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -3,14 +3,16 @@ package cli
 import (
 	"context"
 	"fmt"
+	"net"
+	"os"
 	"strings"
 	"time"

-	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
-	"github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/core/http"
-	"github.com/go-skynet/LocalAI/core/p2p"
-	"github.com/go-skynet/LocalAI/core/startup"
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/http"
+	"github.com/mudler/LocalAI/core/p2p"
+	"github.com/mudler/LocalAI/core/startup"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 )
@@ -37,25 +39,30 @@ type RunCMD struct {
 	PreloadModelsConfig string   `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`

 	F16         bool `name:"f16" env:"LOCALAI_F16,F16" help:"Enable GPU acceleration" group:"performance"`
-	Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" default:"4" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
+	Threads     int  `env:"LOCALAI_THREADS,THREADS" short:"t" help:"Number of threads used for parallel computation. Usage of the number of physical cores in the system is suggested" group:"performance"`
 	ContextSize int  `env:"LOCALAI_CONTEXT_SIZE,CONTEXT_SIZE" default:"512" help:"Default context size for models" group:"performance"`

-	Address              string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
-	CORS                 bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
-	CORSAllowOrigins     string   `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
-	UploadLimit          int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
-	APIKeys              []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
-	DisableWebUI         bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
-	Peer2Peer            bool     `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
-	Peer2PeerToken       string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
-	ParallelRequests     bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
-	SingleActiveBackend  bool     `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
-	PreloadBackendOnly   bool     `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
-	ExternalGRPCBackends []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
-	EnableWatchdogIdle   bool     `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
-	WatchdogIdleTimeout  string   `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
-	EnableWatchdogBusy   bool     `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
-	WatchdogBusyTimeout  string   `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
+	Address                string   `env:"LOCALAI_ADDRESS,ADDRESS" default:":8080" help:"Bind address for the API server" group:"api"`
+	CORS                   bool     `env:"LOCALAI_CORS,CORS" help:"" group:"api"`
+	CORSAllowOrigins       string   `env:"LOCALAI_CORS_ALLOW_ORIGINS,CORS_ALLOW_ORIGINS" group:"api"`
+	LibraryPath            string   `env:"LOCALAI_LIBRARY_PATH,LIBRARY_PATH" help:"Path to the library directory (for e.g. external libraries used by backends)" default:"/usr/share/local-ai/libs" group:"backends"`
+	CSRF                   bool     `env:"LOCALAI_CSRF" help:"Enables fiber CSRF middleware" group:"api"`
+	UploadLimit            int      `env:"LOCALAI_UPLOAD_LIMIT,UPLOAD_LIMIT" default:"15" help:"Default upload-limit in MB" group:"api"`
+	APIKeys                []string `env:"LOCALAI_API_KEY,API_KEY" help:"List of API Keys to enable API authentication. When this is set, all the requests must be authenticated with one of these API keys" group:"api"`
+	DisableWebUI           bool     `env:"LOCALAI_DISABLE_WEBUI,DISABLE_WEBUI" default:"false" help:"Disable webui" group:"api"`
+	DisablePredownloadScan bool     `env:"LOCALAI_DISABLE_PREDOWNLOAD_SCAN" help:"If true, disables the best-effort security scanner before downloading any files." group:"hardening" default:"false"`
+	OpaqueErrors           bool     `env:"LOCALAI_OPAQUE_ERRORS" default:"false" help:"If true, all error responses are replaced with blank 500 errors. This is intended only for hardening against information leaks and is normally not recommended." group:"hardening"`
+	Peer2Peer              bool     `env:"LOCALAI_P2P,P2P" name:"p2p" default:"false" help:"Enable P2P mode" group:"p2p"`
+	Peer2PeerToken         string   `env:"LOCALAI_P2P_TOKEN,P2P_TOKEN,TOKEN" name:"p2ptoken" help:"Token for P2P mode (optional)" group:"p2p"`
+	ParallelRequests       bool     `env:"LOCALAI_PARALLEL_REQUESTS,PARALLEL_REQUESTS" help:"Enable backends to handle multiple requests in parallel if they support it (e.g.: llama.cpp or vllm)" group:"backends"`
+	SingleActiveBackend    bool     `env:"LOCALAI_SINGLE_ACTIVE_BACKEND,SINGLE_ACTIVE_BACKEND" help:"Allow only one backend to be run at a time" group:"backends"`
+	PreloadBackendOnly     bool     `env:"LOCALAI_PRELOAD_BACKEND_ONLY,PRELOAD_BACKEND_ONLY" default:"false" help:"Do not launch the API services, only the preloaded models / backends are started (useful for multi-node setups)" group:"backends"`
+	ExternalGRPCBackends   []string `env:"LOCALAI_EXTERNAL_GRPC_BACKENDS,EXTERNAL_GRPC_BACKENDS" help:"A list of external grpc backends" group:"backends"`
+	EnableWatchdogIdle     bool     `env:"LOCALAI_WATCHDOG_IDLE,WATCHDOG_IDLE" default:"false" help:"Enable watchdog for stopping backends that are idle longer than the watchdog-idle-timeout" group:"backends"`
+	WatchdogIdleTimeout    string   `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
+	EnableWatchdogBusy     bool     `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
+	WatchdogBusyTimeout    string   `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
+	Federated              bool     `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
 }

 func (r *RunCMD) Run(ctx *cliContext.Context) error {
@@ -77,17 +84,22 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithModelLibraryURL(r.RemoteLibrary),
 		config.WithCors(r.CORS),
 		config.WithCorsAllowOrigins(r.CORSAllowOrigins),
+		config.WithCsrf(r.CSRF),
+		config.WithLibPath(r.LibraryPath),
 		config.WithThreads(r.Threads),
 		config.WithBackendAssets(ctx.BackendAssets),
 		config.WithBackendAssetsOutput(r.BackendAssetsPath),
 		config.WithUploadLimitMB(r.UploadLimit),
 		config.WithApiKeys(r.APIKeys),
 		config.WithModelsURL(append(r.Models, r.ModelArgs...)...),
+		config.WithOpaqueErrors(r.OpaqueErrors),
+		config.WithEnforcedPredownloadScans(!r.DisablePredownloadScan),
 	}

+	token := ""
 	if r.Peer2Peer || r.Peer2PeerToken != "" {
 		log.Info().Msg("P2P mode enabled")
-		token := r.Peer2PeerToken
+		token = r.Peer2PeerToken
 		if token == "" {
 			// IF no token is provided, and p2p is enabled,
 			// we generate one and wait for the user to pick up the token (this is for interactive)
@@ -98,14 +110,46 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {

 			log.Info().Msg("To use the token, you can run the following command in another node or terminal:")
 			fmt.Printf("export TOKEN=\"%s\"\nlocal-ai worker p2p-llama-cpp-rpc\n", token)
-
-			// Ask for user confirmation
-			log.Info().Msg("Press a button to proceed")
-			var input string
-			fmt.Scanln(&input)
 		}
+		opts = append(opts, config.WithP2PToken(token))
+
+		node, err := p2p.NewNode(token)
+		if err != nil {
+			return err
+		}
+
 		log.Info().Msg("Starting P2P server discovery...")
-		if err := p2p.LLamaCPPRPCServerDiscoverer(context.Background(), token); err != nil {
+		if err := p2p.ServiceDiscoverer(context.Background(), node, token, "", func() {
+			var tunnelAddresses []string
+			for _, v := range p2p.GetAvailableNodes("") {
+				if v.IsOnline() {
+					tunnelAddresses = append(tunnelAddresses, v.TunnelAddress)
+				} else {
+					log.Info().Msgf("Node %s is offline", v.ID)
+				}
+			}
+			tunnelEnvVar := strings.Join(tunnelAddresses, ",")
+
+			os.Setenv("LLAMACPP_GRPC_SERVERS", tunnelEnvVar)
+			log.Debug().Msgf("setting LLAMACPP_GRPC_SERVERS to %s", tunnelEnvVar)
+		}); err != nil {
+			return err
+		}
+	}
+
+	if r.Federated {
+		_, port, err := net.SplitHostPort(r.Address)
+		if err != nil {
+			return err
+		}
+		if err := p2p.ExposeService(context.Background(), "localhost", port, token, p2p.FederatedID); err != nil {
+			return err
+		}
+		node, err := p2p.NewNode(token)
+		if err != nil {
+			return err
+		}
+		if err := p2p.ServiceDiscoverer(context.Background(), node, token, p2p.FederatedID, nil); err != nil {
 			return err
 		}
 	}
--- a/core/cli/transcript.go
+++ b/core/cli/transcript.go
@@ -5,10 +5,10 @@ import (
 	"errors"
 	"fmt"

-	"github.com/go-skynet/LocalAI/core/backend"
-	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
-	"github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/core/backend"
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )

@@ -18,6 +18,7 @@ type TranscriptCMD struct {
 	Backend           string `short:"b" default:"whisper" help:"Backend to run the transcription model"`
 	Model             string `short:"m" required:"" help:"Model name to run the TTS"`
 	Language          string `short:"l" help:"Language of the audio file"`
+	Translate         bool   `short:"c" help:"Translate the transcription to english"`
 	Threads           int    `short:"t" default:"1" help:"Number of threads used for parallel computation"`
 	ModelsPath        string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
@@ -30,7 +31,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 		AssetsDestination: t.BackendAssetsPath,
 	}

-	cl := config.NewBackendConfigLoader()
+	cl := config.NewBackendConfigLoader(t.ModelsPath)
 	ml := model.NewModelLoader(opts.ModelPath)
 	if err := cl.LoadBackendConfigsFromPath(t.ModelsPath); err != nil {
 		return err
@@ -50,7 +51,7 @@ func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
 		}
 	}()

-	tr, err := backend.ModelTranscription(t.Filename, t.Language, ml, c, opts)
+	tr, err := backend.ModelTranscription(t.Filename, t.Language, t.Translate, ml, c, opts)
 	if err != nil {
 		return err
 	}
--- a/core/cli/tts.go
+++ b/core/cli/tts.go
@@ -7,10 +7,10 @@ import (
 	"path/filepath"
 	"strings"

-	"github.com/go-skynet/LocalAI/core/backend"
-	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
-	"github.com/go-skynet/LocalAI/core/config"
-	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/core/backend"
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )

@@ -20,6 +20,7 @@ type TTSCMD struct {
 	Backend           string `short:"b" default:"piper" help:"Backend to run the TTS model"`
 	Model             string `short:"m" required:"" help:"Model name to run the TTS"`
 	Voice             string `short:"v" help:"Voice name to run the TTS"`
+	Language          string `short:"l" help:"Language to use with the TTS"`
 	OutputFile        string `short:"o" type:"path" help:"The path to write the output wav file"`
 	ModelsPath        string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
@@ -52,7 +53,7 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 	options := config.BackendConfig{}
 	options.SetDefaults()

-	filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, ml, opts, options)
+	filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
 	if err != nil {
 		return err
 	}
--- a/core/cli/util.go
+++ b/core/cli/util.go
@@ -0,0 +1,101 @@
+package cli
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+
+	"github.com/rs/zerolog/log"
+
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/gallery"
+	"github.com/mudler/LocalAI/pkg/downloader"
+	gguf "github.com/thxcode/gguf-parser-go"
+)
+
+type UtilCMD struct {
+	GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
+	HFScan   HFScanCMD   `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
+}
+
+type GGUFInfoCMD struct {
+	Args   []string `arg:"" optional:"" name:"args" help:"Arguments to pass to the utility command"`
+	Header bool     `optional:"" default:"false" name:"header" help:"Show header information"`
+}
+
+type HFScanCMD struct {
+	ModelsPath string   `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
+	Galleries  string   `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
+	ToScan     []string `arg:""`
+}
+
+func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
+	if u.Args == nil || len(u.Args) == 0 {
+		return fmt.Errorf("no GGUF file provided")
+	}
+	// We try to guess only if we don't have a template defined already
+	f, err := gguf.ParseGGUFFile(u.Args[0])
+	if err != nil {
+		// Only valid for gguf files
+		log.Error().Msgf("guessDefaultsFromFile: %s", "not a GGUF file")
+		return err
+	}
+
+	log.Info().
+		Any("eosTokenID", f.Tokenizer().EOSTokenID).
+		Any("bosTokenID", f.Tokenizer().BOSTokenID).
+		Any("modelName", f.Model().Name).
+		Any("architecture", f.Architecture().Architecture).Msgf("GGUF file loaded: %s", u.Args[0])
+
+	log.Info().Any("tokenizer", fmt.Sprintf("%+v", f.Tokenizer())).Msg("Tokenizer")
+	log.Info().Any("architecture", fmt.Sprintf("%+v", f.Architecture())).Msg("Architecture")
+
+	v, exists := f.Header.MetadataKV.Get("tokenizer.chat_template")
+	if exists {
+		log.Info().Msgf("chat_template: %s", v.ValueString())
+	}
+
+	if u.Header {
+		for _, metadata := range f.Header.MetadataKV {
+			log.Info().Msgf("%s: %+v", metadata.Key, metadata.Value)
+		}
+		//	log.Info().Any("header", fmt.Sprintf("%+v", f.Header)).Msg("Header")
+	}
+
+	return nil
+}
+
+func (hfscmd *HFScanCMD) Run(ctx *cliContext.Context) error {
+	log.Info().Msg("LocalAI Security Scanner - This is BEST EFFORT functionality! Currently limited to huggingface models!")
+	if len(hfscmd.ToScan) == 0 {
+		log.Info().Msg("Checking all installed models against galleries")
+		var galleries []config.Gallery
+		if err := json.Unmarshal([]byte(hfscmd.Galleries), &galleries); err != nil {
+			log.Error().Err(err).Msg("unable to load galleries")
+		}
+
+		err := gallery.SafetyScanGalleryModels(galleries, hfscmd.ModelsPath)
+		if err == nil {
+			log.Info().Msg("No security warnings were detected for your installed models. Please note that this is a BEST EFFORT tool, and all issues may not be detected.")
+		} else {
+			log.Error().Err(err).Msg("! WARNING ! A known-vulnerable model is installed!")
+		}
+		return err
+	} else {
+		var errs error = nil
+		for _, uri := range hfscmd.ToScan {
+			log.Info().Str("uri", uri).Msg("scanning specific uri")
+			scanResults, err := downloader.HuggingFaceScan(uri)
+			if err != nil && !errors.Is(err, downloader.ErrNonHuggingFaceFile) {
+				log.Error().Err(err).Strs("clamAV", scanResults.ClamAVInfectedFiles).Strs("pickles", scanResults.DangerousPickles).Msg("! WARNING ! A known-vulnerable model is included in this repo!")
+				errs = errors.Join(errs, err)
+			}
+		}
+		if errs != nil {
+			return errs
+		}
+		log.Info().Msg("No security warnings were detected for your installed models. Please note that this is a BEST EFFORT tool, and all issues may not be detected.")
+		return nil
+	}
+}
--- a/core/cli/worker/worker_llamacpp.go
+++ b/core/cli/worker/worker_llamacpp.go
@@ -5,8 +5,9 @@ import (
 	"os"
 	"syscall"

-	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
-	"github.com/go-skynet/LocalAI/pkg/assets"
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/pkg/assets"
+	"github.com/mudler/LocalAI/pkg/library"
 	"github.com/rs/zerolog/log"
 )

@@ -27,17 +28,18 @@ func (r *LLamaCPP) Run(ctx *cliContext.Context) error {
 		return fmt.Errorf("usage: local-ai worker llama-cpp-rpc -- <llama-rpc-server-args>")
 	}

+	grpcProcess := assets.ResolvePath(
+		r.BackendAssetsPath,
+		"util",
+		"llama-cpp-rpc-server",
+	)
+
+	args := os.Args[4:]
+	args, grpcProcess = library.LoadLDSO(r.BackendAssetsPath, args, grpcProcess)
+
+	args = append([]string{grpcProcess}, args...)
 	return syscall.Exec(
-		assets.ResolvePath(
-			r.BackendAssetsPath,
-			"util",
-			"llama-cpp-rpc-server",
-		),
-		append([]string{
-			assets.ResolvePath(
-				r.BackendAssetsPath,
-				"util",
-				"llama-cpp-rpc-server",
-			)}, os.Args[4:]...),
+		grpcProcess,
+		args,
 		os.Environ())
 }
--- a/core/cli/worker/worker_nop2p.go
+++ b/core/cli/worker/worker_nop2p.go
@@ -6,7 +6,7 @@ package worker
 import (
 	"fmt"

-	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
 )

 type P2P struct{}
--- a/core/cli/worker/worker_p2p.go
+++ b/core/cli/worker/worker_p2p.go
@@ -10,16 +10,17 @@ import (
 	"os/exec"
 	"time"

-	cliContext "github.com/go-skynet/LocalAI/core/cli/context"
-	"github.com/go-skynet/LocalAI/core/p2p"
-	"github.com/go-skynet/LocalAI/pkg/assets"
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+	"github.com/mudler/LocalAI/core/p2p"
+	"github.com/mudler/LocalAI/pkg/assets"
+	"github.com/mudler/LocalAI/pkg/library"
 	"github.com/phayes/freeport"
 	"github.com/rs/zerolog/log"
 )

 type P2P struct {
 	WorkerFlags       `embed:""`
-	Token             string   `env:"LOCALAI_TOKEN,TOKEN" help:"JSON list of galleries"`
+	Token             string   `env:"LOCALAI_TOKEN,LOCALAI_P2P_TOKEN,TOKEN" help:"P2P token to use"`
 	NoRunner          bool     `env:"LOCALAI_NO_RUNNER,NO_RUNNER" help:"Do not start the llama-cpp-rpc-server"`
 	RunnerAddress     string   `env:"LOCALAI_RUNNER_ADDRESS,RUNNER_ADDRESS" help:"Address of the llama-cpp-rpc-server"`
 	RunnerPort        string   `env:"LOCALAI_RUNNER_PORT,RUNNER_PORT" help:"Port of the llama-cpp-rpc-server"`
@@ -58,7 +59,7 @@ func (r *P2P) Run(ctx *cliContext.Context) error {
 			p = r.RunnerPort
 		}

-		err = p2p.BindLLamaCPPWorker(context.Background(), address, p, r.Token)
+		err = p2p.ExposeService(context.Background(), address, p, r.Token, "")
 		if err != nil {
 			return err
 		}
@@ -71,13 +72,18 @@ func (r *P2P) Run(ctx *cliContext.Context) error {
 	go func() {
 		for {
 			log.Info().Msgf("Starting llama-cpp-rpc-server on '%s:%d'", address, port)
+
+			grpcProcess := assets.ResolvePath(
+				r.BackendAssetsPath,
+				"util",
+				"llama-cpp-rpc-server",
+			)
+
+			args := append([]string{"--host", address, "--port", fmt.Sprint(port)}, r.ExtraLLamaCPPArgs...)
+			args, grpcProcess = library.LoadLDSO(r.BackendAssetsPath, args, grpcProcess)
+
 			cmd := exec.Command(
-				assets.ResolvePath(
-					r.BackendAssetsPath,
-					"util",
-					"llama-cpp-rpc-server",
-				),
-				append([]string{"--host", address, "--port", fmt.Sprint(port)}, r.ExtraLLamaCPPArgs...)...,
+				grpcProcess, args...,
 			)

 			cmd.Env = os.Environ()
@@ -86,14 +92,14 @@ func (r *P2P) Run(ctx *cliContext.Context) error {
 			cmd.Stdout = os.Stdout

 			if err := cmd.Start(); err != nil {
-				log.Error().Err(err).Msg("Failed to start llama-cpp-rpc-server")
+				log.Error().Any("grpcProcess", grpcProcess).Any("args", args).Err(err).Msg("Failed to start llama-cpp-rpc-server")
 			}

 			cmd.Wait()
 		}
 	}()

-	err = p2p.BindLLamaCPPWorker(context.Background(), address, fmt.Sprint(port), r.Token)
+	err = p2p.ExposeService(context.Background(), address, fmt.Sprint(port), r.Token, "")
 	if err != nil {
 		return err
 	}
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -6,7 +6,7 @@ import (
 	"encoding/json"
 	"time"

-	"github.com/go-skynet/LocalAI/pkg/gallery"
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
 	"github.com/rs/zerolog/log"
 )

@@ -14,6 +14,7 @@ type ApplicationConfig struct {
 	Context                             context.Context
 	ConfigFile                          string
 	ModelPath                           string
+	LibPath                             string
 	UploadLimitMB, Threads, ContextSize int
 	DisableWebUI                        bool
 	F16                                 bool
@@ -25,14 +26,18 @@ type ApplicationConfig struct {
 	DynamicConfigsDir                   string
 	DynamicConfigsDirPollInterval       time.Duration
 	CORS                                bool
+	CSRF                                bool
 	PreloadJSONModels                   string
 	PreloadModelsFromPath               string
 	CORSAllowOrigins                    string
 	ApiKeys                             []string
+	EnforcePredownloadScans             bool
+	OpaqueErrors                        bool
+	P2PToken                            string

 	ModelLibraryURL string

-	Galleries []gallery.Gallery
+	Galleries []Gallery

 	BackendAssets     embed.FS
 	AssetsDestination string
@@ -59,7 +64,6 @@ func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
 	opt := &ApplicationConfig{
 		Context:       context.Background(),
 		UploadLimitMB: 15,
-		Threads:       1,
 		ContextSize:   512,
 		Debug:         true,
 	}
@@ -87,12 +91,30 @@ func WithCors(b bool) AppOption {
 	}
 }

+func WithCsrf(b bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.CSRF = b
+	}
+}
+
+func WithP2PToken(s string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.P2PToken = s
+	}
+}
+
 func WithModelLibraryURL(url string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.ModelLibraryURL = url
 	}
 }

+func WithLibPath(path string) AppOption {
+	return func(o *ApplicationConfig) {
+		o.LibPath = path
+	}
+}
+
 var EnableWatchDog = func(o *ApplicationConfig) {
 	o.WatchDog = true
 }
@@ -165,10 +187,10 @@ func WithBackendAssets(f embed.FS) AppOption {
 func WithStringGalleries(galls string) AppOption {
 	return func(o *ApplicationConfig) {
 		if galls == "" {
-			o.Galleries = []gallery.Gallery{}
+			o.Galleries = []Gallery{}
 			return
 		}
-		var galleries []gallery.Gallery
+		var galleries []Gallery
 		if err := json.Unmarshal([]byte(galls), &galleries); err != nil {
 			log.Error().Err(err).Msg("failed loading galleries")
 		}
@@ -176,7 +198,7 @@ func WithStringGalleries(galls string) AppOption {
 	}
 }

-func WithGalleries(galleries []gallery.Gallery) AppOption {
+func WithGalleries(galleries []Gallery) AppOption {
 	return func(o *ApplicationConfig) {
 		o.Galleries = append(o.Galleries, galleries...)
 	}
@@ -213,6 +235,9 @@ func WithUploadLimitMB(limit int) AppOption {

 func WithThreads(threads int) AppOption {
 	return func(o *ApplicationConfig) {
+		if threads == 0 { // 0 is not allowed
+			threads = xsysinfo.CPUPhysicalCores()
+		}
 		o.Threads = threads
 	}
 }
@@ -277,6 +302,18 @@ func WithApiKeys(apiKeys []string) AppOption {
 	}
 }

+func WithEnforcedPredownloadScans(enforced bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.EnforcePredownloadScans = enforced
+	}
+}
+
+func WithOpaqueErrors(opaque bool) AppOption {
+	return func(o *ApplicationConfig) {
+		o.OpaqueErrors = opaque
+	}
+}
+
 // ToConfigLoaderOptions returns a slice of ConfigLoader Option.
 // Some options defined at the application level are going to be passed as defaults for
 // all the configuration for the models.
@@ -289,6 +326,7 @@ func (o *ApplicationConfig) ToConfigLoaderOptions() []ConfigLoaderOption {
 		LoadOptionDebug(o.Debug),
 		LoadOptionF16(o.F16),
 		LoadOptionThreads(o.Threads),
+		ModelPath(o.ModelPath),
 	}
 }

--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -5,16 +5,25 @@ import (
 	"regexp"
 	"strings"

-	"github.com/go-skynet/LocalAI/core/schema"
-	"github.com/go-skynet/LocalAI/pkg/downloader"
-	"github.com/go-skynet/LocalAI/pkg/functions"
-	"github.com/go-skynet/LocalAI/pkg/utils"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/downloader"
+	"github.com/mudler/LocalAI/pkg/functions"
+	"github.com/mudler/LocalAI/pkg/utils"
 )

 const (
 	RAND_SEED = -1
 )

+type TTSConfig struct {
+
+	// Voice wav path or id
+	Voice string `yaml:"voice"`
+
+	// Vall-e-x
+	VallE VallE `yaml:"vall-e"`
+}
+
 type BackendConfig struct {
 	schema.PredictionOptions `yaml:"parameters"`
 	Name                     string `yaml:"name"`
@@ -27,9 +36,11 @@ type BackendConfig struct {
 	Backend        string            `yaml:"backend"`
 	TemplateConfig TemplateConfig    `yaml:"template"`

-	PromptStrings, InputStrings                []string `yaml:"-"`
-	InputToken                                 [][]int  `yaml:"-"`
-	functionCallString, functionCallNameString string   `yaml:"-"`
+	PromptStrings, InputStrings                []string               `yaml:"-"`
+	InputToken                                 [][]int                `yaml:"-"`
+	functionCallString, functionCallNameString string                 `yaml:"-"`
+	ResponseFormat                             string                 `yaml:"-"`
+	ResponseFormatMap                          map[string]interface{} `yaml:"-"`

 	FunctionsConfig functions.FunctionsConfig `yaml:"function"`

@@ -47,8 +58,8 @@ type BackendConfig struct {
 	// GRPC Options
 	GRPC GRPC `yaml:"grpc"`

-	// Vall-e-x
-	VallE VallE `yaml:"vall-e"`
+	// TTS specifics
+	TTSConfig `yaml:"tts"`

 	// CUDA
 	// Explicitly enable CUDA or not (some backends might need it)
@@ -357,6 +368,8 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	if debug {
 		cfg.Debug = &trueV
 	}
+
+	guessDefaultsFromFile(cfg, lo.modelPath)
 }

 func (c *BackendConfig) Validate() bool {
@@ -377,10 +390,6 @@ func (c *BackendConfig) Validate() bool {
 		}
 	}

-	if c.Name == "" {
-		return false
-	}
-
 	if c.Backend != "" {
 		// a regex that checks that is a string name with no special characters, except '-' and '_'
 		re := regexp.MustCompile(`^[a-zA-Z0-9-_]+$`)
@@ -389,3 +398,7 @@ func (c *BackendConfig) Validate() bool {

 	return true
 }
+
+func (c *BackendConfig) HasTemplate() bool {
+	return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != ""
+}
--- a/core/config/backend_config_loader.go
+++ b/core/config/backend_config_loader.go
@@ -11,25 +11,28 @@ import (
 	"sync"

 	"github.com/charmbracelet/glamour"
-	"github.com/go-skynet/LocalAI/core/schema"
-	"github.com/go-skynet/LocalAI/pkg/downloader"
-	"github.com/go-skynet/LocalAI/pkg/utils"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/downloader"
+	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/rs/zerolog/log"
 	"gopkg.in/yaml.v3"
 )

 type BackendConfigLoader struct {
-	configs map[string]BackendConfig
+	configs   map[string]BackendConfig
+	modelPath string
 	sync.Mutex
 }

-func NewBackendConfigLoader() *BackendConfigLoader {
+func NewBackendConfigLoader(modelPath string) *BackendConfigLoader {
 	return &BackendConfigLoader{
-		configs: make(map[string]BackendConfig),
+		configs:   make(map[string]BackendConfig),
+		modelPath: modelPath,
 	}
 }

 type LoadOptions struct {
+	modelPath        string
 	debug            bool
 	threads, ctxSize int
 	f16              bool
@@ -53,6 +56,12 @@ func LoadOptionContextSize(ctxSize int) ConfigLoaderOption {
 	}
 }

+func ModelPath(modelPath string) ConfigLoaderOption {
+	return func(o *LoadOptions) {
+		o.modelPath = modelPath
+	}
+}
+
 func LoadOptionF16(f16 bool) ConfigLoaderOption {
 	return func(o *LoadOptions) {
 		o.f16 = f16
--- a/core/config/backend_config_test.go
+++ b/core/config/backend_config_test.go
@@ -16,7 +16,8 @@ var _ = Describe("Test cases for config related functions", func() {
 			Expect(err).To(BeNil())
 			defer os.Remove(tmp.Name())
 			_, err = tmp.WriteString(
-				`backend: "foo-bar"
+				`backend: "../foo-bar"
+name: "foo"
 parameters:
  model: "foo-bar"`)
 			Expect(err).ToNot(HaveOccurred())
--- a/core/config/config_test.go
+++ b/core/config/config_test.go
@@ -26,7 +26,7 @@ var _ = Describe("Test cases for config related functions", func() {

 		It("Test LoadConfigs", func() {

-			bcl := NewBackendConfigLoader()
+			bcl := NewBackendConfigLoader(os.Getenv("MODELS_PATH"))
 			err := bcl.LoadBackendConfigsFromPath(os.Getenv("MODELS_PATH"))

 			Expect(err).To(BeNil())
--- a/Show More
+++ b/Show More