Ci testsg

feat(guesser): identify gemma models (#2561 )
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-07-07 06:49:49 -04:00 · 2024-06-13 23:34:11 +02:00 · 2024-06-13 19:12:37 +02:00 · 2024-06-13 16:12:46 +02:00 · 2024-06-13 08:34:32 +00:00 · 2024-06-13 01:05:24 +02:00
739 changed files with 100178 additions and 4536 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,6 +1,16 @@
-.git
 .idea
+.github
+.vscode
 models
 examples/chatbot-ui/models
 examples/rwkv/models
 examples/**/models
+Dockerfile*
+__pycache__
+
+# SonarQube
+.scannerwork
+
+# backend virtual environments
+**/venv
+backend/python/**/source
--- a/.editorconfig
+++ b/.editorconfig
@@ -0,0 +1,31 @@
+
+root = true
+
+[*]
+indent_style = space
+indent_size = 2
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.go]
+indent_style = tab
+
+[Makefile]
+indent_style = tab
+
+[*.proto]
+indent_size = 2
+
+[*.py]
+indent_size = 4
+
+[*.js]
+indent_size = 2
+
+[*.yaml]
+indent_size = 2
+
+[*.md]
+trim_trailing_whitespace = false
--- a/.env
+++ b/.env
@@ -1,30 +1,94 @@
 ## Set number of threads.
 ## Note: prefer the number of physical cores. Overbooking the CPU degrades performance notably.
-# THREADS=14
+# LOCALAI_THREADS=14

 ## Specify a different bind address (defaults to ":8080")
-# ADDRESS=127.0.0.1:8080
+# LOCALAI_ADDRESS=127.0.0.1:8080

 ## Default models context size
-# CONTEXT_SIZE=512
+# LOCALAI_CONTEXT_SIZE=512
+#
+## Define galleries.
+## models will to install will be visible in `/models/available`
+# LOCALAI_GALLERIES=[{"name":"localai", "url":"github:mudler/LocalAI/gallery/index.yaml@master"}]
+
+## CORS settings
+# LOCALAI_CORS=true
+# LOCALAI_CORS_ALLOW_ORIGINS=*

 ## Default path for models
-MODELS_PATH=/models
+#
+# LOCALAI_MODELS_PATH=/models

 ## Enable debug mode
-# DEBUG=true
+# LOCALAI_LOG_LEVEL=debug

-## Specify a build type. Available: cublas, openblas.
+## Disables COMPEL (Diffusers)
+# COMPEL=0
+
+## Enable/Disable single backend (useful if only one GPU is available)
+# LOCALAI_SINGLE_ACTIVE_BACKEND=true
+
+## Specify a build type. Available: cublas, openblas, clblas.
+## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
+## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
+## clBLAS:   This is an open-source implementation of the BLAS library that uses OpenCL, a framework for writing programs that execute across heterogeneous platforms consisting of CPUs, GPUs, and other processors. clBLAS is designed to take advantage of the parallel computing power of GPUs but can also run on any hardware that supports OpenCL. This includes hardware from different vendors like Nvidia, AMD, and Intel.
 # BUILD_TYPE=openblas

-## Uncomment and set to false to disable rebuilding from source
-# REBUILD=false
+## Uncomment and set to true to enable rebuilding from source
+# REBUILD=true

-## Enable image generation with stablediffusion (requires REBUILD=true)
+## Enable go tags, available: stablediffusion, tts
+## stablediffusion: image generation with stablediffusion
+## tts: enables text-to-speech with go-piper 
+## (requires REBUILD=true)
+#
 # GO_TAGS=stablediffusion

 ## Path where to store generated images
-# IMAGE_PATH=/tmp
+# LOCALAI_IMAGE_PATH=/tmp/generated/images

 ## Specify a default upload limit in MB (whisper)
-# UPLOAD_LIMIT
+# LOCALAI_UPLOAD_LIMIT=15
+
+## List of external GRPC backends (note on the container image this variable is already set to use extra backends available in extra/)
+# LOCALAI_EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+
+### Advanced settings ###
+### Those are not really used by LocalAI, but from components in the stack ###
+##
+### Preload libraries
+# LD_PRELOAD=
+
+### Huggingface cache for models
+# HUGGINGFACE_HUB_CACHE=/usr/local/huggingface
+
+### Python backends GRPC max workers
+### Default number of workers for GRPC Python backends.
+### This actually controls wether a backend can process multiple requests or not.
+# PYTHON_GRPC_MAX_WORKERS=1
+
+### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
+# LLAMACPP_PARALLEL=1
+
+### Define a list of GRPC Servers for llama-cpp workers to distribute the load
+# https://github.com/ggerganov/llama.cpp/pull/6829
+# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
+# LLAMACPP_GRPC_SERVERS=""
+
+### Enable to run parallel requests
+# LOCALAI_PARALLEL_REQUESTS=true
+
+### Watchdog settings
+###
+# Enables watchdog to kill backends that are inactive for too much time
+# LOCALAI_WATCHDOG_IDLE=true
+#
+# Time in duration format (e.g. 1h30m) after which a backend is considered idle
+# LOCALAI_WATCHDOG_IDLE_TIMEOUT=5m
+#
+# Enables watchdog to kill backends that are busy for too much time
+# LOCALAI_WATCHDOG_BUSY=true
+#
+# Time in duration format (e.g. 1h30m) after which a backend is considered busy
+# LOCALAI_WATCHDOG_BUSY_TIMEOUT=5m
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.sh text eol=lf
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -0,0 +1,5 @@
+# These are supported funding model platforms
+
+github: [mudler]
+custom: 
+- https://www.buymeacoffee.com/mudler
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -2,9 +2,7 @@
 name: Bug report
 about: Create a report to help us improve
 title: ''
-labels: bug
-assignees: mudler
-
+labels: bug, unconfirmed, up-for-grabs
 ---

 <!-- Thanks for helping us to improve LocalAI! We welcome all bug reports. Please fill out each area of the template so we can better help you. Comments like this will be hidden when you post but you can delete them if you wish. -->
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -2,9 +2,7 @@
 name: Feature request
 about: Suggest an idea for this project
 title: ''
-labels: enhancement
-assignees: mudler
-
+labels: enhancement, up-for-grabs
 ---

 <!-- Thanks for helping us to improve LocalAI! We welcome all feature requests. Please fill out each area of the template so we can better help you. Comments like this will be hidden when you post but you can delete them if you wish. -->
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -8,16 +8,24 @@ This PR fixes #
 **[Signed commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin)**
 - [ ] Yes, I signed my commits.
 
-
 <!--
 Thank you for contributing to LocalAI! 

-Contributing Conventions:
+Contributing Conventions
+-------------------------

-1. Include descriptive PR titles with [<component-name>] prepended.
-2. Build and test your changes before submitting a PR. 
+The draft above helps to give a quick overview of your PR.
+
+Remember to remove this comment and to at least:
+
+1. Include descriptive PR titles with [<component-name>] prepended. We use [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/).
+2. Build and test your changes before submitting a PR (`make build`). 
 3. Sign your commits
+4. **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below).
+5. **X/Twitter handle:** we announce bigger features on X/Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out!

 By following the community's contribution conventions upfront, the review process will 
 be accelerated and your PR merged more quickly.
+
+If no one reviews your PR within a few days, please @-mention @mudler.
 -->
--- a/.github/bump_docs.sh
+++ b/.github/bump_docs.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -xe
+REPO=$1
+
+LATEST_TAG=$(curl -s "https://api.github.com/repos/$REPO/releases/latest" | jq -r '.tag_name')
+
+cat <<< $(jq ".version = \"$LATEST_TAG\"" docs/data/version.json) > docs/data/version.json
--- a/.github/checksum_checker.sh
+++ b/.github/checksum_checker.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# This scripts needs yq and huggingface_hub to be installed
+# to install hugingface_hub run pip install huggingface_hub
+
+# Path to the input YAML file
+input_yaml=$1
+
+# Function to download file and check checksum using Python
+function check_and_update_checksum() {
+    model_name="$1"
+    file_name="$2"
+    uri="$3"
+    old_checksum="$4"
+    idx="$5"
+
+    # Download the file and calculate new checksum using Python
+    new_checksum=$(python3 -c "
+import hashlib
+from huggingface_hub import hf_hub_download, get_paths_info
+import requests
+import sys
+import os
+
+uri = '$uri'
+file_name = uri.split('/')[-1]
+
+# Function to parse the URI and determine download method
+# Function to parse the URI and determine download method
+def parse_uri(uri):
+    if uri.startswith('huggingface://'):
+        repo_id = uri.split('://')[1]
+        return 'huggingface', repo_id.rsplit('/', 1)[0]
+    elif 'huggingface.co' in uri:
+        parts = uri.split('/resolve/')
+        if len(parts) > 1:
+            repo_path = parts[0].split('https://huggingface.co/')[-1]
+            return 'huggingface', repo_path
+    return 'direct', uri
+
+def calculate_sha256(file_path):
+    sha256_hash = hashlib.sha256()
+    with open(file_path, 'rb') as f:
+        for byte_block in iter(lambda: f.read(4096), b''):
+            sha256_hash.update(byte_block)
+    return sha256_hash.hexdigest()
+
+download_type, repo_id_or_url = parse_uri(uri)
+
+new_checksum =  None
+
+# Decide download method based on URI type
+if download_type == 'huggingface':
+    # Use HF API to pull sha
+    for file in get_paths_info(repo_id_or_url, [file_name], repo_type='model'):
+        try:
+            new_checksum = file.lfs.sha256
+            break
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+    if new_checksum is None:
+        try:
+            file_path = hf_hub_download(repo_id=repo_id_or_url, filename=file_name)
+        except Exception as e:
+            print(f'Error from Hugging Face Hub: {str(e)}', file=sys.stderr)
+            sys.exit(2)
+else:
+    response = requests.get(repo_id_or_url)
+    if response.status_code == 200:
+        with open(file_name, 'wb') as f:
+            f.write(response.content)
+        file_path = file_name
+    elif response.status_code == 404:
+        print(f'File not found: {response.status_code}', file=sys.stderr)
+        sys.exit(2)
+    else:
+        print(f'Error downloading file: {response.status_code}', file=sys.stderr)
+        sys.exit(1)
+
+if new_checksum is None:
+    new_checksum = calculate_sha256(file_path)
+    print(new_checksum)
+    os.remove(file_path)
+else:
+    print(new_checksum)
+
+")
+
+    if [[ "$new_checksum" == "" ]]; then
+        echo "Error calculating checksum for $file_name. Skipping..."
+        return
+    fi
+
+    echo "Checksum for $file_name: $new_checksum"
+
+    # Compare and update the YAML file if checksums do not match
+    result=$?
+    if [[ $result -eq 2 ]]; then
+        echo "File not found, deleting entry for $file_name..."
+        # yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\"))" "$input_yaml"
+    elif [[ "$old_checksum" != "$new_checksum" ]]; then
+        echo "Checksum mismatch for $file_name. Updating..."
+        yq eval -i "del(.[$idx].files[] | select(.filename == \"$file_name\").sha256)" "$input_yaml"
+        yq eval -i "(.[$idx].files[] | select(.filename == \"$file_name\")).sha256 = \"$new_checksum\"" "$input_yaml"
+    elif [[ $result -ne 0 ]]; then
+        echo "Error downloading file $file_name. Skipping..."
+    else
+        echo "Checksum match for $file_name. No update needed."
+    fi
+}
+
+# Read the YAML and process each file
+len=$(yq eval '. | length' "$input_yaml")
+for ((i=0; i<$len; i++))
+do
+    name=$(yq eval ".[$i].name" "$input_yaml")
+    files_len=$(yq eval ".[$i].files | length" "$input_yaml")
+    for ((j=0; j<$files_len; j++))
+    do
+        filename=$(yq eval ".[$i].files[$j].filename" "$input_yaml")
+        uri=$(yq eval ".[$i].files[$j].uri" "$input_yaml")
+        checksum=$(yq eval ".[$i].files[$j].sha256" "$input_yaml")
+        echo "Checking model $name, file $filename. URI = $uri, Checksum = $checksum"
+        check_and_update_checksum "$name" "$filename" "$uri" "$checksum" "$i"
+    done
+done
--- a/.github/ci/modelslist.go
+++ b/.github/ci/modelslist.go
@@ -0,0 +1,297 @@
+package main
+
+import (
+	"fmt"
+	"html/template"
+	"io/ioutil"
+	"os"
+
+	"gopkg.in/yaml.v3"
+)
+
+var modelPageTemplate string = `
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>LocalAI models</title>
+    <link href="https://cdnjs.cloudflare.com/ajax/libs/flowbite/2.3.0/flowbite.min.css" rel="stylesheet" />
+    <script src="https://cdn.jsdelivr.net/npm/vanilla-lazyload@19.1.3/dist/lazyload.min.js"></script>
+
+    <link
+    rel="stylesheet"
+    href="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.8.0/build/styles/default.min.css"
+  />
+    <script
+    defer
+    src="https://cdn.jsdelivr.net/gh/highlightjs/cdn-release@11.8.0/build/highlight.min.js"
+  ></script>
+    <script
+    defer
+    src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"
+  ></script>
+  <script
+    defer
+    src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"
+  ></script>
+  <script
+    defer
+    src="https://cdn.jsdelivr.net/npm/dompurify@3.0.6/dist/purify.min.js"
+  ></script>
+
+  <link href="/static/general.css" rel="stylesheet" />
+    <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700&family=Roboto:wght@400;500&display=swap" rel="stylesheet">
+    <link
+    href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700,900&display=swap"
+    rel="stylesheet" />
+  <link
+    rel="stylesheet"
+    href="https://cdn.jsdelivr.net/npm/tw-elements/css/tw-elements.min.css" />
+  <script src="https://cdn.tailwindcss.com/3.3.0"></script>
+  <script>
+    tailwind.config = {
+      darkMode: "class",
+      theme: {
+        fontFamily: {
+          sans: ["Roboto", "sans-serif"],
+          body: ["Roboto", "sans-serif"],
+          mono: ["ui-monospace", "monospace"],
+        },
+      },
+      corePlugins: {
+        preflight: false,
+      },
+    };
+  </script>
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.1.1/css/all.min.css">
+    <script src="https://unpkg.com/htmx.org@1.9.12" integrity="sha384-ujb1lZYygJmzgSwoxRggbCHcjc0rB2XoQrxeTUQyRjrOnlCoYta87iKBWq3EsdM2" crossorigin="anonymous"></script>
+</head>
+
+<body class="bg-gray-900 text-gray-200">
+<div class="flex flex-col min-h-screen">
+
+<nav class="bg-gray-800 shadow-lg">
+    <div class="container mx-auto px-4 py-4">
+        <div class="flex items-center justify-between">
+            <div class="flex items-center">
+                <a href="/" class="text-white text-xl font-bold"><img src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd" alt="LocalAI Logo" class="h-10 mr-3 border-2 border-gray-300 shadow rounded"></a>
+                <a href="/" class="text-white text-xl font-bold">LocalAI</a>
+            </div>
+            <!-- Menu button for small screens -->
+            <div class="lg:hidden">
+                <button id="menu-toggle" class="text-gray-400 hover:text-white focus:outline-none">
+                    <i class="fas fa-bars fa-lg"></i>
+                </button>
+            </div>
+            <!-- Navigation links -->
+            <div class="hidden lg:flex lg:items-center lg:justify-end lg:flex-1 lg:w-0">
+                <a href="https://localai.io" class="text-gray-400 hover:text-white px-3 py-2 rounded" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
+            </div>
+        </div>
+        <!-- Collapsible menu for small screens -->
+        <div class="hidden lg:hidden" id="mobile-menu">
+            <div class="pt-4 pb-3 border-t border-gray-700">
+                
+                <a href="https://localai.io" class="block text-gray-400 hover:text-white px-3 py-2 rounded mt-1" target="_blank" ><i class="fas fa-book-reader pr-2"></i> Documentation</a>
+               
+            </div>
+        </div>
+    </div>
+</nav>
+
+<style>
+  .is-hidden {
+	display: none;
+	  }
+</style>
+
+<div class="container mx-auto px-4 flex-grow">
+
+<div class="models mt-12">
+	<h2 class="text-center text-3xl font-semibold text-gray-100">
+	LocalAI model gallery list </h2><br>
+
+	<h2 class="text-center text-3xl font-semibold text-gray-100">
+
+	 🖼️ Available {{.AvailableModels}} models</i> repositories     <a href="https://localai.io/models/" target="_blank" >
+			<i class="fas fa-circle-info pr-2"></i>
+		</a></h2> 
+
+	<h3>	  
+	Refer to <a href="https://localai.io/models" target=_blank> Model gallery</a> for more information on how to use the models with LocalAI.
+
+	You can install models with the CLI command <code>local-ai models install <model-name></code>. or by using the WebUI.
+	</h3>
+  
+	<input class="form-control appearance-none block w-full mt-5 px-3 py-2 text-base font-normal text-gray-300 pb-2 mb-5 bg-gray-800 bg-clip-padding border border-solid border-gray-600 rounded transition ease-in-out m-0 focus:text-gray-300 focus:bg-gray-900 focus:border-blue-500 focus:outline-none" type="search" 
+	id="searchbox" placeholder="Live search keyword..">
+	  <div class="dark grid grid-cols-1 grid-rows-1 md:grid-cols-3 block rounded-lg shadow-secondary-1 dark:bg-surface-dark">
+		{{ range $_, $model := .Models }}
+		<div class="box me-4 mb-2 block rounded-lg bg-white shadow-secondary-1  dark:bg-gray-800 dark:bg-surface-dark dark:text-white text-surface pb-2">
+		<div>
+		    {{ $icon := "https://upload.wikimedia.org/wikipedia/commons/6/65/No-Image-Placeholder.svg" }}
+			{{ if $model.Icon }}
+	  		{{ $icon = $model.Icon }}
+	  		{{ end }}
+			<div class="flex justify-center items-center">
+				<img data-src="{{ $icon }}" alt="{{$model.Name}}" class="rounded-t-lg max-h-48 max-w-96 object-cover mt-3 lazy">
+			</div>
+	  		<div class="p-6 text-surface dark:text-white">
+				<h5 class="mb-2 text-xl font-medium leading-tight">{{$model.Name}}</h5>
+				
+				   
+				<p class="mb-4 text-base truncate">{{ $model.Description }}</p>
+		
+			</div>
+			<div class="px-6 pt-4 pb-2">
+
+      <!-- Modal toggle -->
+      <button data-modal-target="{{ $model.Name}}-modal" data-modal-toggle="{{ $model.Name }}-modal" class="block text-white bg-blue-700 hover:bg-blue-800 focus:ring-4 focus:outline-none focus:ring-blue-300 font-medium rounded-lg text-sm px-5 py-2.5 text-center dark:bg-blue-600 dark:hover:bg-blue-700 dark:focus:ring-blue-800" type="button">
+        More info
+      </button>
+
+    <!-- Main modal -->
+    <div id="{{ $model.Name}}-modal" tabindex="-1" aria-hidden="true" class="hidden overflow-y-auto overflow-x-hidden fixed top-0 right-0 left-0 z-50 justify-center items-center w-full md:inset-0 h-[calc(100%-1rem)] max-h-full">
+        <div class="relative p-4 w-full max-w-2xl max-h-full">
+            <!-- Modal content -->
+            <div class="relative bg-white rounded-lg shadow dark:bg-gray-700">
+                <!-- Modal header -->
+                <div class="flex items-center justify-between p-4 md:p-5 border-b rounded-t dark:border-gray-600">
+                    <h3 class="text-xl font-semibold text-gray-900 dark:text-white">
+                        {{ $model.Name}}
+                    </h3>
+                    <button type="button" class="text-gray-400 bg-transparent hover:bg-gray-200 hover:text-gray-900 rounded-lg text-sm w-8 h-8 ms-auto inline-flex justify-center items-center dark:hover:bg-gray-600 dark:hover:text-white" data-modal-hide="{{$model.Name}}-modal">
+                        <svg class="w-3 h-3" aria-hidden="true" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 14 14">
+                            <path stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="m1 1 6 6m0 0 6 6M7 7l6-6M7 7l-6 6"/>
+                        </svg>
+                        <span class="sr-only">Close modal</span>
+                    </button>
+                </div>
+                <!-- Modal body -->
+                <div class="p-4 md:p-5 space-y-4">
+                    <div class="flex justify-center items-center">
+                    <img data-src="{{ $icon }}" alt="{{$model.Name}}" class="lazy rounded-t-lg max-h-48 max-w-96 object-cover mt-3">
+                  </div>
+
+                    <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
+                    {{ $model.Description }}
+
+                    </p>
+                    
+                    <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
+                    To install the model with the CLI, run: <br>
+                    <code> local-ai models install {{$model.Name}} </code> <br>
+
+                    <hr>
+                    See also <a href="https://localai.io/models/" target="_blank" >
+                    Installation <i class="fas fa-circle-info pr-2"></i>
+                    </a> to see how to install models with the REST API.
+                    </p>
+
+                    <p class="text-base leading-relaxed text-gray-500 dark:text-gray-400">
+                    <ul>
+                    {{ range $_, $u := $model.URLs }}
+                    <li><a href="{{ $u }}" target=_blank><i class="fa-solid fa-link"></i> {{ $u }}</a></li>
+                    {{ end }}  
+                    </ul>
+                    </p>
+                </div>
+                <!-- Modal footer -->
+                <div class="flex items-center p-4 md:p-5 border-t border-gray-200 rounded-b dark:border-gray-600">
+                    <button data-modal-hide="{{ $model.Name}}-modal" type="button" class="py-2.5 px-5 ms-3 text-sm font-medium text-gray-900 focus:outline-none bg-white rounded-lg border border-gray-200 hover:bg-gray-100 hover:text-blue-700 focus:z-10 focus:ring-4 focus:ring-gray-100 dark:focus:ring-gray-700 dark:bg-gray-800 dark:text-gray-400 dark:border-gray-600 dark:hover:text-white dark:hover:bg-gray-700">Close</button>
+                </div>
+            </div>
+        </div>
+    </div>
+
+
+			</div>
+		</div>
+		</div>
+		{{ end }}      
+
+		</div>
+  </div>
+</div>
+
+<script>
+var lazyLoadInstance = new LazyLoad({
+  // Your custom settings go here
+});
+
+let cards = document.querySelectorAll('.box')
+    
+function liveSearch() {
+    let search_query = document.getElementById("searchbox").value;
+    
+    //Use innerText if all contents are visible
+    //Use textContent for including hidden elements
+    for (var i = 0; i < cards.length; i++) {
+        if(cards[i].textContent.toLowerCase()
+                .includes(search_query.toLowerCase())) {
+            cards[i].classList.remove("is-hidden");
+        } else {
+            cards[i].classList.add("is-hidden");
+        }
+    }
+}
+
+//A little delay
+let typingTimer;               
+let typeInterval = 500;  
+let searchInput = document.getElementById('searchbox');
+
+searchInput.addEventListener('keyup', () => {
+    clearTimeout(typingTimer);
+    typingTimer = setTimeout(liveSearch, typeInterval);
+});
+</script>
+
+</div>
+
+<script src="https://cdnjs.cloudflare.com/ajax/libs/flowbite/2.3.0/flowbite.min.js"></script>
+</body>
+</html>
+`
+
+type GalleryModel struct {
+	Name        string   `json:"name" yaml:"name"`
+	URLs        []string `json:"urls" yaml:"urls"`
+	Icon        string   `json:"icon" yaml:"icon"`
+	Description string   `json:"description" yaml:"description"`
+}
+
+func main() {
+	// read the YAML file which contains the models
+
+	f, err := ioutil.ReadFile(os.Args[1])
+	if err != nil {
+		fmt.Println("Error reading file:", err)
+		return
+	}
+
+	models := []*GalleryModel{}
+	err = yaml.Unmarshal(f, &models)
+	if err != nil {
+		// write to stderr
+		os.Stderr.WriteString("Error unmarshaling YAML: " + err.Error() + "\n")
+		return
+	}
+
+	// render the template
+	data := struct {
+		Models          []*GalleryModel
+		AvailableModels int
+	}{
+		Models:          models,
+		AvailableModels: len(models),
+	}
+	tmpl := template.Must(template.New("modelPage").Parse(modelPageTemplate))
+
+	err = tmpl.Execute(os.Stdout, data)
+	if err != nil {
+		fmt.Println("Error executing template:", err)
+		return
+	}
+}
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -0,0 +1,25 @@
+# https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+version: 2
+updates:
+  - package-ecosystem: "gomod"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+  - package-ecosystem: "github-actions"
+    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
+    directory: "/"
+    schedule:
+      # Check for updates to GitHub Actions every weekday
+      interval: "weekly"
+  - package-ecosystem: "pip"
+    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
+    directory: "/"
+    schedule:
+      # Check for updates to GitHub Actions every weekday
+      interval: "weekly"
+  - package-ecosystem: "docker"
+    # Workflow files stored in the default location of `.github/workflows`. (You don't need to specify `/.github/workflows` for `directory`. You can use `directory: "/"`.)
+    directory: "/"
+    schedule:
+      # Check for updates to GitHub Actions every weekday
+      interval: "weekly"
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -0,0 +1,24 @@
+enhancements:
+ - head-branch: ['^feature', 'feature']
+
+kind/documentation:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: 'docs/*'
+  - changed-files:
+    - any-glob-to-any-file: '*.md'
+
+area/ai-model:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: 'gallery/*'
+
+examples:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: 'examples/*'
+
+ci:
+- any:
+  - changed-files:
+    - any-glob-to-any-file: '.github/*'
--- a/.github/release.yml
+++ b/.github/release.yml
@@ -12,13 +12,23 @@ changelog:
    - title: "Bug fixes :bug:"
      labels:
        - bug
+        - regression
    - title: Exciting New Features 🎉
      labels:
        - Semver-Minor
        - enhancement
+        - ux
+        - roadmap
+    - title: 🧠 Models
+      labels:
+        - area/ai-model
+    - title: 📖 Documentation and examples
+      labels:
+        - kind/documentation
+        - examples
    - title: 👒 Dependencies
      labels:
        - dependencies
    - title: Other Changes
      labels:
-        - "*"
+        - "*"
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -12,6 +12,9 @@ jobs:
          - repository: "go-skynet/go-llama.cpp"
            variable: "GOLLAMA_VERSION"
            branch: "master"
+          - repository: "ggerganov/llama.cpp"
+            variable: "CPPLLAMA_VERSION"
+            branch: "master"
          - repository: "go-skynet/go-ggml-transformers.cpp"
            variable: "GOGGMLTRANSFORMERS_VERSION"
            branch: "master"
@@ -30,14 +33,23 @@ jobs:
          - repository: "nomic-ai/gpt4all"
            variable: "GPT4ALL_VERSION"
            branch: "main"
+          - repository: "mudler/go-ggllm.cpp"
+            variable: "GOGGLLM_VERSION"
+            branch: "master"
+          - repository: "mudler/go-stable-diffusion"
+            variable: "STABLEDIFFUSION_VERSION"
+            branch: "master"
+          - repository: "mudler/go-piper"
+            variable: "PIPER_VERSION"
+            branch: "master"
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
      - name: Bump dependencies 🔧
        run: |
          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v5
+        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.UPDATE_BOT_TOKEN }}
          push-to-fork: ci-forks/LocalAI
--- a/.github/workflows/bump_docs.yaml
+++ b/.github/workflows/bump_docs.yaml
@@ -0,0 +1,31 @@
+name: Bump dependencies
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  bump:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: "mudler/LocalAI"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Bump dependencies 🔧
+        run: |
+          bash .github/bump_docs.sh ${{ matrix.repository }}
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: ':arrow_up: Update docs version ${{ matrix.repository }}'
+          title: ':arrow_up: Update docs version ${{ matrix.repository }}'
+          branch: "update/docs"
+          body: Bump of ${{ matrix.repository }} version inside docs
+          signoff: true
+
+
+
--- a/.github/workflows/checksum_checker.yaml
+++ b/.github/workflows/checksum_checker.yaml
@@ -0,0 +1,47 @@
+name: Check if checksums are up-to-date
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  checksum_check:
+    runs-on: arc-runner-set
+    steps:
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
+      - uses: actions/checkout@v4
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y pip wget
+          sudo pip install --upgrade pip 
+          pip install huggingface_hub
+      - name: 'Setup yq'
+        uses: dcarbone/install-yq-action@v1.1.1
+        with:
+          version: 'v4.43.1'
+          download-compressed: true
+          force: true
+
+      - name: Checksum checker 🔧
+        run: |
+          export HF_HOME=/hf_cache
+          sudo mkdir /hf_cache
+          sudo chmod 777 /hf_cache
+          bash .github/checksum_checker.sh gallery/index.yaml
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: ':arrow_up: Checksum updates in gallery/index.yaml'
+          title: 'models(gallery): :arrow_up: update checksum'
+          branch: "update/checksum"
+          body: Updating checksums in gallery/index.yaml
+          signoff: true
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -0,0 +1,43 @@
+name: Dependabot auto-merge
+on:
+- pull_request_target
+
+permissions:
+  contents: write
+  pull-requests: write
+  packages: read
+
+jobs:
+  dependabot:
+    runs-on: ubuntu-latest
+    if: ${{ github.actor == 'dependabot[bot]' }}
+    steps:
+      - name: Dependabot metadata
+        id: metadata
+        uses: dependabot/fetch-metadata@v2.1.0
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          skip-commit-verification: true
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Approve a PR if not already approved
+        run: |
+          gh pr checkout "$PR_URL"
+            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
+          then
+            gh pr review --approve "$PR_URL"
+          else
+            echo "PR already approved.";
+          fi
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
+
+      - name: Enable auto-merge for Dependabot PRs
+        if: ${{ contains(github.event.pull_request.title, 'bump')}}
+        run: gh pr merge --auto --squash "$PR_URL"
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/disabled/test-gpu.yml
+++ b/.github/workflows/disabled/test-gpu.yml
@@ -0,0 +1,63 @@
+---
+name: 'GPU tests'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-gpu-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-latest:
+    runs-on: gpu
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y make wget
+      - name: Build
+        run: |
+          if [ ! -e /run/systemd/system ]; then
+            sudo mkdir /run/systemd/system
+          fi
+          sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
+          sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            BUILD_TYPE=cublas \
+            prepare-e2e run-e2e-image test-e2e
+      - name: Release space from worker ♻
+        if: always()
+        run: |
+          sudo rm -rf build || true
+          sudo rm -rf bin || true
+          sudo rm -rf dist || true
+          sudo docker logs $(sudo docker ps -q --filter ancestor=localai-tests) > logs.txt
+          sudo cat logs.txt || true
+          sudo rm -rf logs.txt
+          make clean || true
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            teardown-e2e || true
+          sudo rm -rf /host/tests/${{ github.head_ref || github.ref }} || true
+          docker system prune -f -a --volumes || true
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -0,0 +1,94 @@
+name: 'generate and publish GRPC docker caches'
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+
+concurrency:
+  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  generate_caches:
+    strategy:
+      matrix:
+        include:
+          - grpc-base-image: ubuntu:22.04
+            runs-on: 'ubuntu-latest'
+            platforms: 'linux/amd64,linux/arm64'
+    runs-on: ${{matrix.runs-on}}
+    steps:
+      - name: Release space from worker
+        if: matrix.runs-on == 'ubuntu-latest'
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get remove -y microsoft-edge-stable || true
+          sudo apt-get remove -y firefox || true
+          sudo apt-get remove -y powershell || true
+          sudo apt-get remove -y r-base-core || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf "/usr/local/share/boost" || true
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
+          df -h
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache GRPC
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          build-args: |
+            GRPC_BASE_IMAGE=${{ matrix.grpc-base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.64.0
+          context: .
+          file: ./Dockerfile
+          cache-to: type=gha,ignore-error=true
+          cache-from: type=gha
+          target: grpc
+          platforms: ${{ matrix.platforms }}
+          push: false
--- a/.github/workflows/generate_intel_image.yaml
+++ b/.github/workflows/generate_intel_image.yaml
@@ -0,0 +1,59 @@
+name: 'generate and publish intel docker caches'
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - master
+
+concurrency:
+  group: intel-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  generate_caches:
+    strategy:
+      matrix:
+        include:
+          - base-image: intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
+            runs-on: 'ubuntu-latest'
+            platforms: 'linux/amd64'
+    runs-on: ${{matrix.runs-on}}
+    steps:
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+      - name: Login to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+
+      - name: Login to quay
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Cache Intel images
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BASE_IMAGE=${{ matrix.base-image }}
+          context: .
+          file: ./Dockerfile
+          tags: quay.io/go-skynet/intel-oneapi-base:latest
+          push: true
+          target: intel
+          platforms: ${{ matrix.platforms }}
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -0,0 +1,130 @@
+---
+name: 'build container images tests'
+
+on:
+  pull_request:
+
+concurrency:
+  group: ci-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  extras-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      makeflags: ${{ matrix.makeflags }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      # Pushing with all jobs in parallel
+      # eats the bandwidth of all the nodes
+      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: 'sycl-f16-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+  core-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      makeflags: ${{ matrix.makeflags }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      matrix:
+        include:
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: 'sycl-f16-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -2,7 +2,6 @@
 name: 'build container images'

 on:
-  pull_request:
  push:
    branches:
      - master
@@ -14,96 +13,305 @@ concurrency:
  cancel-in-progress: true

 jobs:
-  docker:
+  self-hosted-jobs:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      aio: ${{ matrix.aio }}
+      makeflags: ${{ matrix.makeflags }}
+      latest-image: ${{ matrix.latest-image }}
+      latest-image-aio: ${{ matrix.latest-image-aio }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+    strategy:
+      # Pushing with all jobs in parallel
+      # eats the bandwidth of all the nodes
+      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
+      matrix:
+        include:
+          # Extra images
+          - build-type: ''
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: ''
+            ffmpeg: ''
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda11'
+            ffmpeg: ''
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12'
+            ffmpeg: ''
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-cublas-cuda11-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            aio: "-aio-gpu-nvidia-cuda-11"
+            latest-image: 'latest-gpu-nvidia-cuda-11'
+            latest-image-aio: 'latest-aio-gpu-nvidia-cuda-11'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-cublas-cuda12-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            base-image: "ubuntu:22.04"
+            aio: "-aio-gpu-nvidia-cuda-12"
+            latest-image: 'latest-gpu-nvidia-cuda-12'
+            latest-image-aio: 'latest-aio-gpu-nvidia-cuda-12'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: ''
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: ''
+            ffmpeg: ''
+            image-type: 'extras'
+            base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: '-hipblas-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            aio: "-aio-gpu-hipblas"
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            latest-image: 'latest-gpu-hipblas'
+            latest-image-aio: 'latest-aio-gpu-hipblas'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas'
+            ffmpeg: 'false'
+            image-type: 'extras'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: '-sycl-f16-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            aio: "-aio-gpu-intel-f16"
+            latest-image: 'latest-gpu-intel-f16'
+            latest-image-aio: 'latest-aio-gpu-intel-f16'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl_f32'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: '-sycl-f32-ffmpeg'
+            ffmpeg: 'true'
+            image-type: 'extras'
+            runs-on: 'arc-runner-set'
+            aio: "-aio-gpu-intel-f32"
+            latest-image: 'latest-gpu-intel-f32'
+            latest-image-aio: 'latest-aio-gpu-intel-f32'
+            makeflags: "--jobs=3 --output-sync=target"
+          # Core images
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: '-sycl-f16-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl_f32'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: '-sycl-f32-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl_f16'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: '-sycl-f16-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'sycl_f32'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            base-image: "quay.io/go-skynet/intel-oneapi-base:latest"
+            grpc-base-image: "ubuntu:22.04"
+            tag-suffix: '-sycl-f32-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+          - build-type: 'hipblas'
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-hipblas-core'
+            ffmpeg: 'false'
+            image-type: 'core'
+            base-image: "rocm/dev-ubuntu-22.04:6.1"
+            grpc-base-image: "ubuntu:22.04"
+            runs-on: 'arc-runner-set'
+            makeflags: "--jobs=3 --output-sync=target"
+  
+  core-image-build:
+    uses: ./.github/workflows/image_build.yml
+    with:
+      tag-latest: ${{ matrix.tag-latest }}
+      tag-suffix: ${{ matrix.tag-suffix }}
+      ffmpeg: ${{ matrix.ffmpeg }}
+      image-type: ${{ matrix.image-type }}
+      build-type: ${{ matrix.build-type }}
+      cuda-major-version: ${{ matrix.cuda-major-version }}
+      cuda-minor-version: ${{ matrix.cuda-minor-version }}
+      platforms: ${{ matrix.platforms }}
+      runs-on: ${{ matrix.runs-on }}
+      aio: ${{ matrix.aio }}
+      base-image: ${{ matrix.base-image }}
+      grpc-base-image: ${{ matrix.grpc-base-image }}
+      makeflags: ${{ matrix.makeflags }}
+      latest-image: ${{ matrix.latest-image }}
+      latest-image-aio: ${{ matrix.latest-image-aio }}
+    secrets:
+      dockerUsername: ${{ secrets.DOCKERHUB_USERNAME }}
+      dockerPassword: ${{ secrets.DOCKERHUB_PASSWORD }}
+      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
    strategy:
      matrix:
        include:
          - build-type: ''
            platforms: 'linux/amd64,linux/arm64'
            tag-latest: 'auto'
-            tag-suffix: ''
-            ffmpeg: ''
-          - build-type: 'cublas'
-            cuda-major-version: 11
-            cuda-minor-version: 7
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11'
-            ffmpeg: ''
-          - build-type: 'cublas'
-            cuda-major-version: 12
-            cuda-minor-version: 1
-            platforms: 'linux/amd64'
-            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12'
-            ffmpeg: ''
-          - build-type: ''
-            platforms: 'linux/amd64,linux/arm64'
-            tag-latest: 'false'
-            tag-suffix: '-ffmpeg'
+            tag-suffix: '-ffmpeg-core'
            ffmpeg: 'true'
+            image-type: 'core'
+            base-image: "ubuntu:22.04"
+            runs-on: 'ubuntu-latest'
+            aio: "-aio-cpu"
+            latest-image: 'latest-cpu'
+            latest-image-aio: 'latest-aio-cpu'
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
-            cuda-major-version: 11
-            cuda-minor-version: 7
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda11-ffmpeg'
-            ffmpeg: 'true'
+            tag-suffix: '-cublas-cuda11-core'
+            ffmpeg: ''
+            image-type: 'core'
+            base-image: "ubuntu:22.04"
+            runs-on: 'ubuntu-latest'
+            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
-            cuda-major-version: 12
-            cuda-minor-version: 1
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
            platforms: 'linux/amd64'
            tag-latest: 'false'
-            tag-suffix: '-cublas-cuda12-ffmpeg'
+            tag-suffix: '-cublas-cuda12-core'
+            ffmpeg: ''
+            image-type: 'core'
+            base-image: "ubuntu:22.04"
+            runs-on: 'ubuntu-latest'
+            makeflags: "--jobs=4 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "11"
+            cuda-minor-version: "7"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda11-ffmpeg-core'
            ffmpeg: 'true'
-
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-
-      - name: Docker meta
-        id: meta
-        uses: docker/metadata-action@v4
-        with:
-          images: quay.io/go-skynet/local-ai
-          tags: |
-            type=ref,event=branch
-            type=semver,pattern={{raw}}
-            type=sha
-          flavor: |
-            latest=${{ matrix.tag-latest }}
-            suffix=${{ matrix.tag-suffix }}
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@master
-        with:
-          platforms: all
-
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@master
-
-      - name: Login to DockerHub
-        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
-        with:
-          registry: quay.io
-          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
-          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
-
-      - name: Build and push
-        uses: docker/build-push-action@v4
-        with:
-          builder: ${{ steps.buildx.outputs.name }}
-          build-args: |
-            BUILD_TYPE=${{ matrix.build-type }}
-            CUDA_MAJOR_VERSION=${{ matrix.cuda-major-version }}
-            CUDA_MINOR_VERSION=${{ matrix.cuda-minor-version }}
-            FFMPEG=${{ matrix.ffmpeg }}
-          context: .
-          file: ./Dockerfile
-          platforms: ${{ matrix.platforms }}
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
+          - build-type: 'cublas'
+            cuda-major-version: "12"
+            cuda-minor-version: "1"
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg-core'
+            ffmpeg: 'true'
+            image-type: 'core'
+            runs-on: 'ubuntu-latest'
+            base-image: "ubuntu:22.04"
+            makeflags: "--jobs=4 --output-sync=target"
--- a/.github/workflows/image_build.yml
+++ b/.github/workflows/image_build.yml
@@ -0,0 +1,335 @@
+---
+name: 'build container images (reusable)'
+
+on:
+  workflow_call:
+    inputs:
+      base-image:
+        description: 'Base image'
+        required: true
+        type: string
+      grpc-base-image:
+        description: 'GRPC Base image, must be a compatible image with base-image'
+        required: false
+        default: ''
+        type: string
+      build-type:
+        description: 'Build type'
+        default: ''
+        type: string
+      cuda-major-version:
+        description: 'CUDA major version'
+        default: "11"
+        type: string
+      cuda-minor-version:
+        description: 'CUDA minor version'
+        default: "7"
+        type: string
+      platforms:
+        description: 'Platforms'
+        default: ''
+        type: string
+      tag-latest:
+        description: 'Tag latest'
+        default: ''
+        type: string
+      latest-image:
+          description: 'Tag latest'
+          default: ''
+          type: string
+      latest-image-aio:
+          description: 'Tag latest'
+          default: ''
+          type: string
+      tag-suffix:
+        description: 'Tag suffix'
+        default: ''
+        type: string
+      ffmpeg:
+        description: 'FFMPEG'
+        default: ''
+        type: string
+      image-type:
+        description: 'Image type'
+        default: ''
+        type: string
+      runs-on:
+        description: 'Runs on'
+        required: true
+        default: ''
+        type: string
+      makeflags:
+        description: 'Make Flags'
+        required: false
+        default: '--jobs=4 --output-sync=target'
+        type: string
+      aio:
+        description: 'AIO Image Name'
+        required: false
+        default: ''
+        type: string
+    secrets:
+      dockerUsername:
+        required: true
+      dockerPassword:
+        required: true
+      quayUsername:
+        required: true
+      quayPassword:
+        required: true
+jobs:
+  reusable_image-build:
+    runs-on: ${{ inputs.runs-on }}
+    steps:
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Release space from worker
+        if: inputs.runs-on == 'ubuntu-latest'
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get remove -y microsoft-edge-stable || true
+          sudo apt-get remove -y firefox || true
+          sudo apt-get remove -y powershell || true
+          sudo apt-get remove -y r-base-core || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          sudo rm -rf /usr/share/dotnet || true
+          sudo rm -rf /opt/ghc || true
+          sudo rm -rf "/usr/local/share/boost" || true
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
+          df -h
+
+      - name: Docker meta
+        id: meta
+        if: github.event_name != 'pull_request'
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/go-skynet/local-ai
+            localai/localai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.tag-suffix }}
+      - name: Docker meta for PR
+        id: meta_pull_request
+        if: github.event_name == 'pull_request'
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            ttl.sh/localai-ci-pr-${{ github.event.number }}
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.tag-suffix }}
+      - name: Docker meta AIO (quay.io)
+        if: inputs.aio != ''
+        id: meta_aio
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            quay.io/go-skynet/local-ai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+          flavor: |
+            latest=${{ inputs.tag-latest }}
+            suffix=${{ inputs.aio }}
+
+      - name: Docker meta AIO (dockerhub)
+        if: inputs.aio != ''
+        id: meta_aio_dockerhub
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            localai/localai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+          flavor: |
+            suffix=${{ inputs.aio }}
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@master
+        with:
+          platforms: all
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@master
+
+      - name: Login to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.dockerUsername }}
+          password: ${{ secrets.dockerPassword }}
+
+      - name: Login to DockerHub
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.quayUsername }}
+          password: ${{ secrets.quayPassword }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        if: github.event_name != 'pull_request'
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          # This is why some build args like GRPC_VERSION and MAKEFLAGS are hardcoded
+          build-args: |
+            BUILD_TYPE=${{ inputs.build-type }}
+            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
+            FFMPEG=${{ inputs.ffmpeg }}
+            IMAGE_TYPE=${{ inputs.image-type }}
+            BASE_IMAGE=${{ inputs.base-image }}
+            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.64.0
+            MAKEFLAGS=${{ inputs.makeflags }}
+          context: .
+          file: ./Dockerfile
+          cache-from: type=gha
+          platforms: ${{ inputs.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+### Start testing image
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        if: github.event_name == 'pull_request'
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          # The build-args MUST be an EXACT match between the image cache and other workflow steps that want to use that cache.
+          # This means that even the MAKEFLAGS have to be an EXACT match.
+          # If the build-args are not an EXACT match, it will result in a cache miss, which will require GRPC to be built from scratch.
+          # This is why some build args like GRPC_VERSION and MAKEFLAGS are hardcoded
+          build-args: |
+            BUILD_TYPE=${{ inputs.build-type }}
+            CUDA_MAJOR_VERSION=${{ inputs.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ inputs.cuda-minor-version }}
+            FFMPEG=${{ inputs.ffmpeg }}
+            IMAGE_TYPE=${{ inputs.image-type }}
+            BASE_IMAGE=${{ inputs.base-image }}
+            GRPC_BASE_IMAGE=${{ inputs.grpc-base-image || inputs.base-image }}
+            GRPC_MAKEFLAGS=--jobs=4 --output-sync=target
+            GRPC_VERSION=v1.64.0
+            MAKEFLAGS=${{ inputs.makeflags }}
+          context: .
+          file: ./Dockerfile
+          cache-from: type=gha
+          platforms: ${{ inputs.platforms }}
+          push: true
+          tags: ${{ steps.meta_pull_request.outputs.tags }}
+          labels: ${{ steps.meta_pull_request.outputs.labels }}
+      - name: Testing image
+        if: github.event_name == 'pull_request'
+        run: |
+          echo "Image is available at ttl.sh/localai-ci-pr-${{ github.event.number }}:${{ steps.meta_pull_request.outputs.version }}" >> $GITHUB_STEP_SUMMARY
+## End testing image
+      - name: Build and push AIO image
+        if: inputs.aio != ''
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BASE_IMAGE=quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
+            MAKEFLAGS=${{ inputs.makeflags }}
+          context: .
+          file: ./Dockerfile.aio
+          platforms: ${{ inputs.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta_aio.outputs.tags }}
+          labels: ${{ steps.meta_aio.outputs.labels }}
+
+      - name: Build and push AIO image (dockerhub)
+        if: inputs.aio != ''
+        uses: docker/build-push-action@v5
+        with:
+          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BASE_IMAGE=localai/localai:${{ steps.meta.outputs.version }}
+            MAKEFLAGS=${{ inputs.makeflags }}
+          context: .
+          file: ./Dockerfile.aio
+          platforms: ${{ inputs.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
+          labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
+
+      - name: Latest tag
+        # run this on branches, when it is a tag and there is a latest-image defined
+        if: github.event_name != 'pull_request' && inputs.latest-image != ''  && github.ref_type == 'tag'
+        run: |
+          docker pull localai/localai:${{ steps.meta.outputs.version }}
+          docker tag localai/localai:${{ steps.meta.outputs.version }} localai/localai:${{ inputs.latest-image }}
+          docker push localai/localai:${{ inputs.latest-image }}
+          docker pull quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }}
+          docker tag quay.io/go-skynet/local-ai:${{ steps.meta.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
+          docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image }}
+      - name: Latest AIO tag
+        # run this on branches, when it is a tag and there is a latest-image defined
+        if: github.event_name != 'pull_request' && inputs.latest-image-aio != ''  && github.ref_type == 'tag'
+        run: |
+          docker pull localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }}
+          docker tag localai/localai:${{ steps.meta_aio_dockerhub.outputs.version }} localai/localai:${{ inputs.latest-image-aio }}
+          docker push localai/localai:${{ inputs.latest-image-aio }}
+          docker pull quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }}
+          docker tag quay.io/go-skynet/local-ai:${{ steps.meta_aio.outputs.version }} quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
+          docker push quay.io/go-skynet/local-ai:${{ inputs.latest-image-aio }}
+  
+      - name: job summary
+        run: |
+          echo "Built image: ${{ steps.meta.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
+
+      - name: job summary(AIO)
+        if: inputs.aio != ''
+        run: |
+          echo "Built image: ${{ steps.meta_aio.outputs.labels }}" >> $GITHUB_STEP_SUMMARY
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -0,0 +1,12 @@
+name: "Pull Request Labeler"
+on:
+- pull_request_target
+
+jobs:
+  labeler:
+    permissions:
+      contents: read
+      pull-requests: write
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/labeler@v5
--- a/.github/workflows/localaibot_automerge.yml
+++ b/.github/workflows/localaibot_automerge.yml
@@ -0,0 +1,35 @@
+name: LocalAI-bot auto-merge
+on:
+- pull_request_target
+
+permissions:
+  contents: write
+  pull-requests: write
+  packages: read
+
+jobs:
+  dependabot:
+    runs-on: ubuntu-latest
+    if: ${{ github.actor == 'localai-bot' }}
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Approve a PR if not already approved
+        run: |
+          gh pr checkout "$PR_URL"
+            if [ "$(gh pr status --json reviewDecision -q .currentBranch.reviewDecision)" != "APPROVED" ];
+          then
+            gh pr review --approve "$PR_URL"
+          else
+            echo "PR already approved.";
+          fi
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
+
+      - name: Enable auto-merge for LocalAIBot PRs
+        run: gh pr merge --auto --squash "$PR_URL"
+        env:
+          PR_URL: ${{github.event.pull_request.html_url}}
+          GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -1,79 +1,285 @@
 name: Build and Release

-on: push
+on:
+- push
+- pull_request
+
+env:
+  GRPC_VERSION: v1.64.0

 permissions:
  contents: write

+concurrency:
+  group: ci-releases-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
 jobs:
-  build-linux:
-    strategy:
-      matrix:
-        include:
-          - build: 'avx2'
-            defines: ''
-          - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
-          - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON'
+
+  build-linux-arm:
    runs-on: ubuntu-latest
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
+
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
+          sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
+      - name: Install CUDA Dependencies
+        run: |
+          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/cross-linux-aarch64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-cross-aarch64 cuda-nvcc-cross-aarch64-${CUDA_VERSION} libcublas-cross-aarch64-${CUDA_VERSION}
+        env:
+          CUDA_VERSION: 12-4
+      - name: Cache grpc
+        id: cache-grpc
+        uses: actions/cache@v4
+        with:
+          path: grpc
+          key: ${{ runner.os }}-arm-grpc-${{ env.GRPC_VERSION }}
+      - name: Build grpc
+        if: steps.cache-grpc.outputs.cache-hit != 'true'
+        run: |
+
+          git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make --jobs 5 --output-sync=target
+      - name: Install gRPC
+        run: |
+          GNU_HOST=aarch64-linux-gnu
+          C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
+          CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
+
+          CROSS_TOOLCHAIN=/usr/$GNU_HOST
+          CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
+          CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
+
+          # https://cmake.org/cmake/help/v3.13/manual/cmake-toolchains.7.html#cross-compiling-for-linux
+          echo "set(CMAKE_SYSTEM_NAME Linux)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_SYSTEM_PROCESSOR arm)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_STAGING_PREFIX $CROSS_STAGING_PREFIX)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_SYSROOT ${CROSS_TOOLCHAIN}/sysroot)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_C_COMPILER /usr/bin/$C_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_CXX_COMPILER /usr/bin/$CXX_COMPILER_ARM_LINUX)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN && \
+            echo "set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)" >> $CMAKE_CROSS_TOOLCHAIN
+          GRPC_DIR=$PWD/grpc
+          cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install && \
+          GRPC_CROSS_BUILD_DIR=$GRPC_DIR/cmake/cross_build && \
+          mkdir -p $GRPC_CROSS_BUILD_DIR && \
+          cd $GRPC_CROSS_BUILD_DIR && \
+          cmake -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_INSTALL_PREFIX=$CROSS_TOOLCHAIN/grpc_install \
+            ../.. && \
+          sudo make -j`nproc` install
      - name: Build
        id: build
-        env:
-          CMAKE_ARGS: "${{ matrix.defines }}"
-          BUILD_ID: "${{ matrix.build }}"
        run: |
-          STATIC=true make dist
-      - uses: actions/upload-artifact@v3
+          GNU_HOST=aarch64-linux-gnu
+          C_COMPILER_ARM_LINUX=$GNU_HOST-gcc
+          CXX_COMPILER_ARM_LINUX=$GNU_HOST-g++
+
+          CROSS_TOOLCHAIN=/usr/$GNU_HOST
+          CROSS_STAGING_PREFIX=$CROSS_TOOLCHAIN/stage
+          CMAKE_CROSS_TOOLCHAIN=/tmp/arm.toolchain.cmake
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+          export PATH=$PATH:$GOPATH/bin
+          export PATH=/usr/local/cuda/bin:$PATH
+          GO_TAGS=p2p GOOS=linux GOARCH=arm64 CMAKE_ARGS="-DProtobuf_INCLUDE_DIRS=$CROSS_STAGING_PREFIX/include -DProtobuf_DIR=$CROSS_STAGING_PREFIX/lib/cmake/protobuf -DgRPC_DIR=$CROSS_STAGING_PREFIX/lib/cmake/grpc -DCMAKE_TOOLCHAIN_FILE=$CMAKE_CROSS_TOOLCHAIN -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++" make dist-cross-linux-arm64
+      - uses: actions/upload-artifact@v4
        with:
-          name: ${{ matrix.build }}
+          name: LocalAI-linux-arm64
          path: release/
      - name: Release
-        uses: softprops/action-gh-release@v1
+        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
            release/*

-  build-macOS:
-    strategy:
-      matrix:
-        include:
-          - build: 'avx2'
-            defines: ''
-          - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
-          - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON'
-    runs-on: macOS-latest
+  build-linux:
+    runs-on: arc-runner-set
    steps:
+      - name: Force Install GIT latest
+        run: |
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with:
          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache cmake
+      - name: Intel Dependencies
+        run: |
+          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+          echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
+          sudo apt update
+          sudo apt install -y intel-basekit
+      - name: Install CUDA Dependencies
+        run: |
+          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
+        env:
+          CUDA_VERSION: 12-3
+      - name: "Install Hipblas"
+        env:
+          ROCM_VERSION: "6.1"
+          AMDGPU_VERSION: "6.1"
+        run: |
+            set -ex 
+
+            sudo apt-get update
+            sudo DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends ca-certificates curl libnuma-dev gnupg 
+            
+            curl -sL https://repo.radeon.com/rocm/rocm.gpg.key | sudo apt-key add - 
+              
+            printf "deb [arch=amd64] https://repo.radeon.com/rocm/apt/$ROCM_VERSION/ jammy main" | sudo tee /etc/apt/sources.list.d/rocm.list
+            
+            printf "deb [arch=amd64] https://repo.radeon.com/amdgpu/$AMDGPU_VERSION/ubuntu jammy main" | sudo tee /etc/apt/sources.list.d/amdgpu.list
+            printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
+            sudo apt-get update
+
+            sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \
+                hipblas-dev rocm-dev \
+                rocblas-dev
+          
+            sudo apt-get clean
+            sudo rm -rf /var/lib/apt/lists/*
+            sudo ldconfig 
+      - name: Cache grpc
+        id: cache-grpc
+        uses: actions/cache@v4
+        with:
+          path: grpc
+          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
+      - name: Build grpc
+        if: steps.cache-grpc.outputs.cache-hit != 'true'
+        run: |
+          git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make --jobs 5 --output-sync=target
+      - name: Install gRPC
+        run: |
+          cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install
      - name: Build
        id: build
-        env:
-          CMAKE_ARGS: "${{ matrix.defines }}"
-          BUILD_ID: "${{ matrix.build }}"
        run: |
-          make dist
-      - uses: actions/upload-artifact@v3
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+          export PATH=$PATH:$GOPATH/bin
+          export PATH=/usr/local/cuda/bin:$PATH
+          export PATH=/opt/rocm/bin:$PATH
+          source /opt/intel/oneapi/setvars.sh
+          GO_TAGS=p2p make -j4 dist
+      - uses: actions/upload-artifact@v4
        with:
-          name: ${{ matrix.build }}
+          name: LocalAI-linux
          path: release/
      - name: Release
-        uses: softprops/action-gh-release@v1
+        uses: softprops/action-gh-release@v2
        if: startsWith(github.ref, 'refs/tags/')
        with:
          files: |
-            release/*
+            release/*
+
+  build-stablediffusion:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+      - name: Build stablediffusion
+        run: |
+          export PATH=$PATH:$GOPATH/bin
+          make backend-assets/grpc/stablediffusion
+          mkdir -p release && cp backend-assets/grpc/stablediffusion release
+        env:
+          GO_TAGS: stablediffusion
+      - uses: actions/upload-artifact@v4
+        with:
+          name: stablediffusion
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+
+  build-macOS-arm64:
+    runs-on: macos-14
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v5
+        with:
+          go-version: '1.21.x'
+          cache: false
+      - name: Setup tmate session if tests fail
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          limit-access-to-actor: true
+      - name: Dependencies
+        run: |
+          brew install protobuf grpc
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+      - name: Build
+        id: build
+        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
+          export PATH=$PATH:$GOPATH/bin
+          GO_TAGS=p2p make dist
+      - uses: actions/upload-artifact@v4
+        with:
+          name: LocalAI-MacOS-arm64
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v2
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
--- a/.github/workflows/secscan.yaml
+++ b/.github/workflows/secscan.yaml
@@ -0,0 +1,30 @@
+name: "Security Scan"
+
+# Run workflow each time code is pushed to your repository and on a schedule.
+# The scheduled workflow runs every at 00:00 on Sunday UTC time.
+on:
+  push:
+  schedule:
+  - cron: '0 0 * * 0'
+
+jobs:
+  tests:
+    runs-on: ubuntu-latest
+    env:
+      GO111MODULE: on
+    steps:
+      - name: Checkout Source
+        uses: actions/checkout@v4
+        if: ${{ github.actor != 'dependabot[bot]' }}
+      - name: Run Gosec Security Scanner
+        if: ${{ github.actor != 'dependabot[bot]' }}
+        uses: securego/gosec@master
+        with:
+          # we let the report trigger content trigger a failure using the GitHub Security features.
+          args: '-no-fail -fmt sarif -out results.sarif ./...'
+      - name: Upload SARIF file
+        if: ${{ github.actor != 'dependabot[bot]' }}
+        uses: github/codeql-action/upload-sarif@v3
+        with:
+          # Path to SARIF file relative to the root of the repository
+          sarif_file: results.sarif
--- a/.github/workflows/test-extra.yml
+++ b/.github/workflows/test-extra.yml
@@ -0,0 +1,321 @@
+---
+name: 'Tests extras backends'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-tests-extra-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  tests-transformers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.64.0
+          
+      - name: Test transformers
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/transformers
+           make --jobs=5 --output-sync=target -C backend/python/transformers test
+
+  tests-sentencetransformers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.64.0
+          
+      - name: Test sentencetransformers
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers
+           make --jobs=5 --output-sync=target -C backend/python/sentencetransformers test
+
+
+  tests-rerankers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.64.0
+
+      - name: Test rerankers
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/rerankers
+           make --jobs=5 --output-sync=target -C backend/python/rerankers test
+
+  tests-diffusers:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential ffmpeg
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          pip install --user grpcio-tools==1.64.0
+      - name: Test diffusers
+        run: |
+          make --jobs=5 --output-sync=target -C backend/python/diffusers
+          make --jobs=5 --output-sync=target -C backend/python/diffusers test
+
+  tests-parler-tts:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.64.0
+
+      - name: Test parler-tts
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts
+           make --jobs=5 --output-sync=target -C backend/python/parler-tts test
+  
+  tests-openvoice:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.64.0
+
+      - name: Test openvoice
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/openvoice
+           make --jobs=5 --output-sync=target -C backend/python/openvoice test
+
+  tests-transformers-musicgen:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.64.0
+
+      - name: Test transformers-musicgen
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen
+           make --jobs=5 --output-sync=target -C backend/python/transformers-musicgen test
+
+
+
+  # tests-petals:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v4
+  #       with: 
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install build-essential ffmpeg
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user grpcio-tools==1.64.0
+
+  #     - name: Test petals
+  #       run: |
+  #          make --jobs=5 --output-sync=target -C backend/python/petals
+  #          make --jobs=5 --output-sync=target -C backend/python/petals test
+
+           
+
+  # tests-bark:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Release space from worker
+  #       run: |
+  #           echo "Listing top largest packages"
+  #           pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+  #           head -n 30 <<< "${pkgs}"
+  #           echo
+  #           df -h
+  #           echo
+  #           sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+  #           sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+  #           sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+  #           sudo rm -rf /usr/local/lib/android
+  #           sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+  #           sudo rm -rf /usr/share/dotnet
+  #           sudo apt-get remove -y '^mono-.*' || true
+  #           sudo apt-get remove -y '^ghc-.*' || true
+  #           sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+  #           sudo apt-get remove -y 'php.*' || true
+  #           sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+  #           sudo apt-get remove -y '^google-.*' || true
+  #           sudo apt-get remove -y azure-cli || true
+  #           sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+  #           sudo apt-get remove -y '^gfortran-.*' || true
+  #           sudo apt-get remove -y microsoft-edge-stable || true
+  #           sudo apt-get remove -y firefox || true
+  #           sudo apt-get remove -y powershell || true
+  #           sudo apt-get remove -y r-base-core || true
+  #           sudo apt-get autoremove -y
+  #           sudo apt-get clean
+  #           echo
+  #           echo "Listing top largest packages"
+  #           pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+  #           head -n 30 <<< "${pkgs}"
+  #           echo
+  #           sudo rm -rfv build || true
+  #           sudo rm -rf /usr/share/dotnet || true
+  #           sudo rm -rf /opt/ghc || true
+  #           sudo rm -rf "/usr/local/share/boost" || true
+  #           sudo rm -rf "$AGENT_TOOLSDIRECTORY" || true
+  #           df -h
+  #     - name: Clone
+  #       uses: actions/checkout@v4
+  #       with: 
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install build-essential ffmpeg
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user grpcio-tools==1.64.0
+
+  #     - name: Test bark
+  #       run: |
+  #          make --jobs=5 --output-sync=target -C backend/python/bark
+  #          make --jobs=5 --output-sync=target -C backend/python/bark test
+
+           
+  # Below tests needs GPU. Commented out for now
+  # TODO: Re-enable as soon as we have GPU nodes
+  # tests-vllm:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v4
+  #       with: 
+  #         submodules: true
+  #     - name: Dependencies
+  #       run: |
+  #         sudo apt-get update
+  #         sudo apt-get install build-essential ffmpeg
+  #         # Install UV
+  #         curl -LsSf https://astral.sh/uv/install.sh | sh
+  #         sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+  #         sudo apt-get install -y libopencv-dev
+  #         pip install --user grpcio-tools==1.64.0
+  #     - name: Test vllm
+  #       run: |
+  #          make --jobs=5 --output-sync=target -C backend/python/vllm
+  #          make --jobs=5 --output-sync=target -C backend/python/vllm test
+  tests-vallex:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake curl patch python3-pip
+          sudo apt-get install -y libopencv-dev
+          pip install --user grpcio-tools==1.64.0
+      - name: Test vall-e-x
+        run: |
+           make --jobs=5 --output-sync=target -C backend/python/vall-e-x
+           make --jobs=5 --output-sync=target -C backend/python/vall-e-x test
+
+  tests-coqui:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install -y ca-certificates cmake curl patch espeak espeak-ng python3-pip
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          pip install --user grpcio-tools==1.64.0
+      - name: Test coqui
+        run: |
+          make --jobs=5 --output-sync=target -C backend/python/coqui
+          make --jobs=5 --output-sync=target -C backend/python/coqui test
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -9,36 +9,222 @@ on:
    tags:
      - '*'

+env:
+  GRPC_VERSION: v1.64.0
+
 concurrency:
  group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
  cancel-in-progress: true

 jobs:
-  ubuntu-latest:
+  tests-linux:
    runs-on: ubuntu-latest
-
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
    steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
      - name: Clone
-        uses: actions/checkout@v3
-        with: 
+        uses: actions/checkout@v4
+        with:
          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg
+          sudo apt-get install build-essential curl ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+             gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list' && \
+             sudo /bin/bash -c 'echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list' && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda
+          # Install UV
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
+          sudo apt-get install -y libopencv-dev
+
+          curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
+          unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+          rm protoc.zip
+
+          curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+          sudo dpkg -i cuda-keyring_1.1-1_all.deb
+          sudo apt-get update
+          sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
+          export CUDACXX=/usr/local/cuda/bin/nvcc
+
+          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.0
+          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@8ba23be9613c672d40ae261d2a1335d639bdd59b
+
+          # The python3-grpc-tools package in 22.04 is too old
+          pip install --user grpcio-tools
+
+          sudo rm -rfv /usr/bin/conda || true
+          PATH=$PATH:/opt/conda/bin make -C backend/python/sentencetransformers
+
+          # Pre-build piper before we start tests in order to have shared libraries in place
+          make sources/go-piper && \
+          GO_TAGS="tts" make -C sources/go-piper piper.o && \
+          sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \
+          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+        env:
+          CUDA_VERSION: 12-3
+      - name: Cache grpc
+        id: cache-grpc
+        uses: actions/cache@v4
+        with:
+          path: grpc
+          key: ${{ runner.os }}-grpc-${{ env.GRPC_VERSION }}
+      - name: Build grpc
+        if: steps.cache-grpc.outputs.cache-hit != 'true'
+        run: |
+          git clone --recurse-submodules -b ${{ env.GRPC_VERSION }} --depth 1 --jobs 5 --shallow-submodules https://github.com/grpc/grpc && \
+          cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+            -DgRPC_BUILD_TESTS=OFF \
+            ../.. && sudo make --jobs 5
+      - name: Install gRPC
+        run: |
+          cd grpc && cd cmake/build && sudo make --jobs 5 install
      - name: Test
        run: |
-          make test
+          PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" make --jobs 5 --output-sync=target test
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true

-  macOS-latest:
-    runs-on: macOS-latest
+  tests-aio-container:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - name: Build images
+        run: |
+          docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
+          BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
+      - name: Test
+        run: |
+          LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
+            make run-e2e-aio
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true

+  tests-apple:
+    runs-on: macOS-14
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v3
-        with: 
+        uses: actions/checkout@v4
+        with:
          submodules: true
-
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v5
+        with:
+          go-version: ${{ matrix.go-version }}
+          cache: false
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Dependencies
+        run: |
+          brew install protobuf grpc make protoc-gen-go protoc-gen-go-grpc
+          pip install --user grpcio-tools==1.64.0
      - name: Test
        run: |
-          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
+          # Used to run the newer GNUMake version from brew that supports --output-sync
+          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
+          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
+      - name: Setup tmate session if tests fail
+        if: ${{ failure() }}
+        uses: mxschmitt/action-tmate@v3.18
+        with:
+          detached: true
+          connect-timeout-seconds: 180
+          limit-access-to-actor: true
--- a/.github/workflows/update_swagger.yaml
+++ b/.github/workflows/update_swagger.yaml
@@ -0,0 +1,31 @@
+name: Update swagger
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  swagger:
+    strategy:
+      fail-fast: false
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-go@v5
+        with:
+          go-version: 'stable'
+      - run: |
+          go install github.com/swaggo/swag/cmd/swag@latest
+      - name: Bump swagger 🔧
+        run: |
+          make swagger
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v6
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: 'feat(swagger): update swagger'
+          title: 'feat(swagger): update swagger'
+          branch: "update/swagger"
+          body:  Update swagger
+          signoff: true
+
--- a/.github/workflows/yaml-check.yml
+++ b/.github/workflows/yaml-check.yml
@@ -0,0 +1,18 @@
+name: 'Yamllint GitHub Actions'
+on:
+  - pull_request
+jobs:
+  yamllint:
+    name: 'Yamllint'
+    runs-on: ubuntu-latest
+    steps:
+      - name: 'Checkout'
+        uses: actions/checkout@master
+      - name: 'Yamllint'
+        uses: karancode/yamllint-github-action@master
+        with:
+          yamllint_file_or_dir: 'gallery'
+          yamllint_strict: false
+          yamllint_comment: true
+        env:
+          GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -1,12 +1,20 @@
 # go-llama build artifacts
-go-llama
-gpt4all
-go-stable-diffusion
+/sources/
+__pycache__/
+*.a
+get-sources
+prepare-sources
+/backend/cpp/llama/grpc-server
+/backend/cpp/llama/llama.cpp
+/backend/cpp/llama-*
+
+*.log
+
 go-ggml-transformers
 go-gpt2
 go-rwkv
 whisper.cpp
-bloomz
+/bloomz
 go-bert

 # LocalAI build binary
@@ -14,6 +22,9 @@ LocalAI
 local-ai
 # prevent above rules from omitting the helm chart
 !charts/*
+# prevent above rules from omitting the api/localai folder
+!api/localai
+!core/**/localai

 # Ignore models
 models/*
@@ -27,6 +38,19 @@ release/
 .idea

 # Generated during build
-backend-assets/
+backend-assets/*
+!backend-assets/.keep
+prepare
+/ggml-metal.metal
+docs/static/gallery.html

-/ggml-metal.metal
+# Protobuf generated files
+*.pb.go
+*pb2.py
+*pb2_grpc.py
+
+# SonarQube
+.scannerwork
+
+# backend virtual environments
+**/venv
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "docs/themes/hugo-theme-relearn"]
+	path = docs/themes/hugo-theme-relearn
+	url = https://github.com/McShelby/hugo-theme-relearn.git
+[submodule "docs/themes/lotusdocs"]
+	path = docs/themes/lotusdocs
+	url = https://github.com/colinwilson/lotusdocs
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@@ -0,0 +1,5 @@
+{
+    "recommendations": [
+        "golang.go"
+    ]
+}
--- a/.yamllint
+++ b/.yamllint
@@ -0,0 +1,4 @@
+extends: default
+
+rules:
+    line-length: disable
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,88 @@
+# Contributing to LocalAI
+
+Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.
+
+## Table of Contents
+
+- [Getting Started](#getting-started)
+  - [Prerequisites](#prerequisites)
+  - [Setting up the Development Environment](#setting-up-the-development-environment)
+- [Contributing](#contributing)
+  - [Submitting an Issue](#submitting-an-issue)
+  - [Creating a Pull Request (PR)](#creating-a-pull-request-pr)
+- [Coding Guidelines](#coding-guidelines)
+- [Testing](#testing)
+- [Documentation](#documentation)
+- [Community and Communication](#community-and-communication)
+
+
+
+## Getting Started
+
+### Prerequisites
+
+- Golang [1.21]
+- Git
+- macOS/Linux
+
+### Setting up the Development Environment and running localAI in the local environment
+
+1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
+2. Navigate to the project directory: `cd LocalAI`
+3. Install the required dependencies ( see https://localai.io/basics/build/#build-localai-locally )
+4. Build LocalAI: `make build`
+5. Run LocalAI: `./local-ai`
+
+## Contributing
+
+We welcome contributions from everyone! To get started, follow these steps:
+
+### Submitting an Issue
+
+If you find a bug, have a feature request, or encounter any issues, please check the [issue tracker](https://github.com/go-skynet/LocalAI/issues) to see if a similar issue has already been reported. If not, feel free to [create a new issue](https://github.com/go-skynet/LocalAI/issues/new) and provide as much detail as possible.
+
+### Creating a Pull Request (PR)
+
+1. Fork the repository.
+2. Create a new branch with a descriptive name: `git checkout -b [branch name]`
+3. Make your changes and commit them.
+4. Push the changes to your fork: `git push origin [branch name]`
+5. Create a new pull request from your branch to the main project's `main` or `master` branch.
+6. Provide a clear description of your changes in the pull request.
+7. Make any requested changes during the review process.
+8. Once your PR is approved, it will be merged into the main project.
+
+## Coding Guidelines
+
+- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
+
+## Testing
+
+`make test` cannot handle all the model now. Please be sure to add a test case for the new features or the part was changed.
+
+### Running AIO tests
+
+All-In-One images has a set of tests that automatically verifies that most of the endpoints works correctly, a flow can be :
+
+```bash
+# Build the LocalAI docker image
+make DOCKER_IMAGE=local-ai docker
+
+# Build the corresponding AIO image
+BASE_IMAGE=local-ai DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
+
+# Run the AIO e2e tests
+LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio make run-e2e-aio
+```
+
+## Documentation
+
+We are welcome the contribution of the documents, please open new PR or create a new issue. The documentation is available under `docs/` https://github.com/mudler/LocalAI/tree/master/docs
+ 
+## Community and Communication
+
+- You can reach out via the Github issue tracker.
+- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
+- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
+
+---
--- a/379
+++ b/379
@@ -1,40 +1,235 @@
-ARG GO_VERSION=1.20-bullseye
+ARG IMAGE_TYPE=extras
+ARG BASE_IMAGE=ubuntu:22.04
+ARG GRPC_BASE_IMAGE=${BASE_IMAGE}
+ARG INTEL_BASE_IMAGE=${BASE_IMAGE}

-FROM golang:$GO_VERSION as requirements
+# The requirements-core target is common to all images.  It should not be placed in requirements-core unless every single build will use it.
+FROM ${BASE_IMAGE} AS requirements-core
+
+USER root
+
+ARG GO_VERSION=1.22.4
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,petals:/build/backend/python/petals/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
+
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        build-essential \
+        ccache \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        unzip && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install Go
+RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
+ENV PATH $PATH:/root/go/bin:/usr/local/go/bin
+
+# Install grpc compilers
+RUN go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.1 && \
+    go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
+
+COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
+RUN update-ca-certificates
+
+# Use the variables in subsequent instructions
+RUN echo "Target Architecture: $TARGETARCH"
+RUN echo "Target Variant: $TARGETVARIANT"
+
+# Cuda
+ENV PATH /usr/local/cuda/bin:${PATH}
+
+# HipBLAS requirements
+ENV PATH /opt/rocm/bin:${PATH}
+
+# OpenBLAS requirements and stable diffusion
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libopenblas-dev \
+        libopencv-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Set up OpenCV
+RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+
+WORKDIR /build
+
+RUN test -n "$TARGETARCH" \
+    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
+
+###################################
+###################################
+
+# The requirements-extras target is for any builds with IMAGE_TYPE=extras. It should not be placed in this target unless every IMAGE_TYPE=extras build will use it
+FROM requirements-core AS requirements-extras
+
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        espeak-ng \
+        espeak \
+        python3-pip \
+        python-is-python3 \
+        python3-dev \
+        python3-venv && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip install --upgrade pip
+
+# Install grpcio-tools (the version in 22.04 is too old)
+RUN pip install --user grpcio-tools
+
+###################################
+###################################
+
+# The requirements-drivers target is for BUILD_TYPE specific items.  If you need to install something specific to CUDA, or specific to ROCM, it goes here.
+# This target will be built on top of requirements-core or requirements-extras as retermined by the IMAGE_TYPE build-arg
+FROM requirements-${IMAGE_TYPE} AS requirements-drivers

 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=11
-ARG CUDA_MINOR_VERSION=7
+ARG CUDA_MINOR_VERSION=8

 ENV BUILD_TYPE=${BUILD_TYPE}

-RUN apt-get update && \
-    apt-get install -y ca-certificates cmake curl patch
-
 # CuBLAS requirements
+RUN <<EOT bash
+    if [ "${BUILD_TYPE}" = "cublas" ]; then
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+                        software-properties-common pciutils
+        if [ "amd64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+            fi
+        if [ "arm64" = "$TARGETARCH" ]; then
+            curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/arm64/cuda-keyring_1.1-1_all.deb
+        fi
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+            rm -f cuda-keyring_1.1-1_all.deb && \
+            apt-get update && \
+            apt-get install -y --no-install-recommends \
+                cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+                libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+            apt-get clean && \
+        rm -rf /var/lib/apt/lists/*
+    fi
+EOT
+
 RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
-    apt-get install -y software-properties-common && \
-    apt-add-repository contrib && \
-    curl -O https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
-    dpkg -i cuda-keyring_1.0-1_all.deb && \
-    rm -f cuda-keyring_1.0-1_all.deb && \
-    apt-get update && \
-    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+        apt-get update && \
+        apt-get install -y  --no-install-recommends \
+            software-properties-common pciutils && \
+        curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
+        dpkg -i cuda-keyring_1.1-1_all.deb && \
+        rm -f cuda-keyring_1.1-1_all.deb && \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcufft-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcurand-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} \
+            libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
    ; fi
-ENV PATH /usr/local/cuda/bin:${PATH}

-# OpenBLAS requirements
-RUN apt-get install -y libopenblas-dev
+# If we are building with clblas support, we need the libraries for the builds
+RUN if [ "${BUILD_TYPE}" = "clblas" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            libclblast-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
+    ; fi

-# Stable Diffusion requirements
-RUN apt-get install -y libopencv-dev && \
-    ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+RUN if [ "${BUILD_TYPE}" = "hipblas" ]; then \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            hipblas-dev \
+            rocblas-dev && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* && \
+        # I have no idea why, but the ROCM lib packages don't trigger ldconfig after they install, which results in local-ai and others not being able
+        # to locate the libraries. We run ldconfig ourselves to work around this packaging deficiency
+        ldconfig \
+    ; fi

-FROM requirements as builder
+###################################
+###################################

-ARG GO_TAGS=stablediffusion
+# Temporary workaround for Intel's repository to work correctly
+# https://community.intel.com/t5/Intel-oneAPI-Math-Kernel-Library/APT-Repository-not-working-signatures-invalid/m-p/1599436/highlight/true#M36143
+# This is a temporary workaround until Intel fixes their repository
+FROM ${INTEL_BASE_IMAGE} AS intel
+RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | \
+gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
+RUN echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" > /etc/apt/sources.list.d/intel-graphics.list

+###################################
+###################################
+
+# The grpc target does one thing, it builds and installs GRPC.  This is in it's own layer so that it can be effectively cached by CI.
+# You probably don't need to change anything here, and if you do, make sure that CI is adjusted so that the cache continues to work.
+FROM ${GRPC_BASE_IMAGE} AS grpc
+
+# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
+ARG GRPC_MAKEFLAGS="-j4 -Otarget"
+ARG GRPC_VERSION=v1.64.2
+
+ENV MAKEFLAGS=${GRPC_MAKEFLAGS}
+
+WORKDIR /build
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates \
+        build-essential \
+        cmake \
+        git && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
+# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
+# and running make install in the target container
+RUN git clone --recurse-submodules --jobs 4 -b ${GRPC_VERSION} --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    mkdir -p /build/grpc/cmake/build && \
+    cd /build/grpc/cmake/build && \
+    cmake -DgRPC_INSTALL=ON -DgRPC_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX:PATH=/opt/grpc ../.. && \
+    make && \
+    make install && \
+    rm -rf /build
+
+###################################
+###################################
+
+# The builder target compiles LocalAI. This target is not the target that will be uploaded to the registry.
+# Adjustments to the build process should likely be made here.
+FROM requirements-drivers AS builder
+
+ARG GO_TAGS="stablediffusion tts p2p"
+ARG GRPC_BACKENDS
+ARG MAKEFLAGS
+
+ENV GRPC_BACKENDS=${GRPC_BACKENDS}
 ENV GO_TAGS=${GO_TAGS}
+ENV MAKEFLAGS=${MAKEFLAGS}
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
 ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
 ENV NVIDIA_VISIBLE_DEVICES=all
@@ -42,29 +237,161 @@ ENV NVIDIA_VISIBLE_DEVICES=all
 WORKDIR /build

 COPY . .
+COPY .git .
+RUN echo "GO_TAGS: $GO_TAGS"
+
+RUN make prepare
+
+# We need protoc installed, and the version in 22.04 is too old.  We will create one as part installing the GRPC build below
+# but that will also being in a newer version of absl which stablediffusion cannot compile with.  This version of protoc is only
+# here so that we can generate the grpc code for the stablediffusion build
+RUN <<EOT bash
+    if [ "amd64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-x86_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+    if [ "arm64" = "$TARGETARCH" ]; then
+        curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v27.1/protoc-27.1-linux-aarch_64.zip -o protoc.zip && \
+        unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
+        rm protoc.zip
+    fi
+EOT
+
+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+# Install the pre-built GRPC
+COPY --from=grpc /opt/grpc /usr/local
+
+# Rebuild with defaults backends
+WORKDIR /build
 RUN make build

-FROM requirements
+RUN if [ ! -d "/build/sources/go-piper/piper-phonemize/pi/lib/" ]; then \
+        mkdir -p /build/sources/go-piper/piper-phonemize/pi/lib/ \
+        touch /build/sources/go-piper/piper-phonemize/pi/lib/keep \
+    ; fi
+
+###################################
+###################################
+
+# This is the final target. The result of this target will be the image uploaded to the registry.
+# If you cannot find a more suitable place for an addition, this layer is a suitable place for it.
+FROM requirements-drivers

 ARG FFMPEG
+ARG BUILD_TYPE
+ARG TARGETARCH
+ARG IMAGE_TYPE=extras
+ARG EXTRA_BACKENDS
+ARG MAKEFLAGS

-ENV REBUILD=true
+ENV BUILD_TYPE=${BUILD_TYPE}
+ENV REBUILD=false
 ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
+ENV MAKEFLAGS=${MAKEFLAGS}
+
+ARG CUDA_MAJOR_VERSION=11
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
+ENV NVIDIA_VISIBLE_DEVICES=all

 # Add FFmpeg
 RUN if [ "${FFMPEG}" = "true" ]; then \
-    apt-get install -y ffmpeg \
+        apt-get update && \
+        apt-get install -y --no-install-recommends \
+            ffmpeg && \
+        apt-get clean && \
+        rm -rf /var/lib/apt/lists/* \
    ; fi

 WORKDIR /build

+# we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
+# so when `entrypoint.sh` runs `make build` again (which it does by default), the build would fail
+# see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
+# https://github.com/go-skynet/LocalAI/pull/434
 COPY . .
+
+COPY --from=builder /build/sources ./sources/
+COPY --from=grpc /opt/grpc /usr/local
+
 RUN make prepare-sources
+
+# Copy the binary
 COPY --from=builder /build/local-ai ./

+# Copy shared libraries for piper
+COPY --from=builder /build/sources/go-piper/piper-phonemize/pi/lib/* /usr/lib/
+
+# do not let stablediffusion rebuild (requires an older version of absl)
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
+
+# Change the shell to bash so we can use [[ tests below
+SHELL ["/bin/bash", "-c"]
+# We try to strike a balance between individual layer size (as that affects total push time) and total image size
+# Splitting the backends into more groups with fewer items results in a larger image, but a smaller size for the largest layer
+# Splitting the backends into fewer groups with more items results in a smaller image, but a larger size for the largest layer
+
+RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/coqui \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "parler-tts" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/parler-tts \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "diffusers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/diffusers \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/transformers-musicgen \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/exllama \
+    ; fi
+
+RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/vall-e-x \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "openvoice" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/openvoice \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "petals" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/petals \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "sentencetransformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/sentencetransformers \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "exllama2" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/exllama2 \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "transformers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/transformers \
+    ; fi
+
+RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/vllm \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/autogptq \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/bark \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "rerankers" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/rerankers \
+    ; fi && \
+    if [[ ( "${EXTRA_BACKENDS}" =~ "mamba" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
+        make -C backend/python/mamba \
+    ; fi
+
+# Make sure the models directory exists
+RUN mkdir -p /build/models
+
 # Define the health check command
 HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
-  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
+  CMD curl -f ${HEALTHCHECK_ENDPOINT} || exit 1

+VOLUME /build/models
 EXPOSE 8080
-ENTRYPOINT [ "/build/entrypoint.sh" ]
+ENTRYPOINT [ "/build/entrypoint.sh" ]
--- a/Dockerfile.aio
+++ b/Dockerfile.aio
@@ -0,0 +1,8 @@
+ARG BASE_IMAGE=ubuntu:22.04
+
+FROM ${BASE_IMAGE} 
+
+RUN apt-get update && apt-get install -y pciutils && apt-get clean
+
+COPY aio/ /aio
+ENTRYPOINT [ "/aio/entrypoint.sh" ]
--- a/Entitlements.plist
+++ b/Entitlements.plist
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>com.apple.security.network.client</key>
+    <true/>
+    <key>com.apple.security.network.server</key>
+    <true/>
+</dict>
+</plist>
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Ettore Di Giacinto
+Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/911
+++ b/911
--- a/README.md
+++ b/README.md
@@ -1,205 +1,192 @@
 <h1 align="center">
  <br>
-  <img height="300" src="https://user-images.githubusercontent.com/2420543/233147843-88697415-6dbf-4368-a862-ab217f9f7342.jpeg"> <br>
+  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
    LocalAI
 <br>
 </h1>

-[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml) [![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)
+<p align="center">
+<a href="https://github.com/go-skynet/LocalAI/fork" target="blank">
+<img src="https://img.shields.io/github/forks/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI forks"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/stargazers" target="blank">
+<img src="https://img.shields.io/github/stars/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI stars"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/pulls" target="blank">
+<img src="https://img.shields.io/github/issues-pr/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI pull-requests"/>
+</a>
+<a href='https://github.com/go-skynet/LocalAI/releases'>
+<img src='https://img.shields.io/github/release/go-skynet/LocalAI?&label=Latest&style=for-the-badge'>
+</a>
+</p>

-[![](https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted)](https://discord.gg/uJAeKSAGDy) 
+<p align="center">
+<a href="https://hub.docker.com/r/localai/localai" target="blank">
+<img src="https://img.shields.io/badge/dockerhub-images-important.svg?logo=Docker" alt="LocalAI Docker hub"/>
+</a>
+<a href="https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest" target="blank">
+<img src="https://img.shields.io/badge/quay.io-images-important.svg?" alt="LocalAI Quay.io"/>
+</a>
+</p>

-**LocalAI** is a drop-in replacement REST API that’s compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families that are compatible with the ggml format. Does not require GPU.
+<p align="center">
+<a href="https://twitter.com/LocalAI_API" target="blank">
+<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
+</a>
+<a href="https://discord.gg/uJAeKSAGDy" target="blank">
+<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
+</a>
+</p>

-For a list of the supported model families, please see [the model compatibility table](https://localai.io/model-compatibility/index.html#model-compatibility-table).
+> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
+>
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/) [ 🚀 Roadmap ](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

-In a nutshell:
+[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

- Local, OpenAI drop-in alternative REST API. You own your data.
- NO GPU required. NO Internet access is required either. Optional, GPU Acceleration is available in `llama.cpp`-compatible LLMs. [See building instructions](https://localai.io/basics/build/index.html).
- Supports multiple models, Audio transcription, Text generation with GPTs, Image generation with stable diffusion (experimental)
- Once loaded the first time, it keep models loaded in memory for faster inference
- Doesn't shell-out, but uses C++ bindings for a faster inference and better performance. 
+**LocalAI** is the free, Open Source OpenAI alternative. LocalAI act as a drop-in replacement REST API that’s compatible with OpenAI (Elevenlabs, Anthropic... ) API specifications for local AI inferencing. It allows you to run LLMs, generate images, audio (and not only) locally or on-prem with consumer grade hardware, supporting multiple model families. Does not require GPU. It is created and maintained by [Ettore Di Giacinto](https://github.com/mudler).

-LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making the AI accessible to anyone. Any contribution, feedback and PR is welcome!
-
-| [ChatGPT OSS alternative](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui)                                                                                                                | [Image generation](https://localai.io/api-endpoints/index.html#image-generation)                                                                                                              |
-|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
-|  ![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)            | ![b6441997879](https://github.com/go-skynet/LocalAI/assets/2420543/d50af51c-51b7-4f39-b6c2-bf04c403894c)                  |
-
-
-See the [Getting started](https://localai.io/basics/getting_started/index.html) and [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/) sections to learn how to use LocalAI. For a list of curated models check out the [model gallery](https://localai.io/models/).
-
-## News
-
- 🔥🔥🔥 06-06-2023: **v1.18.0**: Many updates, new features, and much more 🚀, check out the [Changelog](https://localai.io/basics/news/index.html#-06-06-2023-__v1180__-)!
- 29-05-2023: LocalAI now has a website, [https://localai.io](https://localai.io)! check the news in the [dedicated section](https://localai.io/basics/news/index.html)!
-
-For latest news, follow also on Twitter [@LocalAI_API](https://twitter.com/LocalAI_API) and [@mudler_it](https://twitter.com/mudler_it)
-
-## Contribute and help
-
-To help the project you can:
-
- Upvote the [Reddit post](https://www.reddit.com/r/selfhosted/comments/12w4p2f/localai_openai_compatible_api_to_run_llm_models/) about LocalAI.
-
- [Hacker news post](https://news.ycombinator.com/item?id=35726934) - help us out by voting if you like this project.
-
- If you have technological skills and want to contribute to development, have a look at the open issues. If you are new you can have a look at the [good-first-issue](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) and [help-wanted](https://github.com/go-skynet/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels.
-
- If you don't have technological skills you can still help improving documentation or add examples or share your user-stories with our community, any help and contribution is welcome!
-
-## Usage
-
-Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section. Here below you will find generic, quick instructions to get ready and use LocalAI.
-
-The easiest way to run LocalAI is by using `docker-compose` (to build locally, see [building LocalAI](https://localai.io/basics/build/index.html)):
+![screen](https://github.com/mudler/LocalAI/assets/2420543/20b5ccd2-8393-44f0-aaf6-87a23806381e)

 ```bash
-
-git clone https://github.com/go-skynet/LocalAI
-
-cd LocalAI
-
-# (optional) Checkout a specific LocalAI tag
-# git checkout -b build <TAG>
-
-# copy your models to models/
-cp your-model.bin models/
-
-# (optional) Edit the .env file to set things like context size and threads
-# vim .env
-
-# start with docker-compose
-docker-compose up -d --pull always
-# or you can build the images with:
-# docker-compose up -d --build
-
-# Now API is accessible at localhost:8080
-curl http://localhost:8080/v1/models
-# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
-
-curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
-     "model": "your-model.bin",            
-     "prompt": "A long time ago in a galaxy far, far away",
-     "temperature": 0.7
-   }'
+docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
+# Alternative images:
+# - if you have an Nvidia GPU:
+# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
+# - without preconfigured models
+# docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
+# - without preconfigured models for Nvidia GPUs
+# docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12 
 ```

-### Example: Use GPT4ALL-J model
+[💻 Getting started](https://localai.io/basics/getting_started/index.html)

-<details>
+## 🔥🔥 Hot topics / Roadmap

-```bash
-# Clone LocalAI
-git clone https://github.com/go-skynet/LocalAI
+[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

-cd LocalAI
+- 🔥🔥 Decentralized llama.cpp:  https://github.com/mudler/LocalAI/pull/2343 (peer2peer llama.cpp!) 👉 Docs  https://localai.io/features/distribute/
+- 🔥🔥 Openvoice: https://github.com/mudler/LocalAI/pull/2334
+- 🆕 Function calls without grammars and mixed mode: https://github.com/mudler/LocalAI/pull/2328
+- 🔥🔥 Distributed inferencing: https://github.com/mudler/LocalAI/pull/2324
+- Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
+- Reranker API: https://github.com/mudler/LocalAI/pull/2121

-# (optional) Checkout a specific LocalAI tag
-# git checkout -b build <TAG>
+Hot topics (looking for contributors):

-# Download gpt4all-j to models/
-wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
+- Backends v2: https://github.com/mudler/LocalAI/issues/1126
+- Improving UX v2: https://github.com/mudler/LocalAI/issues/1373
+- Assistant API: https://github.com/mudler/LocalAI/issues/1273
+- Moderation endpoint: https://github.com/mudler/LocalAI/issues/999
+- Vulkan: https://github.com/mudler/LocalAI/issues/1647

-# Use a template from the examples
-cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/
+If you want to help and contribute, issues up for grabs: https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3A%22up+for+grabs%22

-# (optional) Edit the .env file to set things like context size and threads
-# vim .env
+## 🚀 [Features](https://localai.io/features/)

-# start with docker-compose
-docker-compose up -d --pull always
-# or you can build the images with:
-# docker-compose up -d --build
-# Now API is accessible at localhost:8080
-curl http://localhost:8080/v1/models
-# {"object":"list","data":[{"id":"ggml-gpt4all-j","object":"model"}]}
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
+- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
+- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
+- 🔥 [OpenAI-alike tools API](https://localai.io/features/openai-functions/) 
+- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
+- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
+- 🖼️ [Download Models directly from Huggingface ](https://localai.io/models/)
+- 🥽 [Vision API](https://localai.io/features/gpt-vision/)
+- 📈 [Reranker API](https://localai.io/features/reranker/)
+- 🆕🖧 [P2P Inferencing](https://localai.io/features/distribute/)

-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-gpt4all-j",
-     "messages": [{"role": "user", "content": "How are you?"}],
-     "temperature": 0.9 
-   }'
+## 💻 Usage

-# {"model":"ggml-gpt4all-j","choices":[{"message":{"role":"assistant","content":"I'm doing well, thanks. How about you?"}}]}
-```
-</details>
+Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

+### 🔗 Community and integrations

-### Build locally
+Build and deploy custom containers:
+- https://github.com/sozercan/aikit

-<details>
+WebUIs:
+- https://github.com/Jirubizu/localai-admin
+- https://github.com/go-skynet/LocalAI-frontend
+- QA-Pilot(An interactive chat project that leverages LocalAI LLMs for rapid understanding and navigation of GitHub code repository) https://github.com/reid41/QA-Pilot

-In order to build the `LocalAI` container image locally you can use `docker`:
+Model galleries
+- https://github.com/go-skynet/model-gallery
+
+Other:
+- Helm chart https://github.com/go-skynet/helm-charts
+- VSCode extension https://github.com/badgooooor/localai-vscode-plugin
+- Terminal utility https://github.com/djcopley/ShellOracle
+- Local Smart assistant https://github.com/mudler/LocalAGI
+- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
+- Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
+- Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
+- Shell-Pilot(Interact with LLM using LocalAI models via pure shell scripts on your Linux or MacOS system) https://github.com/reid41/shell-pilot
+- Telegram bot https://github.com/mudler/LocalAI/tree/master/examples/telegram-bot
+- Examples: https://github.com/mudler/LocalAI/tree/master/examples/
+  
+
+### 🔗 Resources
+
+- [LLM finetuning guide](https://localai.io/docs/advanced/fine-tuning/)
+- [How to build locally](https://localai.io/basics/build/index.html)
+- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
+- [Projects integrating LocalAI](https://localai.io/docs/integrations/)
+- [How tos section](https://io.midori-ai.xyz/howtos/) (curated by our community)
+
+## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)
+
+- 🆕 [Run LocalAI on Jetson Nano Devkit](https://mudler.pm/posts/local-ai-jetson-nano-devkit/)
+- [Run LocalAI on AWS EKS with Pulumi](https://www.pulumi.com/blog/low-code-llm-apps-with-local-ai-flowise-and-pulumi/)
+- [Run LocalAI on AWS](https://staleks.hashnode.dev/installing-localai-on-aws-ec2-instance)
+- [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
+- [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
+- [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
+- [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)
+
+## Citation
+
+If you utilize this repository, data in a downstream project, please consider citing it with:

 ```
-# build the image
-docker build -t localai .
-docker run localai
+@misc{localai,
+  author = {Ettore Di Giacinto},
+  title = {LocalAI: The free, Open source OpenAI alternative},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/go-skynet/LocalAI}},
 ```

-Or you can build the binary with `make`:
+## ❤️ Sponsors

-```
-make build
-```
+> Do you find LocalAI useful?

-</details>
+Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.

-See the [build section](https://localai.io/basics/build/index.html) in our documentation for detailed instructions.
+A huge thank you to our generous sponsors who support this project covering CI expenses, and our [Sponsor list](https://github.com/sponsors/mudler):

-### Run LocalAI in Kubernetes
+<p align="center">
+  <a href="https://www.spectrocloud.com/" target="blank">
+    <img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
+  </a>
+  <a href="https://www.premai.io/" target="blank">
+    <img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
+  </a>
+</p>

-LocalAI can be installed inside Kubernetes with helm. See [installation instructions](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes).
-
-## Supported API endpoints
-
-See the [list of the supported API endpoints](https://localai.io/api-endpoints/index.html) and how to configure image generation and audio transcription.
-
-## Frequently asked questions
-
-See [the FAQ](https://localai.io/faq/index.html) section for a list of common questions.
-
-## Projects already using LocalAI to run local models
-
-Feel free to open up a PR to get your project listed!
-
- [Kairos](https://github.com/kairos-io/kairos)
- [k8sgpt](https://github.com/k8sgpt-ai/k8sgpt#running-local-models)
- [Spark](https://github.com/cedriking/spark)
- [autogpt4all](https://github.com/aorumbayev/autogpt4all)
- [Mods](https://github.com/charmbracelet/mods)
- [Flowise](https://github.com/FlowiseAI/Flowise)
-
-## Short-term roadmap
-
- [x] Mimic OpenAI API (https://github.com/go-skynet/LocalAI/issues/10)
- [ ] Binary releases (https://github.com/go-skynet/LocalAI/issues/6)
- [ ] Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351) and [gpt4all](https://github.com/go-skynet/LocalAI/issues/85)
- [x] Multi-model support
- [x] Have a webUI!
- [x] Allow configuration of defaults for models.
- [x] Support for embeddings
- [x] Support for audio transcription with https://github.com/ggerganov/whisper.cpp
- [ ] GPU/CUDA support ( https://github.com/go-skynet/LocalAI/issues/69 )
- [ ] Enable automatic downloading of models from a curated gallery, with only free-licensed models, directly from the webui.
-
-## Star history
+## 🌟 Star history

 [![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)

-## License
+## 📖 License

 LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).

-MIT
+MIT - Author Ettore Di Giacinto <mudler@localai.io>

-## Author
-
-Ettore Di Giacinto and others
-
-## Acknowledgements
+## 🙇 Acknowledgements

 LocalAI couldn't have been built without the help of great software already available from the community. Thank you!

@@ -210,9 +197,11 @@ LocalAI couldn't have been built without the help of great software already avai
 - https://github.com/EdVince/Stable-Diffusion-NCNN
 - https://github.com/ggerganov/whisper.cpp
 - https://github.com/saharNooby/rwkv.cpp
+- https://github.com/rhasspy/piper

-## Contributors
+## 🤗 Contributors

+This is a community project, a special thanks to our contributors! 🤗
 <a href="https://github.com/go-skynet/LocalAI/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=go-skynet/LocalAI" />
 </a>
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -0,0 +1,42 @@
+# Security Policy
+
+## Introduction
+
+At LocalAI, we take the security of our software seriously. We understand the importance of protecting our community from vulnerabilities and are committed to ensuring the safety and security of our users.
+
+## Supported Versions
+
+We provide support and updates for certain versions of our software. The following table outlines which versions are currently supported with security updates:
+
+| Version | Supported          |
+| ------- | ------------------ |
+| > 2.0   | :white_check_mark: |
+| < 2.0   | :x:                |
+
+Please ensure that you are using a supported version to receive the latest security updates.
+
+## Reporting a Vulnerability
+
+We encourage the responsible disclosure of any security vulnerabilities. If you believe you've found a security issue in our software, we kindly ask you to follow the steps below to report it to us:
+
+1. **Email Us:** Send an email to [security@localai.io](mailto:security@localai.io) with a detailed report. Please do not disclose the vulnerability publicly or to any third parties before it has been addressed by us.
+
+2. **Expect a Response:** We aim to acknowledge receipt of vulnerability reports within 48 hours. Our security team will review your report and work closely with you to understand the impact and ensure a thorough investigation.
+
+3. **Collaboration:** If the vulnerability is accepted, we will work with you and our community to address the issue promptly. We'll keep you informed throughout the resolution process and may request additional information or collaboration.
+
+4. **Disclosure:** Once the vulnerability has been resolved, we encourage a coordinated disclosure. We believe in transparency and will work with you to ensure that our community is informed in a responsible manner.
+
+## Use of Third-Party Platforms
+
+As a Free and Open Source Software (FOSS) organization, we do not offer monetary bounties. However, researchers who wish to report vulnerabilities can also do so via [Huntr](https://huntr.dev/bounties), a platform that recognizes contributions to open source security.
+
+## Contact
+
+For any security-related inquiries beyond vulnerability reporting, please contact us at [security@localai.io](mailto:security@localai.io).
+
+## Acknowledgments
+
+We appreciate the efforts of those who contribute to the security of our project. Your responsible disclosure is invaluable to the safety and integrity of LocalAI.
+
+Thank you for helping us keep LocalAI secure.
--- a/aio/cpu/README.md
+++ b/aio/cpu/README.md
@@ -0,0 +1,5 @@
+## AIO CPU size
+
+Use this image with CPU-only.
+
+Please keep using only C++ backends so the base image is as small as possible (without CUDA, cuDNN, python, etc).
--- a/aio/cpu/embeddings.yaml
+++ b/aio/cpu/embeddings.yaml
@@ -0,0 +1,12 @@
+name: text-embedding-ada-002
+backend: bert-embeddings
+parameters:
+  model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
+      "input": "Your text string goes here",
+      "model": "text-embedding-ada-002"
+    }'
--- a/aio/cpu/image-gen.yaml
+++ b/aio/cpu/image-gen.yaml
@@ -0,0 +1,62 @@
+name: stablediffusion
+backend: stablediffusion
+parameters:
+  model: stablediffusion_assets
+
+license: "BSD-3"
+urls:
+- https://github.com/EdVince/Stable-Diffusion-NCNN
+- https://github.com/EdVince/Stable-Diffusion-NCNN/blob/main/LICENSE
+
+description: |
+     Stable Diffusion in NCNN with c++, supported txt2img and img2img
+
+download_files:
+- filename: "stablediffusion_assets/AutoencoderKL-256-256-fp16-opt.param"
+  sha256: "18ca4b66685e21406bcf64c484b3b680b4949900415536d599cc876579c85c82"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-256-256-fp16-opt.param"
+- filename: "stablediffusion_assets/AutoencoderKL-512-512-fp16-opt.param"
+  sha256: "cf45f63aacf3dbbab0f59ed92a6f2c14d9a1801314631cd3abe91e3c85639a20"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-512-512-fp16-opt.param"
+- filename: "stablediffusion_assets/AutoencoderKL-base-fp16.param"
+  sha256: "0254a056dce61b0c27dc9ec1b78b53bcf55315c540f55f051eb841aa992701ba"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/AutoencoderKL-base-fp16.param"
+- filename: "stablediffusion_assets/AutoencoderKL-encoder-512-512-fp16.bin"
+  sha256: "ddcb79a9951b9f91e05e087739ed69da2c1c4ae30ba4168cce350b49d617c9fa"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-encoder-512-512-fp16.bin"
+- filename: "stablediffusion_assets/AutoencoderKL-fp16.bin"
+  sha256: "f02e71f80e70252734724bbfaed5c4ddd3a8ed7e61bb2175ff5f53099f0e35dd"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/AutoencoderKL-fp16.bin"
+- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.bin"
+  sha256: "1c9a12f4e1dd1b295a388045f7f28a2352a4d70c3dc96a542189a3dd7051fdd6"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/FrozenCLIPEmbedder-fp16.bin"
+- filename: "stablediffusion_assets/FrozenCLIPEmbedder-fp16.param"
+  sha256: "471afbe678dd1fd3fe764ef9c6eccaccb0a7d7e601f27b462aa926b20eb368c9"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/FrozenCLIPEmbedder-fp16.param"
+- filename: "stablediffusion_assets/log_sigmas.bin"
+  sha256: "a2089f8aa4c61f9c200feaec541ab3f5c94233b28deb6d5e8bcd974fa79b68ac"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/raw/main/x86/linux/assets/log_sigmas.bin"
+- filename: "stablediffusion_assets/UNetModel-256-256-MHA-fp16-opt.param"
+  sha256: "a58c380229f09491776df837b7aa7adffc0a87821dc4708b34535da2e36e3da1"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-256-256-MHA-fp16-opt.param"
+- filename: "stablediffusion_assets/UNetModel-512-512-MHA-fp16-opt.param"
+  sha256: "f12034067062827bd7f43d1d21888d1f03905401acf6c6eea22be23c259636fa"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-512-512-MHA-fp16-opt.param"
+- filename: "stablediffusion_assets/UNetModel-base-MHA-fp16.param"
+  sha256: "696f6975de49f4325b53ce32aff81861a6d6c07cd9ce3f0aae2cc405350af38d"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/UNetModel-base-MHA-fp16.param"
+- filename: "stablediffusion_assets/UNetModel-MHA-fp16.bin"
+  sha256: "d618918d011bfc1f644c0f2a33bf84931bd53b28a98492b0a8ed6f3a818852c3"
+  uri: "https://github.com/EdVince/Stable-Diffusion-NCNN/releases/download/naifu/UNetModel-MHA-fp16.bin"
+- filename: "stablediffusion_assets/vocab.txt"
+  sha256: "e30e57b6f1e47616982ef898d8922be24e535b4fa3d0110477b3a6f02ebbae7d"
+  uri: "https://raw.githubusercontent.com/EdVince/Stable-Diffusion-NCNN/main/x86/linux/assets/vocab.txt"
+
+usage: |
+        curl http://localhost:8080/v1/images/generations \
+          -H "Content-Type: application/json" \
+          -d '{
+            "prompt": "<positive prompt>|<negative prompt>",
+            "step": 25,
+            "size": "512x512"
+          }'
--- a/aio/cpu/rerank.yaml
+++ b/aio/cpu/rerank.yaml
@@ -0,0 +1,27 @@
+name: jina-reranker-v1-base-en
+backend: rerankers
+parameters:
+  model: cross-encoder
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/cpu/speech-to-text.yaml
+++ b/aio/cpu/speech-to-text.yaml
@@ -0,0 +1,18 @@
+name: whisper-1
+backend: whisper
+parameters:
+  model: ggml-whisper-base.bin
+
+usage: |
+    ## example audio file
+    wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
+
+    ## Send the example audio file to the transcriptions endpoint
+    curl http://localhost:8080/v1/audio/transcriptions \
+         -H "Content-Type: multipart/form-data" \
+         -F file="@$PWD/gb1.ogg" -F model="whisper-1"
+
+download_files:
+- filename: "ggml-whisper-base.bin"
+  sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
+  uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
--- a/aio/cpu/text-to-speech.yaml
+++ b/aio/cpu/text-to-speech.yaml
@@ -0,0 +1,15 @@
+name: tts-1
+download_files:
+  - filename: voice-en-us-amy-low.tar.gz
+    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
+
+parameters:
+  model: en-us-amy-low.onnx
+
+usage: |
+    To test if this model works as expected, you can use the following curl command:
+
+    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+      "model":"voice-en-us-amy-low",
+      "input": "Hi, this is a test."
+    }'
--- a/aio/cpu/text-to-text.yaml
+++ b/aio/cpu/text-to-text.yaml
@@ -0,0 +1,101 @@
+name: gpt-4
+mmap: true
+parameters:
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
+
+stopwords:
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Suffix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+  chat_message: |
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
+    {{- end }}
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
+  completion: |
+    {{.Input}}
+  function: |-
+    <|im_start|>system
+    You are a function calling AI model.
+    Here are the available tools:
+    <tools>
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    </tools>
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+    <tool_call>
+    {"arguments": <args-dict>, "name": <function-name>}
+    </tool_call><|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
--- a/aio/cpu/vision.yaml
+++ b/aio/cpu/vision.yaml
@@ -0,0 +1,31 @@
+backend: llama-cpp
+context_size: 4096
+f16: true
+mmap: true
+name: gpt-4-vision-preview
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: bakllava-mmproj.gguf
+parameters:
+  model: bakllava.gguf
+
+template:
+  chat: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+    {{.Input}}
+    ASSISTANT:
+
+download_files:
+- filename: bakllava.gguf
+  uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
+- filename: bakllava-mmproj.gguf
+  uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4-vision-preview",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+
+echo "===> LocalAI All-in-One (AIO) container starting..."
+
+GPU_ACCELERATION=false
+GPU_VENDOR=""
+
+function check_intel() {
+    if lspci | grep -E 'VGA|3D' | grep -iq intel; then
+        echo "Intel GPU detected"
+        if [ -d /opt/intel ]; then
+            GPU_ACCELERATION=true
+            GPU_VENDOR=intel
+        else
+            echo "Intel GPU detected, but Intel GPU drivers are not installed. GPU acceleration will not be available."
+        fi
+    fi
+}
+
+function check_nvidia_wsl() {
+    if lspci | grep -E 'VGA|3D' | grep -iq "Microsoft Corporation Device 008e"; then
+        # We make the assumption this WSL2 cars is NVIDIA, then check for nvidia-smi
+        # Make sure the container was run with `--gpus all` as the only required parameter
+        echo "NVIDIA GPU detected via WSL2"
+        # nvidia-smi should be installed in the container
+        if nvidia-smi; then
+            GPU_ACCELERATION=true
+            GPU_VENDOR=nvidia
+        else
+            echo "NVIDIA GPU detected via WSL2, but nvidia-smi is not installed. GPU acceleration will not be available."
+        fi
+    fi
+}
+
+function check_amd() {
+    if lspci | grep -E 'VGA|3D' | grep -iq amd; then
+        echo "AMD GPU detected"
+        # Check if ROCm is installed
+        if [ -d /opt/rocm ]; then
+            GPU_ACCELERATION=true
+            GPU_VENDOR=amd
+        else
+            echo "AMD GPU detected, but ROCm is not installed. GPU acceleration will not be available."
+        fi
+    fi
+}
+
+function check_nvidia() {
+    if lspci | grep -E 'VGA|3D' | grep -iq nvidia; then
+        echo "NVIDIA GPU detected"
+        # nvidia-smi should be installed in the container
+        if nvidia-smi; then
+            GPU_ACCELERATION=true
+            GPU_VENDOR=nvidia
+        else
+            echo "NVIDIA GPU detected, but nvidia-smi is not installed. GPU acceleration will not be available."
+        fi
+    fi
+}
+
+function check_metal() {
+    if system_profiler SPDisplaysDataType | grep -iq 'Metal'; then
+        echo "Apple Metal supported GPU detected"
+        GPU_ACCELERATION=true
+        GPU_VENDOR=apple
+    fi
+}
+
+function detect_gpu() {
+    case "$(uname -s)" in
+        Linux)
+            check_nvidia
+            check_amd
+            check_intel
+            check_nvidia_wsl
+            ;;
+        Darwin)
+            check_metal
+            ;;
+    esac
+}
+
+function detect_gpu_size() {
+    # Attempting to find GPU memory size for NVIDIA GPUs
+    if [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "nvidia" ]; then
+        echo "NVIDIA GPU detected. Attempting to find memory size..."
+        # Using head -n 1 to get the total memory of the 1st NVIDIA GPU detected.
+        # If handling multiple GPUs is required in the future, this is the place to do it
+        nvidia_sm=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n 1)
+        if [ ! -z "$nvidia_sm" ]; then
+            echo "Total GPU Memory: $nvidia_sm MiB"
+            # if bigger than 8GB, use 16GB
+            #if [ "$nvidia_sm" -gt 8192 ]; then
+            #    GPU_SIZE=gpu-16g
+            #else
+            GPU_SIZE=gpu-8g
+            #fi
+        else
+            echo "Unable to determine NVIDIA GPU memory size. Falling back to CPU."
+            GPU_SIZE=gpu-8g
+        fi
+    elif [ "$GPU_ACCELERATION" = true ] && [ "$GPU_VENDOR" = "intel" ]; then
+        GPU_SIZE=intel
+    # Default to a generic GPU size until we implement GPU size detection for non NVIDIA GPUs
+    elif [ "$GPU_ACCELERATION" = true ]; then
+        echo "Non-NVIDIA GPU detected. Specific GPU memory size detection is not implemented."
+        GPU_SIZE=gpu-8g
+
+    # default to cpu if GPU_SIZE is not set
+    else
+        echo "GPU acceleration is not enabled or supported. Defaulting to CPU."
+        GPU_SIZE=cpu
+    fi
+}
+
+function check_vars() {
+    if [ -z "$MODELS" ]; then
+        echo "MODELS environment variable is not set. Please set it to a comma-separated list of model YAML files to load."
+        exit 1
+    fi
+
+    if [ -z "$PROFILE" ]; then
+        echo "PROFILE environment variable is not set. Please set it to one of the following: cpu, gpu-8g, gpu-16g, apple"
+        exit 1
+    fi
+}
+
+detect_gpu
+detect_gpu_size
+
+PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
+
+check_vars
+
+echo "===> Starting LocalAI[$PROFILE] with the following models: $MODELS"
+
+exec /build/entrypoint.sh "$@"
--- a/aio/gpu-8g/embeddings.yaml
+++ b/aio/gpu-8g/embeddings.yaml
@@ -0,0 +1,12 @@
+name: text-embedding-ada-002
+backend: sentencetransformers
+parameters:
+  model: all-MiniLM-L6-v2
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
+      "input": "Your text string goes here",
+      "model": "text-embedding-ada-002"
+    }'
--- a/aio/gpu-8g/image-gen.yaml
+++ b/aio/gpu-8g/image-gen.yaml
@@ -0,0 +1,25 @@
+name: stablediffusion
+parameters:
+  model: DreamShaper_8_pruned.safetensors
+backend: diffusers
+step: 25
+f16: true
+
+diffusers:
+  pipeline_type: StableDiffusionPipeline
+  cuda: true
+  enable_parameters: "negative_prompt,num_inference_steps"
+  scheduler_type: "k_dpmpp_2m"
+
+download_files:
+- filename: DreamShaper_8_pruned.safetensors
+  uri: huggingface://Lykon/DreamShaper/DreamShaper_8_pruned.safetensors
+
+usage: |
+        curl http://localhost:8080/v1/images/generations \
+          -H "Content-Type: application/json" \
+          -d '{
+            "prompt": "<positive prompt>|<negative prompt>",
+            "step": 25,
+            "size": "512x512"
+          }'
--- a/aio/gpu-8g/rerank.yaml
+++ b/aio/gpu-8g/rerank.yaml
@@ -0,0 +1,27 @@
+name: jina-reranker-v1-base-en
+backend: rerankers
+parameters:
+  model: cross-encoder
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/gpu-8g/speech-to-text.yaml
+++ b/aio/gpu-8g/speech-to-text.yaml
@@ -0,0 +1,18 @@
+name: whisper-1
+backend: whisper
+parameters:
+  model: ggml-whisper-base.bin
+
+usage: |
+    ## example audio file
+    wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
+
+    ## Send the example audio file to the transcriptions endpoint
+    curl http://localhost:8080/v1/audio/transcriptions \
+         -H "Content-Type: multipart/form-data" \
+         -F file="@$PWD/gb1.ogg" -F model="whisper-1"
+
+download_files:
+- filename: "ggml-whisper-base.bin"
+  sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
+  uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
--- a/aio/gpu-8g/text-to-speech.yaml
+++ b/aio/gpu-8g/text-to-speech.yaml
@@ -0,0 +1,15 @@
+name: tts-1
+download_files:
+  - filename: voice-en-us-amy-low.tar.gz
+    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
+
+parameters:
+  model: en-us-amy-low.onnx
+
+usage: |
+    To test if this model works as expected, you can use the following curl command:
+
+    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+      "model":"tts-1",
+      "input": "Hi, this is a test."
+    }'
--- a/aio/gpu-8g/text-to-text.yaml
+++ b/aio/gpu-8g/text-to-text.yaml
@@ -0,0 +1,101 @@
+name: gpt-4
+mmap: true
+parameters:
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+context_size: 8192
+
+stopwords:
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Suffix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+  chat_message: |
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
+    {{- end }}
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
+  completion: |
+    {{.Input}}
+  function: |-
+    <|im_start|>system
+    You are a function calling AI model.
+    Here are the available tools:
+    <tools>
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    </tools>
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+    <tool_call>
+    {"arguments": <args-dict>, "name": <function-name>}
+    </tool_call><|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
--- a/aio/gpu-8g/vision.yaml
+++ b/aio/gpu-8g/vision.yaml
@@ -0,0 +1,35 @@
+backend: llama-cpp
+context_size: 4096
+f16: true
+mmap: true
+name: gpt-4-vision-preview
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: llava-v1.6-7b-mmproj-f16.gguf
+parameters:
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+  seed: -1
+
+template:
+  chat: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+    {{.Input}}
+    ASSISTANT:
+
+download_files:
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4-vision-preview",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/aio/intel/embeddings.yaml
+++ b/aio/intel/embeddings.yaml
@@ -0,0 +1,12 @@
+name: text-embedding-ada-002
+backend: sentencetransformers
+parameters:
+  model: all-MiniLM-L6-v2
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
+      "input": "Your text string goes here",
+      "model": "text-embedding-ada-002"
+    }'
--- a/aio/intel/image-gen.yaml
+++ b/aio/intel/image-gen.yaml
@@ -0,0 +1,20 @@
+name: stablediffusion
+parameters:
+  model: runwayml/stable-diffusion-v1-5
+backend: diffusers
+step: 25
+f16: true
+diffusers:
+  pipeline_type: StableDiffusionPipeline
+  cuda: true
+  enable_parameters: "negative_prompt,num_inference_steps"
+  scheduler_type: "k_dpmpp_2m"
+
+usage: |
+        curl http://localhost:8080/v1/images/generations \
+          -H "Content-Type: application/json" \
+          -d '{
+            "prompt": "<positive prompt>|<negative prompt>",
+            "step": 25,
+            "size": "512x512"
+          }'
--- a/aio/intel/rerank.yaml
+++ b/aio/intel/rerank.yaml
@@ -0,0 +1,27 @@
+name: jina-reranker-v1-base-en
+backend: rerankers
+parameters:
+  model: cross-encoder
+
+usage: |
+    You can test this model with curl like this:
+
+    curl http://localhost:8080/v1/rerank \
+      -H "Content-Type: application/json" \
+      -d '{
+      "model": "jina-reranker-v1-base-en",
+      "query": "Organic skincare products for sensitive skin",
+      "documents": [
+        "Eco-friendly kitchenware for modern homes",
+        "Biodegradable cleaning supplies for eco-conscious consumers",
+        "Organic cotton baby clothes for sensitive skin",
+        "Natural organic skincare range for sensitive skin",
+        "Tech gadgets for smart homes: 2024 edition",
+        "Sustainable gardening tools and compost solutions",
+        "Sensitive skin-friendly facial cleansers and toners",
+        "Organic food wraps and storage solutions",
+        "All-natural pet food for dogs with allergies",
+        "Yoga mats made from recycled materials"
+      ],
+      "top_n": 3
+    }'
--- a/aio/intel/speech-to-text.yaml
+++ b/aio/intel/speech-to-text.yaml
@@ -0,0 +1,18 @@
+name: whisper-1
+backend: whisper
+parameters:
+  model: ggml-whisper-base.bin
+
+usage: |
+    ## example audio file
+    wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
+
+    ## Send the example audio file to the transcriptions endpoint
+    curl http://localhost:8080/v1/audio/transcriptions \
+         -H "Content-Type: multipart/form-data" \
+         -F file="@$PWD/gb1.ogg" -F model="whisper-1"
+
+download_files:
+- filename: "ggml-whisper-base.bin"
+  sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
+  uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
--- a/aio/intel/text-to-speech.yaml
+++ b/aio/intel/text-to-speech.yaml
@@ -0,0 +1,15 @@
+name: tts-1
+download_files:
+  - filename: voice-en-us-amy-low.tar.gz
+    uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
+
+parameters:
+  model: en-us-amy-low.onnx
+
+usage: |
+    To test if this model works as expected, you can use the following curl command:
+
+    curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+      "model":"tts-1",
+      "input": "Hi, this is a test."
+    }'
--- a/aio/intel/text-to-text.yaml
+++ b/aio/intel/text-to-text.yaml
@@ -0,0 +1,103 @@
+name: gpt-4
+mmap: false
+context_size: 8192
+
+f16: false
+parameters:
+  model: huggingface://NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
+
+stopwords:
+- "<|im_end|>"
+- "<dummy32000>"
+- "</tool_call>"
+- "<|eot_id|>"
+- "<|end_of_text|>"
+
+function:
+  # disable injecting the "answer" tool
+  disable_no_action: true
+
+  grammar:
+    # This allows the grammar to also return messages
+    mixed_mode: true
+    # Suffix to add to the grammar
+    #prefix: '<tool_call>\n'
+    # Force parallel calls in the grammar
+    # parallel_calls: true
+
+  return_name_in_function_response: true
+  # Without grammar uncomment the lines below
+  # Warning: this is relying only on the capability of the
+  # LLM model to generate the correct function call.
+  json_regex_match: 
+   - "(?s)<tool_call>(.*?)</tool_call>"
+   - "(?s)<tool_call>(.*?)"
+  replace_llm_results:
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+  replace_function_results: 
+  # Replace everything that is not JSON array or object
+  # 
+  - key: '(?s)^[^{\[]*'
+    value: ""
+  - key: '(?s)[^}\]]*$'
+    value: ""
+  - key: "'([^']*?)'"
+    value: "_DQUOTE_${1}_DQUOTE_"
+  - key: '\\"'
+    value: "__TEMP_QUOTE__"
+  - key: "\'"
+    value: "'"
+  - key: "_DQUOTE_"
+    value: '"'
+  - key: "__TEMP_QUOTE__"
+    value: '"'
+  # Drop the scratchpad content from responses
+  - key: "(?s)<scratchpad>.*</scratchpad>"
+    value: ""
+
+template:
+  chat: |
+    {{.Input -}}
+    <|im_start|>assistant
+  chat_message: |
+    <|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
+    {{- if .FunctionCall }}
+    <tool_call>
+    {{- else if eq .RoleName "tool" }}
+    <tool_response>
+    {{- end }}
+    {{- if .Content}}
+    {{.Content }}
+    {{- end }}
+    {{- if .FunctionCall}}
+    {{toJson .FunctionCall}}
+    {{- end }}
+    {{- if .FunctionCall }}
+    </tool_call>
+    {{- else if eq .RoleName "tool" }}
+    </tool_response>
+    {{- end }}<|im_end|>
+  completion: |
+    {{.Input}}
+  function: |-
+    <|im_start|>system
+    You are a function calling AI model.
+    Here are the available tools:
+    <tools>
+    {{range .Functions}}
+    {'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
+    {{end}}
+    </tools>
+    You should call the tools provided to you sequentially
+    Please use <scratchpad> XML tags to record your reasoning and planning before you call the functions as follows:
+    <scratchpad>
+    {step-by-step reasoning and plan in bullet points}
+    </scratchpad>
+    For each function call return a json object with function name and arguments within <tool_call> XML tags as follows:
+    <tool_call>
+    {"arguments": <args-dict>, "name": <function-name>}
+    </tool_call><|im_end|>
+    {{.Input -}}
+    <|im_start|>assistant
--- a/aio/intel/vision.yaml
+++ b/aio/intel/vision.yaml
@@ -0,0 +1,35 @@
+backend: llama-cpp
+context_size: 4096
+mmap: false
+f16: false
+name: gpt-4-vision-preview
+
+roles:
+  user: "USER:"
+  assistant: "ASSISTANT:"
+  system: "SYSTEM:"
+
+mmproj: llava-v1.6-7b-mmproj-f16.gguf
+parameters:
+  model: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  temperature: 0.2
+  top_k: 40
+  top_p: 0.95
+  seed: -1
+
+template:
+  chat: |
+    A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+    {{.Input}}
+    ASSISTANT:
+
+download_files:
+- filename: llava-v1.6-mistral-7b.Q5_K_M.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q5_K_M.gguf
+- filename: llava-v1.6-7b-mmproj-f16.gguf
+  uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
+
+usage: |
+    curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+        "model": "gpt-4-vision-preview",
+        "messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'
--- a/api/api.go
+++ b/api/api.go
@@ -1,152 +0,0 @@
-package api
-
-import (
-	"errors"
-
-	"github.com/go-skynet/LocalAI/pkg/assets"
-	"github.com/gofiber/fiber/v2"
-	"github.com/gofiber/fiber/v2/middleware/cors"
-	"github.com/gofiber/fiber/v2/middleware/logger"
-	"github.com/gofiber/fiber/v2/middleware/recover"
-	"github.com/rs/zerolog"
-	"github.com/rs/zerolog/log"
-)
-
-func App(opts ...AppOption) (*fiber.App, error) {
-	options := newOptions(opts...)
-
-	zerolog.SetGlobalLevel(zerolog.InfoLevel)
-	if options.debug {
-		zerolog.SetGlobalLevel(zerolog.DebugLevel)
-	}
-
-	// Return errors as JSON responses
-	app := fiber.New(fiber.Config{
-		BodyLimit:             options.uploadLimitMB * 1024 * 1024, // this is the default limit of 4MB
-		DisableStartupMessage: options.disableMessage,
-		// Override default error handler
-		ErrorHandler: func(ctx *fiber.Ctx, err error) error {
-			// Status code defaults to 500
-			code := fiber.StatusInternalServerError
-
-			// Retrieve the custom status code if it's a *fiber.Error
-			var e *fiber.Error
-			if errors.As(err, &e) {
-				code = e.Code
-			}
-
-			// Send custom error page
-			return ctx.Status(code).JSON(
-				ErrorResponse{
-					Error: &APIError{Message: err.Error(), Code: code},
-				},
-			)
-		},
-	})
-
-	if options.debug {
-		app.Use(logger.New(logger.Config{
-			Format: "[${ip}]:${port} ${status} - ${method} ${path}\n",
-		}))
-	}
-
-	cm := NewConfigMerger()
-	if err := cm.LoadConfigs(options.loader.ModelPath); err != nil {
-		log.Error().Msgf("error loading config files: %s", err.Error())
-	}
-
-	if options.configFile != "" {
-		if err := cm.LoadConfigFile(options.configFile); err != nil {
-			log.Error().Msgf("error loading config file: %s", err.Error())
-		}
-	}
-
-	if options.debug {
-		for _, v := range cm.ListConfigs() {
-			cfg, _ := cm.GetConfig(v)
-			log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
-		}
-	}
-
-	if options.assetsDestination != "" {
-		// Extract files from the embedded FS
-		err := assets.ExtractFiles(options.backendAssets, options.assetsDestination)
-		if err != nil {
-			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
-		}
-	}
-
-	// Default middleware config
-	app.Use(recover.New())
-
-	if options.preloadJSONModels != "" {
-		if err := ApplyGalleryFromString(options.loader.ModelPath, options.preloadJSONModels, cm); err != nil {
-			return nil, err
-		}
-	}
-
-	if options.preloadModelsFromPath != "" {
-		if err := ApplyGalleryFromFile(options.loader.ModelPath, options.preloadModelsFromPath, cm); err != nil {
-			return nil, err
-		}
-	}
-
-	if options.cors {
-		if options.corsAllowOrigins == "" {
-			app.Use(cors.New())
-		} else {
-			app.Use(cors.New(cors.Config{
-				AllowOrigins: options.corsAllowOrigins,
-			}))
-		}
-	}
-
-	// LocalAI API endpoints
-	applier := newGalleryApplier(options.loader.ModelPath)
-	applier.start(options.context, cm)
-	app.Post("/models/apply", applyModelGallery(options.loader.ModelPath, cm, applier.C))
-	app.Get("/models/jobs/:uuid", getOpStatus(applier))
-
-	// openAI compatible API endpoint
-
-	// chat
-	app.Post("/v1/chat/completions", chatEndpoint(cm, options))
-	app.Post("/chat/completions", chatEndpoint(cm, options))
-
-	// edit
-	app.Post("/v1/edits", editEndpoint(cm, options))
-	app.Post("/edits", editEndpoint(cm, options))
-
-	// completion
-	app.Post("/v1/completions", completionEndpoint(cm, options))
-	app.Post("/completions", completionEndpoint(cm, options))
-
-	// embeddings
-	app.Post("/v1/embeddings", embeddingsEndpoint(cm, options))
-	app.Post("/embeddings", embeddingsEndpoint(cm, options))
-	app.Post("/v1/engines/:model/embeddings", embeddingsEndpoint(cm, options))
-
-	// audio
-	app.Post("/v1/audio/transcriptions", transcriptEndpoint(cm, options))
-
-	// images
-	app.Post("/v1/images/generations", imageEndpoint(cm, options))
-
-	if options.imageDir != "" {
-		app.Static("/generated-images", options.imageDir)
-	}
-
-	ok := func(c *fiber.Ctx) error {
-		return c.SendStatus(200)
-	}
-
-	// Kubernetes health checks
-	app.Get("/healthz", ok)
-	app.Get("/readyz", ok)
-
-	// models
-	app.Get("/v1/models", listModels(options.loader, cm))
-	app.Get("/models", listModels(options.loader, cm))
-
-	return app, nil
-}
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -1,429 +0,0 @@
-package api_test
-
-import (
-	"bytes"
-	"context"
-	"embed"
-	"encoding/json"
-	"fmt"
-	"io/ioutil"
-	"net/http"
-	"os"
-	"path/filepath"
-	"runtime"
-
-	. "github.com/go-skynet/LocalAI/api"
-	"github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/gofiber/fiber/v2"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-	"gopkg.in/yaml.v3"
-
-	openaigo "github.com/otiai10/openaigo"
-	"github.com/sashabaranov/go-openai"
-)
-
-type modelApplyRequest struct {
-	URL       string            `json:"url"`
-	Name      string            `json:"name"`
-	Overrides map[string]string `json:"overrides"`
-}
-
-func getModelStatus(url string) (response map[string]interface{}) {
-	// Create the HTTP request
-	resp, err := http.Get(url)
-	if err != nil {
-		fmt.Println("Error creating request:", err)
-		return
-	}
-	defer resp.Body.Close()
-
-	body, err := ioutil.ReadAll(resp.Body)
-	if err != nil {
-		fmt.Println("Error reading response body:", err)
-		return
-	}
-
-	// Unmarshal the response into a map[string]interface{}
-	err = json.Unmarshal(body, &response)
-	if err != nil {
-		fmt.Println("Error unmarshaling JSON response:", err)
-		return
-	}
-	return
-}
-func postModelApplyRequest(url string, request modelApplyRequest) (response map[string]interface{}) {
-
-	//url := "http://localhost:AI/models/apply"
-
-	// Create the request payload
-
-	payload, err := json.Marshal(request)
-	if err != nil {
-		fmt.Println("Error marshaling JSON:", err)
-		return
-	}
-
-	// Create the HTTP request
-	req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload))
-	if err != nil {
-		fmt.Println("Error creating request:", err)
-		return
-	}
-	req.Header.Set("Content-Type", "application/json")
-
-	// Make the request
-	client := &http.Client{}
-	resp, err := client.Do(req)
-	if err != nil {
-		fmt.Println("Error making request:", err)
-		return
-	}
-	defer resp.Body.Close()
-
-	body, err := ioutil.ReadAll(resp.Body)
-	if err != nil {
-		fmt.Println("Error reading response body:", err)
-		return
-	}
-
-	// Unmarshal the response into a map[string]interface{}
-	err = json.Unmarshal(body, &response)
-	if err != nil {
-		fmt.Println("Error unmarshaling JSON response:", err)
-		return
-	}
-	return
-}
-
-//go:embed backend-assets/*
-var backendAssets embed.FS
-
-var _ = Describe("API test", func() {
-
-	var app *fiber.App
-	var modelLoader *model.ModelLoader
-	var client *openai.Client
-	var client2 *openaigo.Client
-	var c context.Context
-	var cancel context.CancelFunc
-	var tmpdir string
-
-	Context("API with ephemeral models", func() {
-		BeforeEach(func() {
-			var err error
-			tmpdir, err = os.MkdirTemp("", "")
-			Expect(err).ToNot(HaveOccurred())
-
-			modelLoader = model.NewModelLoader(tmpdir)
-			c, cancel = context.WithCancel(context.Background())
-
-			app, err = App(WithContext(c), WithModelLoader(modelLoader), WithBackendAssets(backendAssets), WithBackendAssetsOutput(tmpdir))
-			Expect(err).ToNot(HaveOccurred())
-			go app.Listen("127.0.0.1:9090")
-
-			defaultConfig := openai.DefaultConfig("")
-			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
-
-			client2 = openaigo.NewClient("")
-			client2.BaseURL = defaultConfig.BaseURL
-
-			// Wait for API to be ready
-			client = openai.NewClientWithConfig(defaultConfig)
-			Eventually(func() error {
-				_, err := client.ListModels(context.TODO())
-				return err
-			}, "2m").ShouldNot(HaveOccurred())
-		})
-
-		AfterEach(func() {
-			cancel()
-			app.Shutdown()
-			os.RemoveAll(tmpdir)
-		})
-
-		Context("Applying models", func() {
-			It("overrides models", func() {
-				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					URL:  "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
-					Name: "bert",
-					Overrides: map[string]string{
-						"backend": "llama",
-					},
-				})
-
-				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
-
-				uuid := response["uuid"].(string)
-
-				Eventually(func() bool {
-					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
-					fmt.Println(response)
-					return response["processed"].(bool)
-				}, "360s").Should(Equal(true))
-
-				dat, err := os.ReadFile(filepath.Join(tmpdir, "bert.yaml"))
-				Expect(err).ToNot(HaveOccurred())
-
-				content := map[string]interface{}{}
-				err = yaml.Unmarshal(dat, &content)
-				Expect(err).ToNot(HaveOccurred())
-				Expect(content["backend"]).To(Equal("llama"))
-			})
-			It("apply models without overrides", func() {
-				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					URL:       "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
-					Name:      "bert",
-					Overrides: map[string]string{},
-				})
-
-				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
-
-				uuid := response["uuid"].(string)
-
-				Eventually(func() bool {
-					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
-					fmt.Println(response)
-					return response["processed"].(bool)
-				}, "360s").Should(Equal(true))
-
-				dat, err := os.ReadFile(filepath.Join(tmpdir, "bert.yaml"))
-				Expect(err).ToNot(HaveOccurred())
-
-				content := map[string]interface{}{}
-				err = yaml.Unmarshal(dat, &content)
-				Expect(err).ToNot(HaveOccurred())
-				Expect(content["backend"]).To(Equal("bert-embeddings"))
-			})
-
-			It("runs openllama", Label("llama"), func() {
-				if runtime.GOOS != "linux" {
-					Skip("test supported only on linux")
-				}
-				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
-					Name:      "openllama_3b",
-					Overrides: map[string]string{},
-				})
-
-				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
-
-				uuid := response["uuid"].(string)
-
-				Eventually(func() bool {
-					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
-					fmt.Println(response)
-					return response["processed"].(bool)
-				}, "360s").Should(Equal(true))
-
-				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
-				Expect(err).ToNot(HaveOccurred())
-				Expect(len(resp.Choices)).To(Equal(1))
-				Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
-			})
-
-			It("runs gpt4all", Label("gpt4all"), func() {
-				if runtime.GOOS != "linux" {
-					Skip("test supported only on linux")
-				}
-
-				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					URL:       "github:go-skynet/model-gallery/gpt4all-j.yaml",
-					Name:      "gpt4all-j",
-					Overrides: map[string]string{},
-				})
-
-				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
-
-				uuid := response["uuid"].(string)
-
-				Eventually(func() bool {
-					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
-					fmt.Println(response)
-					return response["processed"].(bool)
-				}, "360s").Should(Equal(true))
-
-				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-j", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "How are you?"}}})
-				Expect(err).ToNot(HaveOccurred())
-				Expect(len(resp.Choices)).To(Equal(1))
-				Expect(resp.Choices[0].Message.Content).To(ContainSubstring("well"))
-			})
-		})
-	})
-
-	Context("API query", func() {
-		BeforeEach(func() {
-			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
-			c, cancel = context.WithCancel(context.Background())
-
-			var err error
-			app, err = App(WithContext(c), WithModelLoader(modelLoader))
-			Expect(err).ToNot(HaveOccurred())
-			go app.Listen("127.0.0.1:9090")
-
-			defaultConfig := openai.DefaultConfig("")
-			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
-
-			client2 = openaigo.NewClient("")
-			client2.BaseURL = defaultConfig.BaseURL
-
-			// Wait for API to be ready
-			client = openai.NewClientWithConfig(defaultConfig)
-			Eventually(func() error {
-				_, err := client.ListModels(context.TODO())
-				return err
-			}, "2m").ShouldNot(HaveOccurred())
-		})
-		AfterEach(func() {
-			cancel()
-			app.Shutdown()
-		})
-		It("returns the models list", func() {
-			models, err := client.ListModels(context.TODO())
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(models.Models)).To(Equal(10))
-		})
-		It("can generate completions", func() {
-			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: "abcdedfghikl"})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
-		})
-
-		It("can generate chat completions ", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "testmodel", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
-		})
-
-		It("can generate completions from model configs", func() {
-			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "gpt4all", Prompt: "abcdedfghikl"})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
-		})
-
-		It("can generate chat completions from model configs", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "gpt4all-2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
-		})
-
-		It("returns errors", func() {
-			_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
-			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: could not load model - all backends returned error: 11 errors occurred:"))
-		})
-		It("transcribes audio", func() {
-			if runtime.GOOS != "linux" {
-				Skip("test supported only on linux")
-			}
-			resp, err := client.CreateTranscription(
-				context.Background(),
-				openai.AudioRequest{
-					Model:    openai.Whisper1,
-					FilePath: filepath.Join(os.Getenv("TEST_DIR"), "audio.wav"),
-				},
-			)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(resp.Text).To(ContainSubstring("This is the Micro Machine Man presenting"))
-		})
-
-		It("calculate embeddings", func() {
-			if runtime.GOOS != "linux" {
-				Skip("test supported only on linux")
-			}
-			resp, err := client.CreateEmbeddings(
-				context.Background(),
-				openai.EmbeddingRequest{
-					Model: openai.AdaEmbeddingV2,
-					Input: []string{"sun", "cat"},
-				},
-			)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
-			Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))
-
-			sunEmbedding := resp.Data[0].Embedding
-			resp2, err := client.CreateEmbeddings(
-				context.Background(),
-				openai.EmbeddingRequest{
-					Model: openai.AdaEmbeddingV2,
-					Input: []string{"sun"},
-				},
-			)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(resp2.Data[0].Embedding).To(Equal(sunEmbedding))
-		})
-
-		Context("backends", func() {
-			It("runs rwkv", func() {
-				if runtime.GOOS != "linux" {
-					Skip("test supported only on linux")
-				}
-				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,"})
-				Expect(err).ToNot(HaveOccurred())
-				Expect(len(resp.Choices) > 0).To(BeTrue())
-				Expect(resp.Choices[0].Text).To(Equal(" five."))
-			})
-		})
-	})
-
-	Context("Config file", func() {
-		BeforeEach(func() {
-			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
-			c, cancel = context.WithCancel(context.Background())
-
-			var err error
-			app, err = App(WithContext(c), WithModelLoader(modelLoader), WithConfigFile(os.Getenv("CONFIG_FILE")))
-			Expect(err).ToNot(HaveOccurred())
-			go app.Listen("127.0.0.1:9090")
-
-			defaultConfig := openai.DefaultConfig("")
-			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
-			client2 = openaigo.NewClient("")
-			client2.BaseURL = defaultConfig.BaseURL
-			// Wait for API to be ready
-			client = openai.NewClientWithConfig(defaultConfig)
-			Eventually(func() error {
-				_, err := client.ListModels(context.TODO())
-				return err
-			}, "2m").ShouldNot(HaveOccurred())
-		})
-		AfterEach(func() {
-			cancel()
-			app.Shutdown()
-		})
-		It("can generate chat completions from config file", func() {
-			models, err := client.ListModels(context.TODO())
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(models.Models)).To(Equal(12))
-		})
-		It("can generate chat completions from config file", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list1", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
-		})
-		It("can generate chat completions from config file", func() {
-			resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: "list2", Messages: []openai.ChatCompletionMessage{openai.ChatCompletionMessage{Role: "user", Content: "abcdedfghikl"}}})
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
-		})
-		It("can generate edit completions from config file", func() {
-			request := openaigo.EditCreateRequestBody{
-				Model:       "list2",
-				Instruction: "foo",
-				Input:       "bar",
-			}
-			resp, err := client2.CreateEdit(context.Background(), request)
-			Expect(err).ToNot(HaveOccurred())
-			Expect(len(resp.Choices)).To(Equal(1))
-			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
-		})
-
-	})
-})
--- a/api/config.go
+++ b/api/config.go
@@ -1,354 +0,0 @@
-package api
-
-import (
-	"encoding/json"
-	"fmt"
-	"io/fs"
-	"os"
-	"path/filepath"
-	"strings"
-	"sync"
-
-	model "github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/gofiber/fiber/v2"
-	"github.com/rs/zerolog/log"
-	"gopkg.in/yaml.v3"
-)
-
-type Config struct {
-	OpenAIRequest  `yaml:"parameters"`
-	Name           string            `yaml:"name"`
-	StopWords      []string          `yaml:"stopwords"`
-	Cutstrings     []string          `yaml:"cutstrings"`
-	TrimSpace      []string          `yaml:"trimspace"`
-	ContextSize    int               `yaml:"context_size"`
-	F16            bool              `yaml:"f16"`
-	Threads        int               `yaml:"threads"`
-	Debug          bool              `yaml:"debug"`
-	Roles          map[string]string `yaml:"roles"`
-	Embeddings     bool              `yaml:"embeddings"`
-	Backend        string            `yaml:"backend"`
-	TemplateConfig TemplateConfig    `yaml:"template"`
-	MirostatETA    float64           `yaml:"mirostat_eta"`
-	MirostatTAU    float64           `yaml:"mirostat_tau"`
-	Mirostat       int               `yaml:"mirostat"`
-	NGPULayers     int               `yaml:"gpu_layers"`
-	MMap           bool              `yaml:"mmap"`
-	MMlock         bool              `yaml:"mmlock"`
-
-	TensorSplit           string `yaml:"tensor_split"`
-	MainGPU               string `yaml:"main_gpu"`
-	ImageGenerationAssets string `yaml:"asset_dir"`
-
-	PromptCachePath string `yaml:"prompt_cache_path"`
-	PromptCacheAll  bool   `yaml:"prompt_cache_all"`
-	PromptCacheRO   bool   `yaml:"prompt_cache_ro"`
-
-	PromptStrings, InputStrings []string
-	InputToken                  [][]int
-}
-
-type TemplateConfig struct {
-	Completion string `yaml:"completion"`
-	Chat       string `yaml:"chat"`
-	Edit       string `yaml:"edit"`
-}
-
-type ConfigMerger struct {
-	configs map[string]Config
-	sync.Mutex
-}
-
-func defaultConfig(modelFile string) *Config {
-	return &Config{
-		OpenAIRequest: defaultRequest(modelFile),
-	}
-}
-
-func NewConfigMerger() *ConfigMerger {
-	return &ConfigMerger{
-		configs: make(map[string]Config),
-	}
-}
-func ReadConfigFile(file string) ([]*Config, error) {
-	c := &[]*Config{}
-	f, err := os.ReadFile(file)
-	if err != nil {
-		return nil, fmt.Errorf("cannot read config file: %w", err)
-	}
-	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
-	}
-
-	return *c, nil
-}
-
-func ReadConfig(file string) (*Config, error) {
-	c := &Config{}
-	f, err := os.ReadFile(file)
-	if err != nil {
-		return nil, fmt.Errorf("cannot read config file: %w", err)
-	}
-	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
-	}
-
-	return c, nil
-}
-
-func (cm ConfigMerger) LoadConfigFile(file string) error {
-	cm.Lock()
-	defer cm.Unlock()
-	c, err := ReadConfigFile(file)
-	if err != nil {
-		return fmt.Errorf("cannot load config file: %w", err)
-	}
-
-	for _, cc := range c {
-		cm.configs[cc.Name] = *cc
-	}
-	return nil
-}
-
-func (cm ConfigMerger) LoadConfig(file string) error {
-	cm.Lock()
-	defer cm.Unlock()
-	c, err := ReadConfig(file)
-	if err != nil {
-		return fmt.Errorf("cannot read config file: %w", err)
-	}
-
-	cm.configs[c.Name] = *c
-	return nil
-}
-
-func (cm ConfigMerger) GetConfig(m string) (Config, bool) {
-	cm.Lock()
-	defer cm.Unlock()
-	v, exists := cm.configs[m]
-	return v, exists
-}
-
-func (cm ConfigMerger) ListConfigs() []string {
-	cm.Lock()
-	defer cm.Unlock()
-	var res []string
-	for k := range cm.configs {
-		res = append(res, k)
-	}
-	return res
-}
-
-func (cm ConfigMerger) LoadConfigs(path string) error {
-	cm.Lock()
-	defer cm.Unlock()
-	entries, err := os.ReadDir(path)
-	if err != nil {
-		return err
-	}
-	files := make([]fs.FileInfo, 0, len(entries))
-	for _, entry := range entries {
-		info, err := entry.Info()
-		if err != nil {
-			return err
-		}
-		files = append(files, info)
-	}
-	for _, file := range files {
-		// Skip templates, YAML and .keep files
-		if !strings.Contains(file.Name(), ".yaml") {
-			continue
-		}
-		c, err := ReadConfig(filepath.Join(path, file.Name()))
-		if err == nil {
-			cm.configs[c.Name] = *c
-		}
-	}
-
-	return nil
-}
-
-func updateConfig(config *Config, input *OpenAIRequest) {
-	if input.Echo {
-		config.Echo = input.Echo
-	}
-	if input.TopK != 0 {
-		config.TopK = input.TopK
-	}
-	if input.TopP != 0 {
-		config.TopP = input.TopP
-	}
-
-	if input.Temperature != 0 {
-		config.Temperature = input.Temperature
-	}
-
-	if input.Maxtokens != 0 {
-		config.Maxtokens = input.Maxtokens
-	}
-
-	switch stop := input.Stop.(type) {
-	case string:
-		if stop != "" {
-			config.StopWords = append(config.StopWords, stop)
-		}
-	case []interface{}:
-		for _, pp := range stop {
-			if s, ok := pp.(string); ok {
-				config.StopWords = append(config.StopWords, s)
-			}
-		}
-	}
-
-	if input.RepeatPenalty != 0 {
-		config.RepeatPenalty = input.RepeatPenalty
-	}
-
-	if input.Keep != 0 {
-		config.Keep = input.Keep
-	}
-
-	if input.Batch != 0 {
-		config.Batch = input.Batch
-	}
-
-	if input.F16 {
-		config.F16 = input.F16
-	}
-
-	if input.IgnoreEOS {
-		config.IgnoreEOS = input.IgnoreEOS
-	}
-
-	if input.Seed != 0 {
-		config.Seed = input.Seed
-	}
-
-	if input.Mirostat != 0 {
-		config.Mirostat = input.Mirostat
-	}
-
-	if input.MirostatETA != 0 {
-		config.MirostatETA = input.MirostatETA
-	}
-
-	if input.MirostatTAU != 0 {
-		config.MirostatTAU = input.MirostatTAU
-	}
-
-	if input.TypicalP != 0 {
-		config.TypicalP = input.TypicalP
-	}
-
-	switch inputs := input.Input.(type) {
-	case string:
-		if inputs != "" {
-			config.InputStrings = append(config.InputStrings, inputs)
-		}
-	case []interface{}:
-		for _, pp := range inputs {
-			switch i := pp.(type) {
-			case string:
-				config.InputStrings = append(config.InputStrings, i)
-			case []interface{}:
-				tokens := []int{}
-				for _, ii := range i {
-					tokens = append(tokens, int(ii.(float64)))
-				}
-				config.InputToken = append(config.InputToken, tokens)
-			}
-		}
-	}
-
-	switch p := input.Prompt.(type) {
-	case string:
-		config.PromptStrings = append(config.PromptStrings, p)
-	case []interface{}:
-		for _, pp := range p {
-			if s, ok := pp.(string); ok {
-				config.PromptStrings = append(config.PromptStrings, s)
-			}
-		}
-	}
-}
-func readInput(c *fiber.Ctx, loader *model.ModelLoader, randomModel bool) (string, *OpenAIRequest, error) {
-	input := new(OpenAIRequest)
-	// Get input data from the request body
-	if err := c.BodyParser(input); err != nil {
-		return "", nil, err
-	}
-
-	modelFile := input.Model
-
-	if c.Params("model") != "" {
-		modelFile = c.Params("model")
-	}
-
-	received, _ := json.Marshal(input)
-
-	log.Debug().Msgf("Request received: %s", string(received))
-
-	// Set model from bearer token, if available
-	bearer := strings.TrimLeft(c.Get("authorization"), "Bearer ")
-	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
-
-	// If no model was specified, take the first available
-	if modelFile == "" && !bearerExists && randomModel {
-		models, _ := loader.ListModels()
-		if len(models) > 0 {
-			modelFile = models[0]
-			log.Debug().Msgf("No model specified, using: %s", modelFile)
-		} else {
-			log.Debug().Msgf("No model specified, returning error")
-			return "", nil, fmt.Errorf("no model specified")
-		}
-	}
-
-	// If a model is found in bearer token takes precedence
-	if bearerExists {
-		log.Debug().Msgf("Using model from bearer token: %s", bearer)
-		modelFile = bearer
-	}
-	return modelFile, input, nil
-}
-
-func readConfig(modelFile string, input *OpenAIRequest, cm *ConfigMerger, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*Config, *OpenAIRequest, error) {
-	// Load a config file if present after the model name
-	modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")
-	if _, err := os.Stat(modelConfig); err == nil {
-		if err := cm.LoadConfig(modelConfig); err != nil {
-			return nil, nil, fmt.Errorf("failed loading model config (%s) %s", modelConfig, err.Error())
-		}
-	}
-
-	var config *Config
-	cfg, exists := cm.GetConfig(modelFile)
-	if !exists {
-		config = defaultConfig(modelFile)
-		config.ContextSize = ctx
-		config.Threads = threads
-		config.F16 = f16
-		config.Debug = debug
-	} else {
-		config = &cfg
-	}
-
-	// Set the parameters for the language model prediction
-	updateConfig(config, input)
-
-	// Don't allow 0 as setting
-	if config.Threads == 0 {
-		if threads != 0 {
-			config.Threads = threads
-		} else {
-			config.Threads = 4
-		}
-	}
-
-	// Enforce debug flag if passed from CLI
-	if debug {
-		config.Debug = true
-	}
-
-	return config, input, nil
-}
--- a/api/config_test.go
+++ b/api/config_test.go
@@ -1,54 +0,0 @@
-package api
-
-import (
-	"os"
-
-	"github.com/go-skynet/LocalAI/pkg/model"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("Test cases for config related functions", func() {
-
-	var (
-		configFile string
-	)
-
-	Context("Test Read configuration functions", func() {
-		configFile = os.Getenv("CONFIG_FILE")
-		It("Test ReadConfigFile", func() {
-			config, err := ReadConfigFile(configFile)
-			Expect(err).To(BeNil())
-			Expect(config).ToNot(BeNil())
-			// two configs in config.yaml
-			Expect(config[0].Name).To(Equal("list1"))
-			Expect(config[1].Name).To(Equal("list2"))
-		})
-
-		It("Test LoadConfigs", func() {
-			cm := NewConfigMerger()
-			options := newOptions()
-			modelLoader := model.NewModelLoader(os.Getenv("MODELS_PATH"))
-			WithModelLoader(modelLoader)(options)
-
-			err := cm.LoadConfigs(options.loader.ModelPath)
-			Expect(err).To(BeNil())
-			Expect(cm.configs).ToNot(BeNil())
-
-			// config should includes gpt4all models's api.config
-			Expect(cm.configs).To(HaveKey("gpt4all"))
-
-			// config should includes gpt2 models's api.config
-			Expect(cm.configs).To(HaveKey("gpt4all-2"))
-
-			// config should includes text-embedding-ada-002 models's api.config
-			Expect(cm.configs).To(HaveKey("text-embedding-ada-002"))
-
-			// config should includes rwkv_test models's api.config
-			Expect(cm.configs).To(HaveKey("rwkv_test"))
-
-			// config should includes whisper-1 models's api.config
-			Expect(cm.configs).To(HaveKey("whisper-1"))
-		})
-	})
-})
--- a/api/gallery.go
+++ b/api/gallery.go
@@ -1,267 +0,0 @@
-package api
-
-import (
-	"context"
-	"encoding/json"
-	"fmt"
-	"io/ioutil"
-	"net/http"
-	"net/url"
-	"os"
-	"strings"
-	"sync"
-	"time"
-
-	"github.com/go-skynet/LocalAI/pkg/gallery"
-	"github.com/gofiber/fiber/v2"
-	"github.com/google/uuid"
-	"github.com/rs/zerolog/log"
-	"gopkg.in/yaml.v3"
-)
-
-type galleryOp struct {
-	req ApplyGalleryModelRequest
-	id  string
-}
-
-type galleryOpStatus struct {
-	Error              error   `json:"error"`
-	Processed          bool    `json:"processed"`
-	Message            string  `json:"message"`
-	Progress           float64 `json:"progress"`
-	TotalFileSize      string  `json:"file_size"`
-	DownloadedFileSize string  `json:"downloaded_size"`
-}
-
-type galleryApplier struct {
-	modelPath string
-	sync.Mutex
-	C        chan galleryOp
-	statuses map[string]*galleryOpStatus
-}
-
-func newGalleryApplier(modelPath string) *galleryApplier {
-	return &galleryApplier{
-		modelPath: modelPath,
-		C:         make(chan galleryOp),
-		statuses:  make(map[string]*galleryOpStatus),
-	}
-}
-
-func applyGallery(modelPath string, req ApplyGalleryModelRequest, cm *ConfigMerger, downloadStatus func(string, string, string, float64)) error {
-	url, err := req.DecodeURL()
-	if err != nil {
-		return err
-	}
-
-	// Send a GET request to the URL
-	response, err := http.Get(url)
-	if err != nil {
-		return err
-	}
-	defer response.Body.Close()
-
-	// Read the response body
-	body, err := ioutil.ReadAll(response.Body)
-	if err != nil {
-		return err
-	}
-
-	// Unmarshal YAML data into a Config struct
-	var config gallery.Config
-	err = yaml.Unmarshal(body, &config)
-	if err != nil {
-		return err
-	}
-
-	config.Files = append(config.Files, req.AdditionalFiles...)
-
-	if err := gallery.Apply(modelPath, req.Name, &config, req.Overrides, downloadStatus); err != nil {
-		return err
-	}
-
-	// Reload models
-	return cm.LoadConfigs(modelPath)
-}
-
-func (g *galleryApplier) updatestatus(s string, op *galleryOpStatus) {
-	g.Lock()
-	defer g.Unlock()
-	g.statuses[s] = op
-}
-
-func (g *galleryApplier) getstatus(s string) *galleryOpStatus {
-	g.Lock()
-	defer g.Unlock()
-
-	return g.statuses[s]
-}
-
-func (g *galleryApplier) start(c context.Context, cm *ConfigMerger) {
-	go func() {
-		for {
-			select {
-			case <-c.Done():
-				return
-			case op := <-g.C:
-				g.updatestatus(op.id, &galleryOpStatus{Message: "processing", Progress: 0})
-
-				updateError := func(e error) {
-					g.updatestatus(op.id, &galleryOpStatus{Error: e, Processed: true})
-				}
-
-				if err := applyGallery(g.modelPath, op.req, cm, func(fileName string, current string, total string, percentage float64) {
-					g.updatestatus(op.id, &galleryOpStatus{Message: "processing", Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
-					displayDownload(fileName, current, total, percentage)
-				}); err != nil {
-					updateError(err)
-					continue
-				}
-
-				g.updatestatus(op.id, &galleryOpStatus{Processed: true, Message: "completed", Progress: 100})
-			}
-		}
-	}()
-}
-
-var lastProgress time.Time = time.Now()
-var startTime time.Time = time.Now()
-
-func displayDownload(fileName string, current string, total string, percentage float64) {
-	currentTime := time.Now()
-
-	if currentTime.Sub(lastProgress) >= 5*time.Second {
-
-		lastProgress = currentTime
-
-		// calculate ETA based on percentage and elapsed time
-		var eta time.Duration
-		if percentage > 0 {
-			elapsed := currentTime.Sub(startTime)
-			eta = time.Duration(float64(elapsed)*(100/percentage) - float64(elapsed))
-		}
-
-		if total != "" {
-			log.Debug().Msgf("Downloading %s: %s/%s (%.2f%%) ETA: %s", fileName, current, total, percentage, eta)
-		} else {
-			log.Debug().Msgf("Downloading: %s", current)
-		}
-	}
-}
-
-func ApplyGalleryFromFile(modelPath, s string, cm *ConfigMerger) error {
-	dat, err := os.ReadFile(s)
-	if err != nil {
-		return err
-	}
-	var requests []ApplyGalleryModelRequest
-	err = json.Unmarshal(dat, &requests)
-	if err != nil {
-		return err
-	}
-
-	for _, r := range requests {
-		if err := applyGallery(modelPath, r, cm, displayDownload); err != nil {
-			return err
-		}
-	}
-
-	return nil
-}
-
-func ApplyGalleryFromString(modelPath, s string, cm *ConfigMerger) error {
-	var requests []ApplyGalleryModelRequest
-	err := json.Unmarshal([]byte(s), &requests)
-	if err != nil {
-		return err
-	}
-
-	for _, r := range requests {
-		if err := applyGallery(modelPath, r, cm, displayDownload); err != nil {
-			return err
-		}
-	}
-
-	return nil
-}
-
-// endpoints
-
-type ApplyGalleryModelRequest struct {
-	URL             string                 `json:"url"`
-	Name            string                 `json:"name"`
-	Overrides       map[string]interface{} `json:"overrides"`
-	AdditionalFiles []gallery.File         `json:"files"`
-}
-
-const (
-	githubURI = "github:"
-)
-
-func (request ApplyGalleryModelRequest) DecodeURL() (string, error) {
-	input := request.URL
-	var rawURL string
-
-	if strings.HasPrefix(input, githubURI) {
-		parts := strings.Split(input, ":")
-		repoParts := strings.Split(parts[1], "@")
-		branch := "main"
-
-		if len(repoParts) > 1 {
-			branch = repoParts[1]
-		}
-
-		repoPath := strings.Split(repoParts[0], "/")
-		org := repoPath[0]
-		project := repoPath[1]
-		projectPath := strings.Join(repoPath[2:], "/")
-
-		rawURL = fmt.Sprintf("https://raw.githubusercontent.com/%s/%s/%s/%s", org, project, branch, projectPath)
-	} else if strings.HasPrefix(input, "http://") || strings.HasPrefix(input, "https://") {
-		// Handle regular URLs
-		u, err := url.Parse(input)
-		if err != nil {
-			return "", fmt.Errorf("invalid URL: %w", err)
-		}
-		rawURL = u.String()
-	} else {
-		return "", fmt.Errorf("invalid URL format")
-	}
-
-	return rawURL, nil
-}
-
-func getOpStatus(g *galleryApplier) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-
-		status := g.getstatus(c.Params("uuid"))
-		if status == nil {
-			return fmt.Errorf("could not find any status for ID")
-		}
-
-		return c.JSON(status)
-	}
-}
-
-func applyModelGallery(modelPath string, cm *ConfigMerger, g chan galleryOp) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		input := new(ApplyGalleryModelRequest)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-
-		uuid, err := uuid.NewUUID()
-		if err != nil {
-			return err
-		}
-		g <- galleryOp{
-			req: *input,
-			id:  uuid.String(),
-		}
-		return c.JSON(struct {
-			ID        string `json:"uuid"`
-			StatusURL string `json:"status"`
-		}{ID: uuid.String(), StatusURL: c.BaseURL() + "/models/jobs/" + uuid.String()})
-	}
-}
--- a/api/gallery_test.go
+++ b/api/gallery_test.go
@@ -1,30 +0,0 @@
-package api_test
-
-import (
-	. "github.com/go-skynet/LocalAI/api"
-	. "github.com/onsi/ginkgo/v2"
-	. "github.com/onsi/gomega"
-)
-
-var _ = Describe("Gallery API tests", func() {
-	Context("requests", func() {
-		It("parses github with a branch", func() {
-			req := ApplyGalleryModelRequest{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main"}
-			str, err := req.DecodeURL()
-			Expect(err).ToNot(HaveOccurred())
-			Expect(str).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"))
-		})
-		It("parses github without a branch", func() {
-			req := ApplyGalleryModelRequest{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml"}
-			str, err := req.DecodeURL()
-			Expect(err).ToNot(HaveOccurred())
-			Expect(str).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"))
-		})
-		It("parses URLS", func() {
-			req := ApplyGalleryModelRequest{URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"}
-			str, err := req.DecodeURL()
-			Expect(err).ToNot(HaveOccurred())
-			Expect(str).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/gpt4all-j.yaml"))
-		})
-	})
-})
--- a/api/openai.go
+++ b/api/openai.go
@@ -1,755 +0,0 @@
-package api
-
-import (
-	"bufio"
-	"bytes"
-	"encoding/base64"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"net/http"
-	"os"
-	"path"
-	"path/filepath"
-	"strconv"
-	"strings"
-
-	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
-	model "github.com/go-skynet/LocalAI/pkg/model"
-	whisperutil "github.com/go-skynet/LocalAI/pkg/whisper"
-	llama "github.com/go-skynet/go-llama.cpp"
-	"github.com/gofiber/fiber/v2"
-	"github.com/rs/zerolog/log"
-	"github.com/valyala/fasthttp"
-)
-
-// APIError provides error information returned by the OpenAI API.
-type APIError struct {
-	Code    any     `json:"code,omitempty"`
-	Message string  `json:"message"`
-	Param   *string `json:"param,omitempty"`
-	Type    string  `json:"type"`
-}
-
-type ErrorResponse struct {
-	Error *APIError `json:"error,omitempty"`
-}
-
-type OpenAIUsage struct {
-	PromptTokens     int `json:"prompt_tokens"`
-	CompletionTokens int `json:"completion_tokens"`
-	TotalTokens      int `json:"total_tokens"`
-}
-
-type Item struct {
-	Embedding []float32 `json:"embedding"`
-	Index     int       `json:"index"`
-	Object    string    `json:"object,omitempty"`
-
-	// Images
-	URL     string `json:"url,omitempty"`
-	B64JSON string `json:"b64_json,omitempty"`
-}
-
-type OpenAIResponse struct {
-	Created int      `json:"created,omitempty"`
-	Object  string   `json:"object,omitempty"`
-	ID      string   `json:"id,omitempty"`
-	Model   string   `json:"model,omitempty"`
-	Choices []Choice `json:"choices,omitempty"`
-	Data    []Item   `json:"data,omitempty"`
-
-	Usage OpenAIUsage `json:"usage"`
-}
-
-type Choice struct {
-	Index        int      `json:"index,omitempty"`
-	FinishReason string   `json:"finish_reason,omitempty"`
-	Message      *Message `json:"message,omitempty"`
-	Delta        *Message `json:"delta,omitempty"`
-	Text         string   `json:"text,omitempty"`
-}
-
-type Message struct {
-	Role    string `json:"role,omitempty" yaml:"role"`
-	Content string `json:"content,omitempty" yaml:"content"`
-}
-
-type OpenAIModel struct {
-	ID     string `json:"id"`
-	Object string `json:"object"`
-}
-
-type OpenAIRequest struct {
-	Model string `json:"model" yaml:"model"`
-
-	// whisper
-	File     string `json:"file" validate:"required"`
-	Language string `json:"language"`
-	//whisper/image
-	ResponseFormat string `json:"response_format"`
-	// image
-	Size string `json:"size"`
-	// Prompt is read only by completion/image API calls
-	Prompt interface{} `json:"prompt" yaml:"prompt"`
-
-	// Edit endpoint
-	Instruction string      `json:"instruction" yaml:"instruction"`
-	Input       interface{} `json:"input" yaml:"input"`
-
-	Stop interface{} `json:"stop" yaml:"stop"`
-
-	// Messages is read only by chat/completion API calls
-	Messages []Message `json:"messages" yaml:"messages"`
-
-	Stream bool `json:"stream"`
-	Echo   bool `json:"echo"`
-	// Common options between all the API calls
-	TopP        float64 `json:"top_p" yaml:"top_p"`
-	TopK        int     `json:"top_k" yaml:"top_k"`
-	Temperature float64 `json:"temperature" yaml:"temperature"`
-	Maxtokens   int     `json:"max_tokens" yaml:"max_tokens"`
-
-	N int `json:"n"`
-
-	// Custom parameters - not present in the OpenAI API
-	Batch         int     `json:"batch" yaml:"batch"`
-	F16           bool    `json:"f16" yaml:"f16"`
-	IgnoreEOS     bool    `json:"ignore_eos" yaml:"ignore_eos"`
-	RepeatPenalty float64 `json:"repeat_penalty" yaml:"repeat_penalty"`
-	Keep          int     `json:"n_keep" yaml:"n_keep"`
-
-	MirostatETA float64 `json:"mirostat_eta" yaml:"mirostat_eta"`
-	MirostatTAU float64 `json:"mirostat_tau" yaml:"mirostat_tau"`
-	Mirostat    int     `json:"mirostat" yaml:"mirostat"`
-
-	FrequencyPenalty float64 `json:"frequency_penalty" yaml:"frequency_penalty"`
-	TFZ              float64 `json:"tfz" yaml:"tfz"`
-
-	Seed int `json:"seed" yaml:"seed"`
-
-	// Image (not supported by OpenAI)
-	Mode int `json:"mode"`
-	Step int `json:"step"`
-
-	TypicalP float64 `json:"typical_p" yaml:"typical_p"`
-}
-
-func defaultRequest(modelFile string) OpenAIRequest {
-	return OpenAIRequest{
-		TopP:        0.7,
-		TopK:        80,
-		Maxtokens:   512,
-		Temperature: 0.9,
-		Model:       modelFile,
-	}
-}
-
-// https://platform.openai.com/docs/api-reference/completions
-func completionEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
-	process := func(s string, req *OpenAIRequest, config *Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
-		ComputeChoices(s, req, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
-			resp := OpenAIResponse{
-				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []Choice{{Text: s}},
-				Object:  "text_completion",
-			}
-			log.Debug().Msgf("Sending goroutine: %s", s)
-
-			responses <- resp
-			return true
-		})
-		close(responses)
-	}
-
-	return func(c *fiber.Ctx) error {
-		model, input, err := readInput(c, o.loader, true)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		log.Debug().Msgf("`input`: %+v", input)
-
-		config, input, err := readConfig(model, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		log.Debug().Msgf("Parameter Config: %+v", config)
-
-		if input.Stream {
-			log.Debug().Msgf("Stream request received")
-			c.Context().SetContentType("text/event-stream")
-			//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
-			//c.Set("Content-Type", "text/event-stream")
-			c.Set("Cache-Control", "no-cache")
-			c.Set("Connection", "keep-alive")
-			c.Set("Transfer-Encoding", "chunked")
-		}
-
-		templateFile := config.Model
-
-		if config.TemplateConfig.Completion != "" {
-			templateFile = config.TemplateConfig.Completion
-		}
-
-		if input.Stream {
-			if len(config.PromptStrings) > 1 {
-				return errors.New("cannot handle more than 1 `PromptStrings` when `Stream`ing")
-			}
-
-			predInput := config.PromptStrings[0]
-
-			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-			templatedInput, err := o.loader.TemplatePrefix(templateFile, struct {
-				Input string
-			}{Input: predInput})
-			if err == nil {
-				predInput = templatedInput
-				log.Debug().Msgf("Template found, input modified to: %s", predInput)
-			}
-
-			responses := make(chan OpenAIResponse)
-
-			go process(predInput, input, config, o.loader, responses)
-
-			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
-
-				for ev := range responses {
-					var buf bytes.Buffer
-					enc := json.NewEncoder(&buf)
-					enc.Encode(ev)
-
-					log.Debug().Msgf("Sending chunk: %s", buf.String())
-					fmt.Fprintf(w, "data: %v\n", buf.String())
-					w.Flush()
-				}
-
-				resp := &OpenAIResponse{
-					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []Choice{{FinishReason: "stop"}},
-				}
-				respData, _ := json.Marshal(resp)
-
-				w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
-				w.WriteString("data: [DONE]\n\n")
-				w.Flush()
-			}))
-			return nil
-		}
-
-		var result []Choice
-		for _, i := range config.PromptStrings {
-			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-			templatedInput, err := o.loader.TemplatePrefix(templateFile, struct {
-				Input string
-			}{Input: i})
-			if err == nil {
-				i = templatedInput
-				log.Debug().Msgf("Template found, input modified to: %s", i)
-			}
-
-			r, err := ComputeChoices(i, input, config, o, o.loader, func(s string, c *[]Choice) {
-				*c = append(*c, Choice{Text: s})
-			}, nil)
-			if err != nil {
-				return err
-			}
-
-			result = append(result, r...)
-		}
-
-		resp := &OpenAIResponse{
-			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: result,
-			Object:  "text_completion",
-		}
-
-		jsonResult, _ := json.Marshal(resp)
-		log.Debug().Msgf("Response: %s", jsonResult)
-
-		// Return the prediction in the response body
-		return c.JSON(resp)
-	}
-}
-
-// https://platform.openai.com/docs/api-reference/embeddings
-func embeddingsEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		model, input, err := readInput(c, o.loader, true)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		config, input, err := readConfig(model, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		log.Debug().Msgf("Parameter Config: %+v", config)
-		items := []Item{}
-
-		for i, s := range config.InputToken {
-			// get the model function to call for the result
-			embedFn, err := ModelEmbedding("", s, o.loader, *config, o)
-			if err != nil {
-				return err
-			}
-
-			embeddings, err := embedFn()
-			if err != nil {
-				return err
-			}
-			items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
-		}
-
-		for i, s := range config.InputStrings {
-			// get the model function to call for the result
-			embedFn, err := ModelEmbedding(s, []int{}, o.loader, *config, o)
-			if err != nil {
-				return err
-			}
-
-			embeddings, err := embedFn()
-			if err != nil {
-				return err
-			}
-			items = append(items, Item{Embedding: embeddings, Index: i, Object: "embedding"})
-		}
-
-		resp := &OpenAIResponse{
-			Model:  input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Data:   items,
-			Object: "list",
-		}
-
-		jsonResult, _ := json.Marshal(resp)
-		log.Debug().Msgf("Response: %s", jsonResult)
-
-		// Return the prediction in the response body
-		return c.JSON(resp)
-	}
-}
-
-func chatEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
-
-	process := func(s string, req *OpenAIRequest, config *Config, loader *model.ModelLoader, responses chan OpenAIResponse) {
-		initialMessage := OpenAIResponse{
-			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: []Choice{{Delta: &Message{Role: "assistant"}}},
-			Object:  "chat.completion.chunk",
-		}
-		responses <- initialMessage
-
-		ComputeChoices(s, req, config, o, loader, func(s string, c *[]Choice) {}, func(s string) bool {
-			resp := OpenAIResponse{
-				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
-				Choices: []Choice{{Delta: &Message{Content: s}}},
-				Object:  "chat.completion.chunk",
-			}
-			log.Debug().Msgf("Sending goroutine: %s", s)
-
-			responses <- resp
-			return true
-		})
-		close(responses)
-	}
-	return func(c *fiber.Ctx) error {
-		model, input, err := readInput(c, o.loader, true)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		config, input, err := readConfig(model, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		log.Debug().Msgf("Parameter Config: %+v", config)
-
-		var predInput string
-
-		mess := []string{}
-		for _, i := range input.Messages {
-			var content string
-			r := config.Roles[i.Role]
-			if r != "" {
-				content = fmt.Sprint(r, " ", i.Content)
-			} else {
-				content = i.Content
-			}
-
-			mess = append(mess, content)
-		}
-
-		predInput = strings.Join(mess, "\n")
-
-		if input.Stream {
-			log.Debug().Msgf("Stream request received")
-			c.Context().SetContentType("text/event-stream")
-			//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
-			//	c.Set("Content-Type", "text/event-stream")
-			c.Set("Cache-Control", "no-cache")
-			c.Set("Connection", "keep-alive")
-			c.Set("Transfer-Encoding", "chunked")
-		}
-
-		templateFile := config.Model
-
-		if config.TemplateConfig.Chat != "" {
-			templateFile = config.TemplateConfig.Chat
-		}
-
-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		templatedInput, err := o.loader.TemplatePrefix(templateFile, struct {
-			Input string
-		}{Input: predInput})
-		if err == nil {
-			predInput = templatedInput
-			log.Debug().Msgf("Template found, input modified to: %s", predInput)
-		}
-
-		if input.Stream {
-			responses := make(chan OpenAIResponse)
-
-			go process(predInput, input, config, o.loader, responses)
-
-			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
-
-				for ev := range responses {
-					var buf bytes.Buffer
-					enc := json.NewEncoder(&buf)
-					enc.Encode(ev)
-
-					log.Debug().Msgf("Sending chunk: %s", buf.String())
-					fmt.Fprintf(w, "data: %v\n", buf.String())
-					w.Flush()
-				}
-
-				resp := &OpenAIResponse{
-					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-					Choices: []Choice{{FinishReason: "stop"}},
-				}
-				respData, _ := json.Marshal(resp)
-
-				w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
-				w.WriteString("data: [DONE]\n\n")
-				w.Flush()
-			}))
-			return nil
-		}
-
-		result, err := ComputeChoices(predInput, input, config, o, o.loader, func(s string, c *[]Choice) {
-			*c = append(*c, Choice{Message: &Message{Role: "assistant", Content: s}})
-		}, nil)
-		if err != nil {
-			return err
-		}
-
-		resp := &OpenAIResponse{
-			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: result,
-			Object:  "chat.completion",
-		}
-		respData, _ := json.Marshal(resp)
-		log.Debug().Msgf("Response: %s", respData)
-
-		// Return the prediction in the response body
-		return c.JSON(resp)
-	}
-}
-
-func editEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		model, input, err := readInput(c, o.loader, true)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		config, input, err := readConfig(model, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		log.Debug().Msgf("Parameter Config: %+v", config)
-
-		templateFile := config.Model
-
-		if config.TemplateConfig.Edit != "" {
-			templateFile = config.TemplateConfig.Edit
-		}
-
-		var result []Choice
-		for _, i := range config.InputStrings {
-			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-			templatedInput, err := o.loader.TemplatePrefix(templateFile, struct {
-				Input       string
-				Instruction string
-			}{Input: i})
-			if err == nil {
-				i = templatedInput
-				log.Debug().Msgf("Template found, input modified to: %s", i)
-			}
-
-			r, err := ComputeChoices(i, input, config, o, o.loader, func(s string, c *[]Choice) {
-				*c = append(*c, Choice{Text: s})
-			}, nil)
-			if err != nil {
-				return err
-			}
-
-			result = append(result, r...)
-		}
-
-		resp := &OpenAIResponse{
-			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: result,
-			Object:  "edit",
-		}
-
-		jsonResult, _ := json.Marshal(resp)
-		log.Debug().Msgf("Response: %s", jsonResult)
-
-		// Return the prediction in the response body
-		return c.JSON(resp)
-	}
-}
-
-// https://platform.openai.com/docs/api-reference/images/create
-
-/*
-*
-
-	curl http://localhost:8080/v1/images/generations \
-	  -H "Content-Type: application/json" \
-	  -d '{
-	    "prompt": "A cute baby sea otter",
-	    "n": 1,
-	    "size": "512x512"
-	  }'
-
-*
-*/
-func imageEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		m, input, err := readInput(c, o.loader, false)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		if m == "" {
-			m = model.StableDiffusionBackend
-		}
-		log.Debug().Msgf("Loading model: %+v", m)
-
-		config, input, err := readConfig(m, input, cm, o.loader, o.debug, 0, 0, false)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		log.Debug().Msgf("Parameter Config: %+v", config)
-
-		// XXX: Only stablediffusion is supported for now
-		if config.Backend == "" {
-			config.Backend = model.StableDiffusionBackend
-		}
-
-		sizeParts := strings.Split(input.Size, "x")
-		if len(sizeParts) != 2 {
-			return fmt.Errorf("Invalid value for 'size'")
-		}
-		width, err := strconv.Atoi(sizeParts[0])
-		if err != nil {
-			return fmt.Errorf("Invalid value for 'size'")
-		}
-		height, err := strconv.Atoi(sizeParts[1])
-		if err != nil {
-			return fmt.Errorf("Invalid value for 'size'")
-		}
-
-		b64JSON := false
-		if input.ResponseFormat == "b64_json" {
-			b64JSON = true
-		}
-
-		var result []Item
-		for _, i := range config.PromptStrings {
-			n := input.N
-			if input.N == 0 {
-				n = 1
-			}
-			for j := 0; j < n; j++ {
-				prompts := strings.Split(i, "|")
-				positive_prompt := prompts[0]
-				negative_prompt := ""
-				if len(prompts) > 1 {
-					negative_prompt = prompts[1]
-				}
-
-				mode := 0
-				step := 15
-
-				if input.Mode != 0 {
-					mode = input.Mode
-				}
-
-				if input.Step != 0 {
-					step = input.Step
-				}
-
-				tempDir := ""
-				if !b64JSON {
-					tempDir = o.imageDir
-				}
-				// Create a temporary file
-				outputFile, err := ioutil.TempFile(tempDir, "b64")
-				if err != nil {
-					return err
-				}
-				outputFile.Close()
-				output := outputFile.Name() + ".png"
-				// Rename the temporary file
-				err = os.Rename(outputFile.Name(), output)
-				if err != nil {
-					return err
-				}
-
-				baseURL := c.BaseURL()
-
-				fn, err := ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, output, o.loader, *config, o)
-				if err != nil {
-					return err
-				}
-				if err := fn(); err != nil {
-					return err
-				}
-
-				item := &Item{}
-
-				if b64JSON {
-					defer os.RemoveAll(output)
-					data, err := os.ReadFile(output)
-					if err != nil {
-						return err
-					}
-					item.B64JSON = base64.StdEncoding.EncodeToString(data)
-				} else {
-					base := filepath.Base(output)
-					item.URL = baseURL + "/generated-images/" + base
-				}
-
-				result = append(result, *item)
-			}
-		}
-
-		resp := &OpenAIResponse{
-			Data: result,
-		}
-
-		jsonResult, _ := json.Marshal(resp)
-		log.Debug().Msgf("Response: %s", jsonResult)
-
-		// Return the prediction in the response body
-		return c.JSON(resp)
-	}
-}
-
-// https://platform.openai.com/docs/api-reference/audio/create
-func transcriptEndpoint(cm *ConfigMerger, o *Option) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		m, input, err := readInput(c, o.loader, false)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-
-		config, input, err := readConfig(m, input, cm, o.loader, o.debug, o.threads, o.ctxSize, o.f16)
-		if err != nil {
-			return fmt.Errorf("failed reading parameters from request:%w", err)
-		}
-		// retrieve the file data from the request
-		file, err := c.FormFile("file")
-		if err != nil {
-			return err
-		}
-		f, err := file.Open()
-		if err != nil {
-			return err
-		}
-		defer f.Close()
-
-		dir, err := os.MkdirTemp("", "whisper")
-
-		if err != nil {
-			return err
-		}
-		defer os.RemoveAll(dir)
-
-		dst := filepath.Join(dir, path.Base(file.Filename))
-		dstFile, err := os.Create(dst)
-		if err != nil {
-			return err
-		}
-
-		if _, err := io.Copy(dstFile, f); err != nil {
-			log.Debug().Msgf("Audio file copying error %+v - %+v - err %+v", file.Filename, dst, err)
-			return err
-		}
-
-		log.Debug().Msgf("Audio file copied to: %+v", dst)
-
-		whisperModel, err := o.loader.BackendLoader(model.WhisperBackend, config.Model, []llama.ModelOption{}, uint32(config.Threads), o.assetsDestination)
-		if err != nil {
-			return err
-		}
-
-		if whisperModel == nil {
-			return fmt.Errorf("could not load whisper model")
-		}
-
-		w, ok := whisperModel.(whisper.Model)
-		if !ok {
-			return fmt.Errorf("loader returned non-whisper object")
-		}
-
-		tr, err := whisperutil.Transcript(w, dst, input.Language, uint(config.Threads))
-		if err != nil {
-			return err
-		}
-
-		log.Debug().Msgf("Trascribed: %+v", tr)
-		// TODO: handle different outputs here
-		return c.Status(http.StatusOK).JSON(fiber.Map{"text": tr})
-	}
-}
-
-func listModels(loader *model.ModelLoader, cm *ConfigMerger) func(ctx *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		models, err := loader.ListModels()
-		if err != nil {
-			return err
-		}
-		var mm map[string]interface{} = map[string]interface{}{}
-
-		dataModels := []OpenAIModel{}
-		for _, m := range models {
-			mm[m] = nil
-			dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
-		}
-
-		for _, k := range cm.ListConfigs() {
-			if _, exists := mm[k]; !exists {
-				dataModels = append(dataModels, OpenAIModel{ID: k, Object: "model"})
-			}
-		}
-
-		return c.JSON(struct {
-			Object string        `json:"object"`
-			Data   []OpenAIModel `json:"data"`
-		}{
-			Object: "list",
-			Data:   dataModels,
-		})
-	}
-}
--- a/api/options.go
+++ b/api/options.go
@@ -1,137 +0,0 @@
-package api
-
-import (
-	"context"
-	"embed"
-
-	model "github.com/go-skynet/LocalAI/pkg/model"
-)
-
-type Option struct {
-	context                         context.Context
-	configFile                      string
-	loader                          *model.ModelLoader
-	uploadLimitMB, threads, ctxSize int
-	f16                             bool
-	debug, disableMessage           bool
-	imageDir                        string
-	cors                            bool
-	preloadJSONModels               string
-	preloadModelsFromPath           string
-	corsAllowOrigins                string
-
-	backendAssets     embed.FS
-	assetsDestination string
-}
-
-type AppOption func(*Option)
-
-func newOptions(o ...AppOption) *Option {
-	opt := &Option{
-		context:        context.Background(),
-		uploadLimitMB:  15,
-		threads:        1,
-		ctxSize:        512,
-		debug:          true,
-		disableMessage: true,
-	}
-	for _, oo := range o {
-		oo(opt)
-	}
-	return opt
-}
-
-func WithCors(b bool) AppOption {
-	return func(o *Option) {
-		o.cors = b
-	}
-}
-
-func WithCorsAllowOrigins(b string) AppOption {
-	return func(o *Option) {
-		o.corsAllowOrigins = b
-	}
-}
-
-func WithBackendAssetsOutput(out string) AppOption {
-	return func(o *Option) {
-		o.assetsDestination = out
-	}
-}
-
-func WithBackendAssets(f embed.FS) AppOption {
-	return func(o *Option) {
-		o.backendAssets = f
-	}
-}
-
-func WithContext(ctx context.Context) AppOption {
-	return func(o *Option) {
-		o.context = ctx
-	}
-}
-
-func WithYAMLConfigPreload(configFile string) AppOption {
-	return func(o *Option) {
-		o.preloadModelsFromPath = configFile
-	}
-}
-
-func WithJSONStringPreload(configFile string) AppOption {
-	return func(o *Option) {
-		o.preloadJSONModels = configFile
-	}
-}
-func WithConfigFile(configFile string) AppOption {
-	return func(o *Option) {
-		o.configFile = configFile
-	}
-}
-
-func WithModelLoader(loader *model.ModelLoader) AppOption {
-	return func(o *Option) {
-		o.loader = loader
-	}
-}
-
-func WithUploadLimitMB(limit int) AppOption {
-	return func(o *Option) {
-		o.uploadLimitMB = limit
-	}
-}
-
-func WithThreads(threads int) AppOption {
-	return func(o *Option) {
-		o.threads = threads
-	}
-}
-
-func WithContextSize(ctxSize int) AppOption {
-	return func(o *Option) {
-		o.ctxSize = ctxSize
-	}
-}
-
-func WithF16(f16 bool) AppOption {
-	return func(o *Option) {
-		o.f16 = f16
-	}
-}
-
-func WithDebug(debug bool) AppOption {
-	return func(o *Option) {
-		o.debug = debug
-	}
-}
-
-func WithDisableMessage(disableMessage bool) AppOption {
-	return func(o *Option) {
-		o.disableMessage = disableMessage
-	}
-}
-
-func WithImageDir(imageDir string) AppOption {
-	return func(o *Option) {
-		o.imageDir = imageDir
-	}
-}
--- a/api/prediction.go
+++ b/api/prediction.go
@@ -1,639 +0,0 @@
-package api
-
-import (
-	"fmt"
-	"os"
-	"path/filepath"
-	"regexp"
-	"strings"
-	"sync"
-
-	"github.com/donomii/go-rwkv.cpp"
-	"github.com/go-skynet/LocalAI/pkg/langchain"
-	model "github.com/go-skynet/LocalAI/pkg/model"
-	"github.com/go-skynet/LocalAI/pkg/stablediffusion"
-	"github.com/go-skynet/bloomz.cpp"
-	bert "github.com/go-skynet/go-bert.cpp"
-	transformers "github.com/go-skynet/go-ggml-transformers.cpp"
-	llama "github.com/go-skynet/go-llama.cpp"
-	gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
-)
-
-// mutex still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-var mutexMap sync.Mutex
-var mutexes map[string]*sync.Mutex = make(map[string]*sync.Mutex)
-
-func defaultLLamaOpts(c Config) []llama.ModelOption {
-	llamaOpts := []llama.ModelOption{}
-	if c.ContextSize != 0 {
-		llamaOpts = append(llamaOpts, llama.SetContext(c.ContextSize))
-	}
-	if c.F16 {
-		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
-	}
-	if c.Embeddings {
-		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
-	}
-
-	if c.NGPULayers != 0 {
-		llamaOpts = append(llamaOpts, llama.SetGPULayers(c.NGPULayers))
-	}
-
-	llamaOpts = append(llamaOpts, llama.SetMMap(c.MMap))
-	llamaOpts = append(llamaOpts, llama.SetMainGPU(c.MainGPU))
-	llamaOpts = append(llamaOpts, llama.SetTensorSplit(c.TensorSplit))
-	if c.Batch != 0 {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(c.Batch))
-	} else {
-		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
-	}
-
-	return llamaOpts
-}
-
-func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, dst string, loader *model.ModelLoader, c Config, o *Option) (func() error, error) {
-	if c.Backend != model.StableDiffusionBackend {
-		return nil, fmt.Errorf("endpoint only working with stablediffusion models")
-	}
-	inferenceModel, err := loader.BackendLoader(c.Backend, c.ImageGenerationAssets, []llama.ModelOption{}, uint32(c.Threads), o.assetsDestination)
-	if err != nil {
-		return nil, err
-	}
-
-	var fn func() error
-	switch model := inferenceModel.(type) {
-	case *stablediffusion.StableDiffusion:
-		fn = func() error {
-			return model.GenerateImage(height, width, mode, step, seed, positive_prompt, negative_prompt, dst)
-		}
-
-	default:
-		fn = func() error {
-			return fmt.Errorf("creation of images not supported by the backend")
-		}
-	}
-
-	return func() error {
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[c.Backend]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[c.Backend] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
-		return fn()
-	}, nil
-}
-
-func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c Config, o *Option) (func() ([]float32, error), error) {
-	if !c.Embeddings {
-		return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
-	}
-
-	modelFile := c.Model
-
-	llamaOpts := defaultLLamaOpts(c)
-
-	var inferenceModel interface{}
-	var err error
-	if c.Backend == "" {
-		inferenceModel, err = loader.GreedyLoader(modelFile, llamaOpts, uint32(c.Threads), o.assetsDestination)
-	} else {
-		inferenceModel, err = loader.BackendLoader(c.Backend, modelFile, llamaOpts, uint32(c.Threads), o.assetsDestination)
-	}
-	if err != nil {
-		return nil, err
-	}
-
-	var fn func() ([]float32, error)
-	switch model := inferenceModel.(type) {
-	case *llama.LLama:
-		fn = func() ([]float32, error) {
-			predictOptions := buildLLamaPredictOptions(c, loader.ModelPath)
-			if len(tokens) > 0 {
-				return model.TokenEmbeddings(tokens, predictOptions...)
-			}
-			return model.Embeddings(s, predictOptions...)
-		}
-	// bert embeddings
-	case *bert.Bert:
-		fn = func() ([]float32, error) {
-			if len(tokens) > 0 {
-				return model.TokenEmbeddings(tokens, bert.SetThreads(c.Threads))
-			}
-			return model.Embeddings(s, bert.SetThreads(c.Threads))
-		}
-	default:
-		fn = func() ([]float32, error) {
-			return nil, fmt.Errorf("embeddings not supported by the backend")
-		}
-	}
-
-	return func() ([]float32, error) {
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[modelFile]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[modelFile] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
-		embeds, err := fn()
-		if err != nil {
-			return embeds, err
-		}
-		// Remove trailing 0s
-		for i := len(embeds) - 1; i >= 0; i-- {
-			if embeds[i] == 0.0 {
-				embeds = embeds[:i]
-			} else {
-				break
-			}
-		}
-		return embeds, nil
-	}, nil
-}
-
-func buildLLamaPredictOptions(c Config, modelPath string) []llama.PredictOption {
-	// Generate the prediction using the language model
-	predictOptions := []llama.PredictOption{
-		llama.SetTemperature(c.Temperature),
-		llama.SetTopP(c.TopP),
-		llama.SetTopK(c.TopK),
-		llama.SetTokens(c.Maxtokens),
-		llama.SetThreads(c.Threads),
-	}
-
-	if c.PromptCacheAll {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
-	}
-
-	if c.PromptCacheRO {
-		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
-	}
-
-	if c.PromptCachePath != "" {
-		// Create parent directory
-		p := filepath.Join(modelPath, c.PromptCachePath)
-		os.MkdirAll(filepath.Dir(p), 0755)
-		predictOptions = append(predictOptions, llama.SetPathPromptCache(p))
-	}
-
-	if c.Mirostat != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostat(c.Mirostat))
-	}
-
-	if c.MirostatETA != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatETA(c.MirostatETA))
-	}
-
-	if c.MirostatTAU != 0 {
-		predictOptions = append(predictOptions, llama.SetMirostatTAU(c.MirostatTAU))
-	}
-
-	if c.Debug {
-		predictOptions = append(predictOptions, llama.Debug)
-	}
-
-	predictOptions = append(predictOptions, llama.SetStopWords(c.StopWords...))
-
-	if c.RepeatPenalty != 0 {
-		predictOptions = append(predictOptions, llama.SetPenalty(c.RepeatPenalty))
-	}
-
-	if c.Keep != 0 {
-		predictOptions = append(predictOptions, llama.SetNKeep(c.Keep))
-	}
-
-	if c.Batch != 0 {
-		predictOptions = append(predictOptions, llama.SetBatch(c.Batch))
-	}
-
-	if c.F16 {
-		predictOptions = append(predictOptions, llama.EnableF16KV)
-	}
-
-	if c.IgnoreEOS {
-		predictOptions = append(predictOptions, llama.IgnoreEOS)
-	}
-
-	if c.Seed != 0 {
-		predictOptions = append(predictOptions, llama.SetSeed(c.Seed))
-	}
-
-	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
-
-	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(c.FrequencyPenalty))
-	predictOptions = append(predictOptions, llama.SetMlock(c.MMlock))
-	predictOptions = append(predictOptions, llama.SetMemoryMap(c.MMap))
-	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(c.MainGPU))
-	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(c.TensorSplit))
-	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(c.TFZ))
-	predictOptions = append(predictOptions, llama.SetTypicalP(c.TypicalP))
-
-	return predictOptions
-}
-
-func ModelInference(s string, loader *model.ModelLoader, c Config, o *Option, tokenCallback func(string) bool) (func() (string, error), error) {
-	supportStreams := false
-	modelFile := c.Model
-
-	llamaOpts := defaultLLamaOpts(c)
-
-	var inferenceModel interface{}
-	var err error
-	if c.Backend == "" {
-		inferenceModel, err = loader.GreedyLoader(modelFile, llamaOpts, uint32(c.Threads), o.assetsDestination)
-	} else {
-		inferenceModel, err = loader.BackendLoader(c.Backend, modelFile, llamaOpts, uint32(c.Threads), o.assetsDestination)
-	}
-	if err != nil {
-		return nil, err
-	}
-
-	var fn func() (string, error)
-
-	switch model := inferenceModel.(type) {
-	case *rwkv.RwkvState:
-		supportStreams = true
-
-		fn = func() (string, error) {
-			stopWord := "\n"
-			if len(c.StopWords) > 0 {
-				stopWord = c.StopWords[0]
-			}
-
-			if err := model.ProcessInput(s); err != nil {
-				return "", err
-			}
-
-			response := model.GenerateResponse(c.Maxtokens, stopWord, float32(c.Temperature), float32(c.TopP), tokenCallback)
-
-			return response, nil
-		}
-	case *transformers.GPTNeoX:
-		fn = func() (string, error) {
-			// Generate the prediction using the language model
-			predictOptions := []transformers.PredictOption{
-				transformers.SetTemperature(c.Temperature),
-				transformers.SetTopP(c.TopP),
-				transformers.SetTopK(c.TopK),
-				transformers.SetTokens(c.Maxtokens),
-				transformers.SetThreads(c.Threads),
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
-			}
-
-			return model.Predict(
-				s,
-				predictOptions...,
-			)
-		}
-	case *transformers.Replit:
-		fn = func() (string, error) {
-			// Generate the prediction using the language model
-			predictOptions := []transformers.PredictOption{
-				transformers.SetTemperature(c.Temperature),
-				transformers.SetTopP(c.TopP),
-				transformers.SetTopK(c.TopK),
-				transformers.SetTokens(c.Maxtokens),
-				transformers.SetThreads(c.Threads),
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
-			}
-
-			return model.Predict(
-				s,
-				predictOptions...,
-			)
-		}
-	case *transformers.Starcoder:
-		fn = func() (string, error) {
-			// Generate the prediction using the language model
-			predictOptions := []transformers.PredictOption{
-				transformers.SetTemperature(c.Temperature),
-				transformers.SetTopP(c.TopP),
-				transformers.SetTopK(c.TopK),
-				transformers.SetTokens(c.Maxtokens),
-				transformers.SetThreads(c.Threads),
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
-			}
-
-			return model.Predict(
-				s,
-				predictOptions...,
-			)
-		}
-	case *transformers.MPT:
-		fn = func() (string, error) {
-			// Generate the prediction using the language model
-			predictOptions := []transformers.PredictOption{
-				transformers.SetTemperature(c.Temperature),
-				transformers.SetTopP(c.TopP),
-				transformers.SetTopK(c.TopK),
-				transformers.SetTokens(c.Maxtokens),
-				transformers.SetThreads(c.Threads),
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
-			}
-
-			return model.Predict(
-				s,
-				predictOptions...,
-			)
-		}
-	case *bloomz.Bloomz:
-		fn = func() (string, error) {
-			// Generate the prediction using the language model
-			predictOptions := []bloomz.PredictOption{
-				bloomz.SetTemperature(c.Temperature),
-				bloomz.SetTopP(c.TopP),
-				bloomz.SetTopK(c.TopK),
-				bloomz.SetTokens(c.Maxtokens),
-				bloomz.SetThreads(c.Threads),
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, bloomz.SetSeed(c.Seed))
-			}
-
-			return model.Predict(
-				s,
-				predictOptions...,
-			)
-		}
-	case *transformers.Falcon:
-		fn = func() (string, error) {
-			// Generate the prediction using the language model
-			predictOptions := []transformers.PredictOption{
-				transformers.SetTemperature(c.Temperature),
-				transformers.SetTopP(c.TopP),
-				transformers.SetTopK(c.TopK),
-				transformers.SetTokens(c.Maxtokens),
-				transformers.SetThreads(c.Threads),
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
-			}
-
-			return model.Predict(
-				s,
-				predictOptions...,
-			)
-		}
-	case *transformers.GPTJ:
-		fn = func() (string, error) {
-			// Generate the prediction using the language model
-			predictOptions := []transformers.PredictOption{
-				transformers.SetTemperature(c.Temperature),
-				transformers.SetTopP(c.TopP),
-				transformers.SetTopK(c.TopK),
-				transformers.SetTokens(c.Maxtokens),
-				transformers.SetThreads(c.Threads),
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
-			}
-
-			return model.Predict(
-				s,
-				predictOptions...,
-			)
-		}
-	case *transformers.Dolly:
-		fn = func() (string, error) {
-			// Generate the prediction using the language model
-			predictOptions := []transformers.PredictOption{
-				transformers.SetTemperature(c.Temperature),
-				transformers.SetTopP(c.TopP),
-				transformers.SetTopK(c.TopK),
-				transformers.SetTokens(c.Maxtokens),
-				transformers.SetThreads(c.Threads),
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
-			}
-
-			return model.Predict(
-				s,
-				predictOptions...,
-			)
-		}
-	case *transformers.GPT2:
-		fn = func() (string, error) {
-			// Generate the prediction using the language model
-			predictOptions := []transformers.PredictOption{
-				transformers.SetTemperature(c.Temperature),
-				transformers.SetTopP(c.TopP),
-				transformers.SetTopK(c.TopK),
-				transformers.SetTokens(c.Maxtokens),
-				transformers.SetThreads(c.Threads),
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, transformers.SetBatch(c.Batch))
-			}
-
-			if c.Seed != 0 {
-				predictOptions = append(predictOptions, transformers.SetSeed(c.Seed))
-			}
-
-			return model.Predict(
-				s,
-				predictOptions...,
-			)
-		}
-	case *gpt4all.Model:
-		supportStreams = true
-
-		fn = func() (string, error) {
-			if tokenCallback != nil {
-				model.SetTokenCallback(tokenCallback)
-			}
-
-			// Generate the prediction using the language model
-			predictOptions := []gpt4all.PredictOption{
-				gpt4all.SetTemperature(c.Temperature),
-				gpt4all.SetTopP(c.TopP),
-				gpt4all.SetTopK(c.TopK),
-				gpt4all.SetTokens(c.Maxtokens),
-			}
-
-			if c.Batch != 0 {
-				predictOptions = append(predictOptions, gpt4all.SetBatch(c.Batch))
-			}
-
-			str, er := model.Predict(
-				s,
-				predictOptions...,
-			)
-			// Seems that if we don't free the callback explicitly we leave functions registered (that might try to send on closed channels)
-			// For instance otherwise the API returns: {"error":{"code":500,"message":"send on closed channel","type":""}}
-			// after a stream event has occurred
-			model.SetTokenCallback(nil)
-			return str, er
-		}
-	case *llama.LLama:
-		supportStreams = true
-		fn = func() (string, error) {
-
-			if tokenCallback != nil {
-				model.SetTokenCallback(tokenCallback)
-			}
-
-			predictOptions := buildLLamaPredictOptions(c, loader.ModelPath)
-
-			str, er := model.Predict(
-				s,
-				predictOptions...,
-			)
-			// Seems that if we don't free the callback explicitly we leave functions registered (that might try to send on closed channels)
-			// For instance otherwise the API returns: {"error":{"code":500,"message":"send on closed channel","type":""}}
-			// after a stream event has occurred
-			model.SetTokenCallback(nil)
-			return str, er
-		}
-	case *langchain.HuggingFace:
-		fn = func() (string, error) {
-
-			// Generate the prediction using the language model
-			predictOptions := []langchain.PredictOption{
-				langchain.SetModel(c.Model),
-				langchain.SetMaxTokens(c.Maxtokens),
-				langchain.SetTemperature(c.Temperature),
-				langchain.SetStopWords(c.StopWords),
-			}
-
-			pred, er := model.PredictHuggingFace(s, predictOptions...)
-			if er != nil {
-				return "", er
-			}
-			return pred.Completion, nil
-		}
-	}
-
-	return func() (string, error) {
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[modelFile]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[modelFile] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
-		res, err := fn()
-		if tokenCallback != nil && !supportStreams {
-			tokenCallback(res)
-		}
-		return res, err
-	}, nil
-}
-
-func ComputeChoices(predInput string, input *OpenAIRequest, config *Config, o *Option, loader *model.ModelLoader, cb func(string, *[]Choice), tokenCallback func(string) bool) ([]Choice, error) {
-	result := []Choice{}
-
-	n := input.N
-
-	if input.N == 0 {
-		n = 1
-	}
-
-	// get the model function to call for the result
-	predFunc, err := ModelInference(predInput, loader, *config, o, tokenCallback)
-	if err != nil {
-		return result, err
-	}
-
-	for i := 0; i < n; i++ {
-		prediction, err := predFunc()
-		if err != nil {
-			return result, err
-		}
-
-		prediction = Finetune(*config, predInput, prediction)
-		cb(prediction, &result)
-
-		//result = append(result, Choice{Text: prediction})
-
-	}
-	return result, err
-}
-
-var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
-var mu sync.Mutex = sync.Mutex{}
-
-func Finetune(config Config, input, prediction string) string {
-	if config.Echo {
-		prediction = input + prediction
-	}
-
-	for _, c := range config.Cutstrings {
-		mu.Lock()
-		reg, ok := cutstrings[c]
-		if !ok {
-			cutstrings[c] = regexp.MustCompile(c)
-			reg = cutstrings[c]
-		}
-		mu.Unlock()
-		prediction = reg.ReplaceAllString(prediction, "")
-	}
-
-	for _, c := range config.TrimSpace {
-		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
-	}
-	return prediction
-
-}
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -0,0 +1,296 @@
+syntax = "proto3";
+
+option go_package = "github.com/go-skynet/LocalAI/pkg/grpc/proto";
+option java_multiple_files = true;
+option java_package = "io.skynet.localai.backend";
+option java_outer_classname = "LocalAIBackend";
+
+package backend;
+
+service Backend {
+  rpc Health(HealthMessage) returns (Reply) {}
+  rpc Predict(PredictOptions) returns (Reply) {}
+  rpc LoadModel(ModelOptions) returns (Result) {}
+  rpc PredictStream(PredictOptions) returns (stream Reply) {}
+  rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
+  rpc GenerateImage(GenerateImageRequest) returns (Result) {}
+  rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
+  rpc TTS(TTSRequest) returns (Result) {}
+  rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
+  rpc Status(HealthMessage) returns (StatusResponse) {}
+
+  rpc StoresSet(StoresSetOptions) returns (Result) {}
+  rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
+  rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
+  rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
+
+  rpc Rerank(RerankRequest) returns (RerankResult) {}
+}
+
+message RerankRequest {
+  string query = 1;
+  repeated string documents = 2;
+  int32 top_n = 3;
+}
+
+message RerankResult {
+  Usage usage = 1;
+  repeated DocumentResult results = 2;
+}
+
+message Usage {
+  int32 total_tokens = 1;
+  int32 prompt_tokens = 2;
+}
+
+message DocumentResult {
+  int32 index = 1;
+  string text = 2;
+  float relevance_score = 3;
+}
+
+message StoresKey {
+  repeated float Floats = 1;
+}
+
+message StoresValue {
+  bytes Bytes = 1;
+}
+
+message StoresSetOptions {
+  repeated StoresKey Keys = 1;
+  repeated StoresValue Values = 2;
+}
+
+message StoresDeleteOptions {
+  repeated StoresKey Keys = 1;
+}
+
+message StoresGetOptions {
+  repeated StoresKey Keys = 1;
+}
+
+message StoresGetResult {
+  repeated StoresKey Keys = 1;
+  repeated StoresValue Values = 2;
+}
+
+message StoresFindOptions {
+  StoresKey Key = 1;
+  int32 TopK = 2;
+}
+
+message StoresFindResult {
+  repeated StoresKey Keys = 1;
+  repeated StoresValue Values = 2;
+  repeated float Similarities = 3;
+}
+
+message HealthMessage {}
+
+// The request message containing the user's name.
+message PredictOptions {
+  string Prompt = 1;
+  int32 Seed = 2;
+  int32 Threads = 3;
+  int32 Tokens = 4;
+  int32 TopK = 5;
+  int32 Repeat = 6;
+  int32 Batch = 7;
+  int32 NKeep = 8;
+  float Temperature = 9;
+  float Penalty = 10;
+  bool F16KV = 11;
+  bool DebugMode = 12;
+  repeated string StopPrompts = 13;
+  bool IgnoreEOS = 14;
+  float TailFreeSamplingZ = 15;
+  float TypicalP = 16;
+  float FrequencyPenalty = 17;
+  float PresencePenalty = 18;
+  int32 Mirostat = 19;
+  float MirostatETA = 20;
+  float MirostatTAU = 21;
+  bool PenalizeNL = 22;
+  string LogitBias = 23;
+  bool MLock = 25;
+  bool MMap = 26;
+  bool PromptCacheAll = 27;
+  bool PromptCacheRO = 28;
+  string Grammar = 29;
+  string MainGPU = 30;
+  string TensorSplit = 31;
+  float TopP = 32;
+  string PromptCachePath = 33;
+  bool Debug = 34;
+  repeated int32 EmbeddingTokens = 35;
+  string Embeddings = 36;
+  float RopeFreqBase = 37;
+  float RopeFreqScale = 38;
+  float NegativePromptScale = 39;
+  string NegativePrompt = 40;
+  int32 NDraft = 41;
+  repeated string Images = 42;
+  bool UseTokenizerTemplate = 43;
+  repeated Message Messages = 44;
+}
+
+// The response message containing the result
+message Reply {
+  bytes message = 1;
+  int32 tokens = 2;
+  int32 prompt_tokens = 3;
+}
+
+message ModelOptions {
+  string Model = 1;
+  int32 ContextSize = 2;
+  int32 Seed = 3;
+  int32 NBatch = 4;
+  bool F16Memory = 5;
+  bool MLock = 6;
+  bool MMap = 7;
+  bool VocabOnly = 8;
+  bool LowVRAM = 9;
+  bool Embeddings = 10;
+  bool NUMA = 11;
+  int32 NGPULayers = 12;
+  string MainGPU = 13;
+  string TensorSplit = 14;
+  int32 Threads = 15;
+  string LibrarySearchPath = 16;
+  float RopeFreqBase = 17;
+  float RopeFreqScale = 18;
+  float RMSNormEps = 19;
+  int32 NGQA = 20;
+  string ModelFile = 21;
+
+  // AutoGPTQ
+  string Device = 22;
+  bool UseTriton = 23;
+  string ModelBaseName = 24;
+  bool UseFastTokenizer = 25;
+
+  // Diffusers
+  string PipelineType = 26;
+  string SchedulerType = 27;
+  bool CUDA = 28;
+  float CFGScale = 29;
+  bool IMG2IMG = 30;
+  string CLIPModel = 31;
+  string CLIPSubfolder = 32;
+  int32 CLIPSkip = 33;
+  string ControlNet = 48;
+
+  string Tokenizer = 34;
+
+  // LLM (llama.cpp)
+  string LoraBase = 35;
+  string LoraAdapter = 36;
+  float LoraScale = 42;
+
+  bool NoMulMatQ = 37;
+  string DraftModel = 39;
+
+  string AudioPath = 38;
+
+  // vllm
+  string Quantization = 40;
+  float  GPUMemoryUtilization = 50;
+  bool   TrustRemoteCode = 51;
+  bool   EnforceEager = 52;
+  int32  SwapSpace = 53;
+  int32  MaxModelLen = 54;
+  int32  TensorParallelSize = 55;
+
+  string MMProj = 41;
+
+  string RopeScaling = 43;
+  float YarnExtFactor = 44;
+  float YarnAttnFactor = 45;
+  float YarnBetaFast = 46;
+  float YarnBetaSlow = 47;
+
+  string Type = 49;
+
+  bool FlashAttention = 56;
+  bool NoKVOffload = 57;
+}
+
+message Result {
+  string message = 1;
+  bool success = 2;
+}
+
+message EmbeddingResult {
+  repeated float embeddings = 1;
+}
+
+message TranscriptRequest {
+  string dst = 2;
+  string language = 3;
+  uint32 threads = 4;
+}
+
+message TranscriptResult {
+  repeated TranscriptSegment segments = 1;
+  string text = 2;
+}
+
+message TranscriptSegment {
+  int32 id = 1;
+  int64 start = 2;
+  int64 end = 3;
+  string text = 4;
+  repeated int32 tokens = 5;
+}
+
+message GenerateImageRequest {
+  int32 height = 1;
+  int32 width = 2;
+  int32 mode = 3;
+  int32 step = 4;
+  int32 seed = 5;
+  string positive_prompt = 6;
+  string negative_prompt = 7;
+  string dst = 8;
+  string src = 9;
+
+  // Diffusers
+  string EnableParameters = 10;
+  int32 CLIPSkip = 11;
+}
+
+message TTSRequest {
+  string text = 1;
+  string model = 2;
+  string dst = 3;
+  string voice = 4;
+  optional string language = 5;
+}
+
+message TokenizationResponse {
+  int32 length = 1;
+  repeated int32 tokens = 2;
+}
+
+message MemoryUsageData {
+  uint64 total = 1;
+  map<string, uint64> breakdown = 2;
+}
+
+message StatusResponse {
+  enum State {
+    UNINITIALIZED = 0;
+    BUSY = 1;
+    READY = 2;
+    ERROR = -1;
+  }
+  State state = 1;
+  MemoryUsageData memory = 2;
+}
+
+message Message {
+  string role = 1;
+  string content = 2;
+}
--- a/backend/cpp/grpc/.gitignore
+++ b/backend/cpp/grpc/.gitignore
@@ -0,0 +1,3 @@
+installed_packages/
+grpc_build/
+grpc_repo/
--- a/backend/cpp/grpc/Makefile
+++ b/backend/cpp/grpc/Makefile
@@ -0,0 +1,65 @@
+# Basic platform detection
+HOST_SYSTEM = $(shell uname | cut -f 1 -d_)
+SYSTEM ?= $(HOST_SYSTEM)
+
+TAG_LIB_GRPC?=v1.59.0
+GIT_REPO_LIB_GRPC?=https://github.com/grpc/grpc.git
+GIT_CLONE_DEPTH?=1
+
+INSTALLED_PACKAGES=installed_packages
+GRPC_REPO=grpc_repo
+GRPC_BUILD=grpc_build
+
+export CMAKE_ARGS?=
+CMAKE_ARGS+=-DCMAKE_BUILD_TYPE=Release
+CMAKE_ARGS+=-DgRPC_INSTALL=ON
+CMAKE_ARGS+=-DEXECUTABLE_OUTPUT_PATH=../$(INSTALLED_PACKAGES)/grpc/bin
+CMAKE_ARGS+=-DLIBRARY_OUTPUT_PATH=../$(INSTALLED_PACKAGES)/grpc/lib
+CMAKE_ARGS+=-DgRPC_BUILD_TESTS=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_CSHARP_EXT=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_CPP_PLUGIN=ON
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_PYTHON_PLUGIN=ON
+CMAKE_ARGS+=-DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF
+CMAKE_ARGS+=-Dprotobuf_WITH_ZLIB=ON
+CMAKE_ARGS+=-DRE2_BUILD_TESTING=OFF
+CMAKE_ARGS+=-DCMAKE_INSTALL_PREFIX=../$(INSTALLED_PACKAGES)
+
+# windows need to set OPENSSL_NO_ASM. Results in slower crypto performance but doesn't build otherwise.
+# May be resolvable, but for now its set. More info: https://stackoverflow.com/a/75240504/480673
+ifeq ($(SYSTEM),MSYS)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+ifeq ($(SYSTEM),MINGW64)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+ifeq ($(SYSTEM),MINGW32)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+ifeq ($(SYSTEM),CYGWIN)
+CMAKE_ARGS+=-DOPENSSL_NO_ASM=ON
+endif
+
+$(INSTALLED_PACKAGES): grpc_build
+
+$(GRPC_REPO):
+	git clone --depth $(GIT_CLONE_DEPTH) -b $(TAG_LIB_GRPC) $(GIT_REPO_LIB_GRPC) $(GRPC_REPO)/grpc
+	cd $(GRPC_REPO)/grpc && git submodule update --jobs 2 --init --recursive --depth $(GIT_CLONE_DEPTH)
+
+$(GRPC_BUILD): $(GRPC_REPO)
+	mkdir -p $(GRPC_BUILD)
+	cd $(GRPC_BUILD) && cmake $(CMAKE_ARGS) ../$(GRPC_REPO)/grpc && cmake --build . && cmake --build . --target install
+
+build: $(INSTALLED_PACKAGES)
+
+rebuild:
+	rm -rf grpc_build
+	$(MAKE) grpc_build
+
+clean:
+	rm -rf grpc_build
+	rm -rf grpc_repo
+	rm -rf installed_packages
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -0,0 +1,86 @@
+
+## XXX: In some versions of CMake clip wasn't being built before llama.
+## This is an hack for now, but it should be fixed in the future.
+set(TARGET myclip)
+add_library(${TARGET} clip.cpp clip.h llava.cpp llava.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_include_directories(myclip PUBLIC .)
+target_include_directories(myclip PUBLIC ../..)
+target_include_directories(myclip PUBLIC ../../common)
+target_link_libraries(${TARGET} PRIVATE common ggml llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (NOT MSVC)
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+# END CLIP hack
+
+
+set(TARGET grpc-server)
+set(CMAKE_CXX_STANDARD 17)
+cmake_minimum_required(VERSION 3.15)
+set(TARGET grpc-server)
+set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+set(_REFLECTION grpc++_reflection)
+
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    # Set correct Homebrew install folder for Apple Silicon and Intel Macs
+    if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "arm64")
+        set(HOMEBREW_DEFAULT_PREFIX "/opt/homebrew")
+    else()
+        set(HOMEBREW_DEFAULT_PREFIX "/usr/local")
+    endif()
+
+    link_directories("${HOMEBREW_DEFAULT_PREFIX}/lib")
+    include_directories("${HOMEBREW_DEFAULT_PREFIX}/include")
+endif()
+
+find_package(absl CONFIG REQUIRED)
+find_package(Protobuf CONFIG REQUIRED)
+find_package(gRPC CONFIG REQUIRED)
+
+find_program(_PROTOBUF_PROTOC protoc)
+set(_GRPC_GRPCPP grpc++)
+find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${Protobuf_INCLUDE_DIRS})
+
+message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
+
+# Proto file
+get_filename_component(hw_proto "../../../../../../backend/backend.proto" ABSOLUTE)
+get_filename_component(hw_proto_path "${hw_proto}" PATH)
+
+# Generated sources
+set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
+set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
+set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
+set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")
+
+add_custom_command(
+      OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
+      COMMAND ${_PROTOBUF_PROTOC}
+      ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+        -I "${hw_proto_path}"
+        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+        "${hw_proto}"
+      DEPENDS "${hw_proto}")
+
+# hw_grpc_proto
+add_library(hw_grpc_proto
+  ${hw_grpc_srcs}
+  ${hw_grpc_hdrs}
+  ${hw_proto_srcs}
+  ${hw_proto_hdrs} )
+
+add_executable(${TARGET} grpc-server.cpp utils.hpp json.hpp)
+target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+  absl::flags_parse
+  gRPC::${_REFLECTION}
+  gRPC::${_GRPC_GRPCPP}
+  protobuf::${_PROTOBUF_LIBPROTOBUF})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -0,0 +1,69 @@
+
+LLAMA_VERSION?=
+
+CMAKE_ARGS?=
+BUILD_TYPE?=
+ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
+
+# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
+# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# to CMAKE_ARGS automatically
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# If build type is clblas (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+else ifeq ($(BUILD_TYPE),clblas)
+	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
+# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
+# But if it's OSX without metal, disable it here
+else ifeq ($(OS),darwin)
+	ifneq ($(BUILD_TYPE),metal)
+		CMAKE_ARGS+=-DLLAMA_METAL=OFF
+	endif
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f16)
+	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f32)
+	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+endif
+
+llama.cpp:
+	git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
+	if [ -z "$(LLAMA_VERSION)" ]; then \
+		exit 1; \
+	fi
+	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
+
+llama.cpp/examples/grpc-server: llama.cpp
+	mkdir -p llama.cpp/examples/grpc-server
+	bash prepare.sh
+
+rebuild:
+	bash prepare.sh
+	rm -rf grpc-server
+	$(MAKE) grpc-server
+
+purge:
+	rm -rf llama.cpp/build
+	rm -rf llama.cpp/examples/grpc-server
+	rm -rf grpc-server
+
+clean: purge
+	rm -rf llama.cpp
+
+grpc-server: llama.cpp llama.cpp/examples/grpc-server
+	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
+ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+	bash -c "source $(ONEAPI_VARS); \
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)"
+else
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)
+endif
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
--- a/backend/cpp/llama/json.hpp
+++ b/backend/cpp/llama/json.hpp
--- a/backend/cpp/llama/prepare.sh
+++ b/backend/cpp/llama/prepare.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
+cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
+cp -rfv json.hpp llama.cpp/examples/grpc-server/
+cp -rfv utils.hpp llama.cpp/examples/grpc-server/
+    
+if grep -q "grpc-server" llama.cpp/examples/CMakeLists.txt; then
+    echo "grpc-server already added"
+else
+    echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+fi
+
+## XXX: In some versions of CMake clip wasn't being built before llama.
+## This is an hack for now, but it should be fixed in the future.
+cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+cp -rfv llama.cpp/examples/llava/llava.cpp llama.cpp/examples/grpc-server/llava.cpp
+echo '#include "llama.h"' > llama.cpp/examples/grpc-server/llava.h
+cat llama.cpp/examples/llava/llava.h >> llama.cpp/examples/grpc-server/llava.h
+cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
--- a/backend/cpp/llama/utils.hpp
+++ b/backend/cpp/llama/utils.hpp
@@ -0,0 +1,510 @@
+// https://github.com/ggerganov/llama.cpp/blob/master/examples/server/utils.hpp
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <set>
+#include <mutex>
+#include <condition_variable>
+#include <unordered_map>
+
+#include "json.hpp"
+
+#include "../llava/clip.h"
+
+using json = nlohmann::json;
+
+extern bool server_verbose;
+
+#ifndef SERVER_VERBOSE
+#define SERVER_VERBOSE 1
+#endif
+
+#if SERVER_VERBOSE != 1
+#define LOG_VERBOSE(MSG, ...)
+#else
+#define LOG_VERBOSE(MSG, ...)                                            \
+    do                                                                   \
+    {                                                                    \
+        if (server_verbose)                                              \
+        {                                                                \
+            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
+        }                                                                \
+    } while (0)
+#endif
+
+#define LOG_ERROR(  MSG, ...) server_log("ERROR",   __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_INFO(   MSG, ...) server_log("INFO",    __func__, __LINE__, MSG, __VA_ARGS__)
+
+//
+// parallel
+//
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,          // Server is ready and model is loaded
+    SERVER_STATE_ERROR           // An error occurred, load_model failed
+};
+
+enum task_type {
+    TASK_TYPE_COMPLETION,
+    TASK_TYPE_CANCEL,
+    TASK_TYPE_NEXT_RESPONSE
+};
+
+struct task_server {
+    int id = -1; // to be filled by llama_server_queue
+    int target_id;
+    task_type type;
+    json data;
+    bool infill_mode = false;
+    bool embedding_mode = false;
+    int multitask_id = -1;
+};
+
+struct task_result {
+    int id;
+    int multitask_id = -1;
+    bool stop;
+    bool error;
+    json result_json;
+};
+
+struct task_multi {
+    int id;
+    std::set<int> subtasks_remaining{};
+    std::vector<task_result> results{};
+};
+
+// TODO: can become bool if we can't find use of more states
+enum slot_state
+{
+    IDLE,
+    PROCESSING,
+};
+
+enum slot_command
+{
+    NONE,
+    LOAD_PROMPT,
+    RELEASE,
+};
+
+struct slot_params
+{
+    bool stream       = true;
+    bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
+
+    uint32_t seed      = -1; // RNG seed
+    int32_t  n_keep    =  0; // number of tokens to keep from initial prompt
+    int32_t  n_predict = -1; // new tokens to predict
+
+    std::vector<std::string> antiprompt;
+
+    json input_prefix;
+    json input_suffix;
+};
+
+struct slot_image
+{
+    int32_t id;
+
+    bool request_encode_image = false;
+    float * image_embedding = nullptr;
+    int32_t image_tokens = 0;
+
+    clip_image_u8 * img_data;
+
+    std::string prefix_prompt; // before of this image
+};
+
+// completion token output with probabilities
+struct completion_token_output
+{
+    struct token_prob
+    {
+        llama_token tok;
+        float prob;
+    };
+
+    std::vector<token_prob> probs;
+    llama_token tok;
+    std::string text_to_send;
+};
+
+static inline void server_log(const char *level, const char *function, int line,
+                       const char *message, const nlohmann::ordered_json &extra)
+{
+    nlohmann::ordered_json log
+    {
+        {"timestamp", time(nullptr)},
+        {"level",     level},
+        {"function",  function},
+        {"line",      line},
+        {"message",   message},
+    };
+
+    if (!extra.empty())
+    {
+        log.merge_patch(extra);
+    }
+
+    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
+    printf("%.*s\n", (int)str.size(), str.data());
+    fflush(stdout);
+}
+
+//
+// server utils
+//
+
+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+        ? body.value(key, default_value)
+        : default_value;
+}
+
+inline std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream chatml_msgs;
+
+    for (auto it = messages.begin(); it != messages.end(); ++it) {
+        chatml_msgs << "<|im_start|>"
+                    << json_value(*it, "role",    std::string("user")) << '\n';
+        chatml_msgs << json_value(*it, "content", std::string(""))
+                    << "<|im_end|>\n";
+    }
+
+    chatml_msgs << "<|im_start|>assistant" << '\n';
+
+    return chatml_msgs.str();
+}
+
+//
+// work queue utils
+//
+
+struct llama_server_queue {
+    int id = 0;
+    std::mutex mutex_tasks;
+    // queues
+    std::vector<task_server> queue_tasks;
+    std::vector<task_server> queue_tasks_deferred;
+    std::vector<task_multi> queue_multitasks;
+    std::condition_variable condition_tasks;
+    // callback functions
+    std::function<void(task_server&)> callback_new_task;
+    std::function<void(task_multi&)> callback_finish_multitask;
+    std::function<void(void)> callback_all_task_finished;
+
+    // Add a new task to the end of the queue
+    int post(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        if (task.id == -1) {
+            task.id = id++;
+        }
+        queue_tasks.push_back(std::move(task));
+        condition_tasks.notify_one();
+        return task.id;
+    }
+
+    // Add a new task, but defer until one slot is available
+    void defer(task_server task) {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        queue_tasks_deferred.push_back(std::move(task));
+    }
+
+    // Get the next id for creating anew task
+    int get_new_id() {
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        return id++;
+    }
+
+    // Register function to process a new task
+    void on_new_task(std::function<void(task_server&)> callback) {
+        callback_new_task = callback;
+    }
+
+    // Register function to process a multitask
+    void on_finish_multitask(std::function<void(task_multi&)> callback) {
+        callback_finish_multitask = callback;
+    }
+
+    // Register the function to be called when the batch of tasks is finished
+    void on_all_tasks_finished(std::function<void(void)> callback) {
+        callback_all_task_finished = callback;
+    }
+
+    // Call when the state of one slot is changed
+    void notify_slot_changed() {
+        // move deferred tasks back to main loop
+        std::unique_lock<std::mutex> lock(mutex_tasks);
+        for (auto & task : queue_tasks_deferred) {
+            queue_tasks.push_back(std::move(task));
+        }
+        queue_tasks_deferred.clear();
+    }
+
+    // Start the main loop. This call is blocking
+    [[noreturn]]
+    void start_loop() {
+        while (true) {
+            // new task arrived
+            LOG_VERBOSE("have new task", {});
+            {
+                while (true)
+                {
+                    std::unique_lock<std::mutex> lock(mutex_tasks);
+                    if (queue_tasks.empty()) {
+                        lock.unlock();
+                        break;
+                    }
+                    task_server task = queue_tasks.front();
+                    queue_tasks.erase(queue_tasks.begin());
+                    lock.unlock();
+                    LOG_VERBOSE("callback_new_task", {});
+                    callback_new_task(task);
+                }
+                LOG_VERBOSE("callback_all_task_finished", {});
+                // process and update all the multitasks
+                auto queue_iterator = queue_multitasks.begin();
+                while (queue_iterator != queue_multitasks.end())
+                {
+                    if (queue_iterator->subtasks_remaining.empty())
+                    {
+                        // all subtasks done == multitask is done
+                        task_multi current_multitask = *queue_iterator;
+                        callback_finish_multitask(current_multitask);
+                        // remove this multitask
+                        queue_iterator = queue_multitasks.erase(queue_iterator);
+                    }
+                    else
+                    {
+                        ++queue_iterator;
+                    }
+                }
+                // all tasks in the current loop is finished
+                callback_all_task_finished();
+            }
+            LOG_VERBOSE("wait for new task", {});
+            // wait for new task
+            {
+                std::unique_lock<std::mutex> lock(mutex_tasks);
+                if (queue_tasks.empty()) {
+                    condition_tasks.wait(lock, [&]{
+                        return !queue_tasks.empty();
+                    });
+                }
+            }
+        }
+    }
+
+    //
+    // functions to manage multitasks
+    //
+
+    // add a multitask by specifying the id of all subtask (subtask is a task_server)
+    void add_multitask(int multitask_id, std::vector<int>& sub_ids)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        task_multi multi;
+        multi.id = multitask_id;
+        std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
+        queue_multitasks.push_back(multi);
+    }
+
+    // updatethe remaining subtasks, while appending results to multitask
+    void update_multitask(int multitask_id, int subtask_id, task_result& result)
+    {
+        std::lock_guard<std::mutex> lock(mutex_tasks);
+        for (auto& multitask : queue_multitasks)
+        {
+            if (multitask.id == multitask_id)
+            {
+                multitask.subtasks_remaining.erase(subtask_id);
+                multitask.results.push_back(result);
+            }
+        }
+    }
+};
+
+struct llama_server_response {
+    typedef std::function<void(int, int, task_result&)> callback_multitask_t;
+    callback_multitask_t callback_update_multitask;
+    // for keeping track of all tasks waiting for the result
+    std::set<int> waiting_task_ids;
+    // the main result queue
+    std::vector<task_result> queue_results;
+    std::mutex mutex_results;
+    std::condition_variable condition_results;
+
+    void add_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.insert(task_id);
+    }
+
+    void remove_waiting_task_id(int task_id) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        waiting_task_ids.erase(task_id);
+    }
+
+    // This function blocks the thread until there is a response for this task_id
+    task_result recv(int task_id) {
+        while (true)
+        {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return !queue_results.empty();
+            });
+            LOG_VERBOSE("condition_results unblock", {});
+
+            for (int i = 0; i < (int) queue_results.size(); i++)
+            {
+                if (queue_results[i].id == task_id)
+                {
+                    assert(queue_results[i].multitask_id == -1);
+                    task_result res = queue_results[i];
+                    queue_results.erase(queue_results.begin() + i);
+                    return res;
+                }
+            }
+        }
+
+        // should never reach here
+    }
+
+    // Register the function to update multitask
+    void on_multitask_update(callback_multitask_t callback) {
+        callback_update_multitask = callback;
+    }
+
+    // Send a new result to a waiting task_id
+    void send(task_result result) {
+        std::unique_lock<std::mutex> lock(mutex_results);
+        LOG_VERBOSE("send new result", {});
+        for (auto& task_id : waiting_task_ids) {
+            // LOG_TEE("waiting task id %i \n", task_id);
+            // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
+            if (result.multitask_id == task_id)
+            {
+                LOG_VERBOSE("callback_update_multitask", {});
+                callback_update_multitask(task_id, result.id, result);
+                continue;
+            }
+
+            if (result.id == task_id)
+            {
+                LOG_VERBOSE("queue_results.push_back", {});
+                queue_results.push_back(result);
+                condition_results.notify_one();
+                return;
+            }
+        }
+    }
+};
+
+//
+// base64 utils (TODO: move to common in the future)
+//
+
+static const std::string base64_chars =
+             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+             "abcdefghijklmnopqrstuvwxyz"
+             "0123456789+/";
+
+static inline bool is_base64(uint8_t c)
+{
+    return (isalnum(c) || (c == '+') || (c == '/'));
+}
+
+static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
+{
+    int i = 0;
+    int j = 0;
+    int in_ = 0;
+
+    int in_len = encoded_string.size();
+
+    uint8_t char_array_4[4];
+    uint8_t char_array_3[3];
+
+    std::vector<uint8_t> ret;
+
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
+    {
+        char_array_4[i++] = encoded_string[in_]; in_++;
+        if (i == 4)
+        {
+            for (i = 0; i <4; i++)
+            {
+                char_array_4[i] = base64_chars.find(char_array_4[i]);
+            }
+
+            char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+            char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+            char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+
+            for (i = 0; (i < 3); i++)
+            {
+                ret.push_back(char_array_3[i]);
+            }
+            i = 0;
+        }
+    }
+
+    if (i)
+    {
+        for (j = i; j <4; j++)
+        {
+            char_array_4[j] = 0;
+        }
+
+        for (j = 0; j <4; j++)
+        {
+            char_array_4[j] = base64_chars.find(char_array_4[j]);
+        }
+
+        char_array_3[0] = ((char_array_4[0]      ) << 2) + ((char_array_4[1] & 0x30) >> 4);
+        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) +   char_array_4[3];
+
+        for (j = 0; (j < i - 1); j++)
+        {
+            ret.push_back(char_array_3[j]);
+        }
+    }
+
+    return ret;
+}
+
+//
+// random string / id
+//
+
+static std::string random_string()
+{
+    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    std::random_device rd;
+    std::mt19937 generator(rd());
+
+    std::string result(32, ' ');
+
+    for (int i = 0; i < 32; ++i) {
+        result[i] = str[generator() % str.size()];
+    }
+
+    return result;
+}
+
+static std::string gen_chatcmplid()
+{
+    std::stringstream chatcmplid;
+    chatcmplid << "chatcmpl-" << random_string();
+    return chatcmplid.str();
+}
--- a/backend/go/image/stablediffusion/main.go
+++ b/backend/go/image/stablediffusion/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &Image{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/image/stablediffusion/stablediffusion.go
+++ b/backend/go/image/stablediffusion/stablediffusion.go
@@ -0,0 +1,33 @@
+package main
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/stablediffusion"
+)
+
+type Image struct {
+	base.SingleThread
+	stablediffusion *stablediffusion.StableDiffusion
+}
+
+func (image *Image) Load(opts *pb.ModelOptions) error {
+	var err error
+	// Note: the Model here is a path to a directory containing the model files
+	image.stablediffusion, err = stablediffusion.New(opts.ModelFile)
+	return err
+}
+
+func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
+	return image.stablediffusion.GenerateImage(
+		int(opts.Height),
+		int(opts.Width),
+		int(opts.Mode),
+		int(opts.Step),
+		int(opts.Seed),
+		opts.PositivePrompt,
+		opts.NegativePrompt,
+		opts.Dst)
+}
--- a/backend/go/image/tinydream/main.go
+++ b/backend/go/image/tinydream/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &Image{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/image/tinydream/tinydream.go
+++ b/backend/go/image/tinydream/tinydream.go
@@ -0,0 +1,32 @@
+package main
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/tinydream"
+)
+
+type Image struct {
+	base.SingleThread
+	tinydream *tinydream.TinyDream
+}
+
+func (image *Image) Load(opts *pb.ModelOptions) error {
+	var err error
+	// Note: the Model here is a path to a directory containing the model files
+	image.tinydream, err = tinydream.New(opts.ModelFile)
+	return err
+}
+
+func (image *Image) GenerateImage(opts *pb.GenerateImageRequest) error {
+	return image.tinydream.GenerateImage(
+		int(opts.Height),
+		int(opts.Width),
+		int(opts.Step),
+		int(opts.Seed),
+		opts.PositivePrompt,
+		opts.NegativePrompt,
+		opts.Dst)
+}
--- a/backend/go/llm/bert/bert.go
+++ b/backend/go/llm/bert/bert.go
@@ -0,0 +1,34 @@
+package main
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	bert "github.com/go-skynet/go-bert.cpp"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+)
+
+type Embeddings struct {
+	base.SingleThread
+	bert *bert.Bert
+}
+
+func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
+	model, err := bert.New(opts.ModelFile)
+	llm.bert = model
+	return err
+}
+
+func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+
+	if len(opts.EmbeddingTokens) > 0 {
+		tokens := []int{}
+		for _, t := range opts.EmbeddingTokens {
+			tokens = append(tokens, int(t))
+		}
+		return llm.bert.TokenEmbeddings(tokens, bert.SetThreads(int(opts.Threads)))
+	}
+
+	return llm.bert.Embeddings(opts.Embeddings, bert.SetThreads(int(opts.Threads)))
+}
--- a/backend/go/llm/bert/main.go
+++ b/backend/go/llm/bert/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/llm/gpt4all/gpt4all.go
+++ b/backend/go/llm/gpt4all/gpt4all.go
@@ -0,0 +1,62 @@
+package main
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
+)
+
+type LLM struct {
+	base.SingleThread
+
+	gpt4all *gpt4all.Model
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	model, err := gpt4all.New(opts.ModelFile,
+		gpt4all.SetThreads(int(opts.Threads)),
+		gpt4all.SetLibrarySearchPath(opts.LibrarySearchPath))
+	llm.gpt4all = model
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []gpt4all.PredictOption {
+	predictOptions := []gpt4all.PredictOption{
+		gpt4all.SetTemperature(float64(opts.Temperature)),
+		gpt4all.SetTopP(float64(opts.TopP)),
+		gpt4all.SetTopK(int(opts.TopK)),
+		gpt4all.SetTokens(int(opts.Tokens)),
+	}
+
+	if opts.Batch != 0 {
+		predictOptions = append(predictOptions, gpt4all.SetBatch(int(opts.Batch)))
+	}
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.gpt4all.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	predictOptions := buildPredictOptions(opts)
+
+	go func() {
+		llm.gpt4all.SetTokenCallback(func(token string) bool {
+			results <- token
+			return true
+		})
+		_, err := llm.gpt4all.Predict(opts.Prompt, predictOptions...)
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		llm.gpt4all.SetTokenCallback(nil)
+		close(results)
+	}()
+
+	return nil
+}
--- a/backend/go/llm/gpt4all/main.go
+++ b/backend/go/llm/gpt4all/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/llm/langchain/langchain.go
+++ b/backend/go/llm/langchain/langchain.go
@@ -0,0 +1,64 @@
+package main
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+	"os"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/LocalAI/pkg/langchain"
+)
+
+type LLM struct {
+	base.Base
+
+	langchain *langchain.HuggingFace
+	model     string
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	var err error
+	hfToken := os.Getenv("HUGGINGFACEHUB_API_TOKEN")
+	if hfToken == "" {
+		return fmt.Errorf("no huggingface token provided")
+	}
+	llm.langchain, err = langchain.NewHuggingFace(opts.Model, hfToken)
+	llm.model = opts.Model
+	return err
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	o := []langchain.PredictOption{
+		langchain.SetModel(llm.model),
+		langchain.SetMaxTokens(int(opts.Tokens)),
+		langchain.SetTemperature(float64(opts.Temperature)),
+		langchain.SetStopWords(opts.StopPrompts),
+	}
+	pred, err := llm.langchain.PredictHuggingFace(opts.Prompt, o...)
+	if err != nil {
+		return "", err
+	}
+	return pred.Completion, nil
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	o := []langchain.PredictOption{
+		langchain.SetModel(llm.model),
+		langchain.SetMaxTokens(int(opts.Tokens)),
+		langchain.SetTemperature(float64(opts.Temperature)),
+		langchain.SetStopWords(opts.StopPrompts),
+	}
+	go func() {
+		res, err := llm.langchain.PredictHuggingFace(opts.Prompt, o...)
+
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		results <- res.Completion
+		close(results)
+	}()
+
+	return nil
+}
--- a/backend/go/llm/langchain/main.go
+++ b/backend/go/llm/langchain/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/llm/llama-ggml/llama.go
+++ b/backend/go/llm/llama-ggml/llama.go
@@ -0,0 +1,204 @@
+package main
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/go-llama.cpp"
+)
+
+type LLM struct {
+	base.SingleThread
+
+	llama *llama.LLama
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+
+	llamaOpts := []llama.ModelOption{
+		llama.WithRopeFreqBase(ropeFreqBase),
+		llama.WithRopeFreqScale(ropeFreqScale),
+	}
+
+	if opts.NGQA != 0 {
+		llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
+	}
+
+	if opts.RMSNormEps != 0 {
+		llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
+	}
+
+	if opts.ContextSize != 0 {
+		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
+	}
+	if opts.F16Memory {
+		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
+	}
+	if opts.Embeddings {
+		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
+	}
+	if opts.NGPULayers != 0 {
+		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
+	}
+
+	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
+	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
+	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
+	if opts.NBatch != 0 {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
+	} else {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
+	}
+
+	if opts.NUMA {
+		llamaOpts = append(llamaOpts, llama.EnableNUMA)
+	}
+
+	if opts.LowVRAM {
+		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
+	}
+
+	model, err := llama.New(opts.ModelFile, llamaOpts...)
+	llm.llama = model
+
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+	predictOptions := []llama.PredictOption{
+		llama.SetTemperature(opts.Temperature),
+		llama.SetTopP(opts.TopP),
+		llama.SetTopK(int(opts.TopK)),
+		llama.SetTokens(int(opts.Tokens)),
+		llama.SetThreads(int(opts.Threads)),
+		llama.WithGrammar(opts.Grammar),
+		llama.SetRopeFreqBase(ropeFreqBase),
+		llama.SetRopeFreqScale(ropeFreqScale),
+		llama.SetNegativePromptScale(opts.NegativePromptScale),
+		llama.SetNegativePrompt(opts.NegativePrompt),
+	}
+
+	if opts.PromptCacheAll {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
+	}
+
+	if opts.PromptCacheRO {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
+	}
+
+	// Expected absolute path
+	if opts.PromptCachePath != "" {
+		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
+	}
+
+	if opts.Mirostat != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
+	}
+
+	if opts.MirostatETA != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
+	}
+
+	if opts.MirostatTAU != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
+	}
+
+	if opts.Debug {
+		predictOptions = append(predictOptions, llama.Debug)
+	}
+
+	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
+
+	if opts.PresencePenalty != 0 {
+		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
+	}
+
+	if opts.NKeep != 0 {
+		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
+	}
+
+	if opts.Batch != 0 {
+		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
+	}
+
+	if opts.F16KV {
+		predictOptions = append(predictOptions, llama.EnableF16KV)
+	}
+
+	if opts.IgnoreEOS {
+		predictOptions = append(predictOptions, llama.IgnoreEOS)
+	}
+
+	if opts.Seed != 0 {
+		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
+	}
+
+	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
+
+	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
+	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
+	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
+	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
+	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
+	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
+	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	predictOptions := buildPredictOptions(opts)
+
+	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
+		results <- token
+		return true
+	}))
+
+	go func() {
+		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		close(results)
+	}()
+
+	return nil
+}
+
+func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+	predictOptions := buildPredictOptions(opts)
+
+	if len(opts.EmbeddingTokens) > 0 {
+		tokens := []int{}
+		for _, t := range opts.EmbeddingTokens {
+			tokens = append(tokens, int(t))
+		}
+		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
+	}
+
+	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
+}
--- a/backend/go/llm/llama-ggml/main.go
+++ b/backend/go/llm/llama-ggml/main.go
@@ -0,0 +1,19 @@
+package main
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/llm/llama/llama.go
+++ b/backend/go/llm/llama/llama.go
@@ -0,0 +1,257 @@
+package main
+
+// This is a wrapper to statisfy the GRPC service interface
+// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
+import (
+	"fmt"
+	"path/filepath"
+
+	"github.com/go-skynet/LocalAI/pkg/grpc/base"
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	"github.com/go-skynet/go-llama.cpp"
+)
+
+type LLM struct {
+	base.SingleThread
+
+	llama      *llama.LLama
+	draftModel *llama.LLama
+}
+
+func (llm *LLM) Load(opts *pb.ModelOptions) error {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+
+	llamaOpts := []llama.ModelOption{
+		llama.WithRopeFreqBase(ropeFreqBase),
+		llama.WithRopeFreqScale(ropeFreqScale),
+	}
+
+	if opts.NoMulMatQ {
+		llamaOpts = append(llamaOpts, llama.SetMulMatQ(false))
+	}
+
+	// Get base path of opts.ModelFile and use the same for lora (assume the same path)
+	basePath := filepath.Dir(opts.ModelFile)
+
+	if opts.LoraAdapter != "" {
+		llamaOpts = append(llamaOpts, llama.SetLoraAdapter(filepath.Join(basePath, opts.LoraAdapter)))
+	}
+
+	if opts.LoraBase != "" {
+		llamaOpts = append(llamaOpts, llama.SetLoraBase(filepath.Join(basePath, opts.LoraBase)))
+	}
+
+	if opts.ContextSize != 0 {
+		llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
+	}
+	if opts.F16Memory {
+		llamaOpts = append(llamaOpts, llama.EnableF16Memory)
+	}
+	if opts.Embeddings {
+		llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
+	}
+	if opts.NGPULayers != 0 {
+		llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
+	}
+
+	llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
+	llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
+	llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
+	if opts.NBatch != 0 {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
+	} else {
+		llamaOpts = append(llamaOpts, llama.SetNBatch(512))
+	}
+
+	if opts.NUMA {
+		llamaOpts = append(llamaOpts, llama.EnableNUMA)
+	}
+
+	if opts.LowVRAM {
+		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
+	}
+
+	if opts.DraftModel != "" {
+		// https://github.com/ggerganov/llama.cpp/blob/71ca2fad7d6c0ef95ef9944fb3a1a843e481f314/examples/speculative/speculative.cpp#L40
+		llamaOpts = append(llamaOpts, llama.SetPerplexity(true))
+	}
+
+	model, err := llama.New(opts.ModelFile, llamaOpts...)
+
+	if opts.DraftModel != "" {
+		// opts.DraftModel is relative to opts.ModelFile, so we need to get the basepath of opts.ModelFile
+		if !filepath.IsAbs(opts.DraftModel) {
+			dir := filepath.Dir(opts.ModelFile)
+			opts.DraftModel = filepath.Join(dir, opts.DraftModel)
+		}
+
+		draftModel, err := llama.New(opts.DraftModel, llamaOpts...)
+		if err != nil {
+			return err
+		}
+		llm.draftModel = draftModel
+	}
+
+	llm.llama = model
+
+	return err
+}
+
+func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
+	ropeFreqBase := float32(10000)
+	ropeFreqScale := float32(1)
+
+	if opts.RopeFreqBase != 0 {
+		ropeFreqBase = opts.RopeFreqBase
+	}
+	if opts.RopeFreqScale != 0 {
+		ropeFreqScale = opts.RopeFreqScale
+	}
+	predictOptions := []llama.PredictOption{
+		llama.SetTemperature(opts.Temperature),
+		llama.SetTopP(opts.TopP),
+		llama.SetTopK(int(opts.TopK)),
+		llama.SetTokens(int(opts.Tokens)),
+		llama.SetThreads(int(opts.Threads)),
+		llama.WithGrammar(opts.Grammar),
+		llama.SetRopeFreqBase(ropeFreqBase),
+		llama.SetRopeFreqScale(ropeFreqScale),
+		llama.SetNegativePromptScale(opts.NegativePromptScale),
+		llama.SetNegativePrompt(opts.NegativePrompt),
+	}
+
+	if opts.PromptCacheAll {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
+	}
+
+	if opts.PromptCacheRO {
+		predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
+	}
+
+	// Expected absolute path
+	if opts.PromptCachePath != "" {
+		predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
+	}
+
+	if opts.Mirostat != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
+	}
+
+	if opts.MirostatETA != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
+	}
+
+	if opts.MirostatTAU != 0 {
+		predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
+	}
+
+	if opts.Debug {
+		predictOptions = append(predictOptions, llama.Debug)
+	}
+
+	predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
+
+	if opts.PresencePenalty != 0 {
+		predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
+	}
+
+	if opts.NKeep != 0 {
+		predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
+	}
+
+	if opts.Batch != 0 {
+		predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
+	}
+
+	if opts.F16KV {
+		predictOptions = append(predictOptions, llama.EnableF16KV)
+	}
+
+	if opts.IgnoreEOS {
+		predictOptions = append(predictOptions, llama.IgnoreEOS)
+	}
+
+	if opts.Seed != 0 {
+		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
+	}
+
+	if opts.NDraft != 0 {
+		predictOptions = append(predictOptions, llama.SetNDraft(int(opts.NDraft)))
+	}
+	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
+
+	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
+	predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
+	predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
+	predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
+	predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
+	predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
+	predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
+	return predictOptions
+}
+
+func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	if llm.draftModel != nil {
+		return llm.llama.SpeculativeSampling(llm.draftModel, opts.Prompt, buildPredictOptions(opts)...)
+	}
+	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
+}
+
+func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
+	predictOptions := buildPredictOptions(opts)
+
+	predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
+		results <- token
+		return true
+	}))
+
+	go func() {
+		var err error
+		if llm.draftModel != nil {
+			_, err = llm.llama.SpeculativeSampling(llm.draftModel, opts.Prompt, buildPredictOptions(opts)...)
+		} else {
+			_, err = llm.llama.Predict(opts.Prompt, predictOptions...)
+		}
+
+		if err != nil {
+			fmt.Println("err: ", err)
+		}
+		close(results)
+	}()
+
+	return nil
+}
+
+func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
+	predictOptions := buildPredictOptions(opts)
+
+	if len(opts.EmbeddingTokens) > 0 {
+		tokens := []int{}
+		for _, t := range opts.EmbeddingTokens {
+			tokens = append(tokens, int(t))
+		}
+		return llm.llama.TokenEmbeddings(tokens, predictOptions...)
+	}
+
+	return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
+}
+
+func (llm *LLM) TokenizeString(opts *pb.PredictOptions) (pb.TokenizationResponse, error) {
+	predictOptions := buildPredictOptions(opts)
+	l, tokens, err := llm.llama.TokenizeString(opts.Prompt, predictOptions...)
+	if err != nil {
+		return pb.TokenizationResponse{}, err
+	}
+	return pb.TokenizationResponse{
+		Length: l,
+		Tokens: tokens,
+	}, nil
+}
--- a/backend/go/llm/llama/main.go
+++ b/backend/go/llm/llama/main.go
@@ -0,0 +1,23 @@
+package main
+
+// GRPC Falcon server
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/llm/rwkv/main.go
+++ b/backend/go/llm/rwkv/main.go
@@ -0,0 +1,21 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/Show More
+++ b/Show More