fixup: create piper libdir also when not built

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Merge branch 'master' into enable_gpu
2026-07-06 22:37:03 -04:00 · 2023-11-12 22:17:11 +01:00 · 2023-11-11 19:20:36 +01:00 · 2023-11-11 18:40:48 +01:00 · 2023-11-11 18:40:26 +01:00 · 2023-11-11 13:14:59 +01:00
361 changed files with 52098 additions and 1537 deletions
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,3 +0,0 @@
-ARG GO_VERSION=1.20
-FROM mcr.microsoft.com/devcontainers/go:0-$GO_VERSION-bullseye
-RUN apt-get update && apt-get install -y cmake
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,46 +0,0 @@
-// For format details, see https://aka.ms/devcontainer.json. For config options, see the
-// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-docker-compose
-{
-	"name": "Existing Docker Compose (Extend)",
-
-	// Update the 'dockerComposeFile' list if you have more compose files or use different names.
-	// The .devcontainer/docker-compose.yml file contains any overrides you need/want to make.
-	"dockerComposeFile": [
-		"../docker-compose.yaml",
-		"docker-compose.yml"
-	],
-
-	// The 'service' property is the name of the service for the container that VS Code should
-	// use. Update this value and .devcontainer/docker-compose.yml to the real service name.
-	"service": "api",
-
-	// The optional 'workspaceFolder' property is the path VS Code should open by default when
-	// connected. This is typically a file mount in .devcontainer/docker-compose.yml
-	"workspaceFolder": "/workspace",
-
-	"features": {
-		"ghcr.io/devcontainers/features/go:1": {},
-		"ghcr.io/azutake/devcontainer-features/go-packages-install:0": {}
-	},
-
-	// Features to add to the dev container. More info: https://containers.dev/features.
-	// "features": {},
-
-	// Use 'forwardPorts' to make a list of ports inside the container available locally.
-	// "forwardPorts": [],
-
-	// Uncomment the next line if you want start specific services in your Docker Compose config.
-	// "runServices": [],
-
-	// Uncomment the next line if you want to keep your containers running after VS Code shuts down.
-	// "shutdownAction": "none",
-
-	// Uncomment the next line to run commands after the container is created.
-	"postCreateCommand": "make prepare"
-
-	// Configure tool-specific properties.
-	// "customizations": {},
-
-	// Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
-	// "remoteUser": "devcontainer"
-}
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@@ -1,26 +0,0 @@
-version: '3.6'
-services:
-  # Update this to the name of the service you want to work with in your docker-compose.yml file
-  api:
-    # Uncomment if you want to override the service's Dockerfile to one in the .devcontainer 
-    # folder. Note that the path of the Dockerfile and context is relative to the *primary* 
-    # docker-compose.yml file (the first in the devcontainer.json "dockerComposeFile"
-    # array). The sample below assumes your primary file is in the root of your project.
-    #
-    build:
-      context: .
-      dockerfile: .devcontainer/Dockerfile
-
-    volumes:
-      # Update this to wherever you want VS Code to mount the folder of your project
-      - .:/workspace:cached
-
-    # Uncomment the next four lines if you will use a ptrace-based debugger like C++, Go, and Rust.
-    # cap_add:
-    #   - SYS_PTRACE
-    # security_opt:
-    #   - seccomp:unconfined
-
-    # Overrides default command so things don't shut down after the process ends.
-    command: /bin/sh -c "while sleep 1000; do :; done"
- 
--- a/.dockerignore
+++ b/.dockerignore
@@ -1 +1,5 @@
+.idea
 models
+examples/chatbot-ui/models
+examples/rwkv/models
+examples/**/models
--- a/.env
+++ b/.env
@@ -1,5 +1,72 @@
+## Set number of threads.
+## Note: prefer the number of physical cores. Overbooking the CPU degrades performance notably.
 # THREADS=14
+
+## Specify a different bind address (defaults to ":8080")
+# ADDRESS=127.0.0.1:8080
+
+## Default models context size
 # CONTEXT_SIZE=512
+#
+## Define galleries.
+## Models available to install will be visible in `/models/available`
+# GALLERIES=[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}]
+
+## CORS settings
+# CORS=true
+# CORS_ALLOW_ORIGINS=*
+
+## Default path for models
+#
 MODELS_PATH=/models
+
+## Enable debug mode
 # DEBUG=true
-# BUILD_TYPE=generic
+
+## Disables COMPEL (Diffusers)
+# COMPEL=0
+
+## Enable/Disable single backend (useful if only one GPU is available)
+# SINGLE_ACTIVE_BACKEND=true
+
+## Specify a build type. Available: cublas, openblas, clblas.
+## cuBLAS: This is a GPU-accelerated version of the complete standard BLAS (Basic Linear Algebra Subprograms) library. It's provided by Nvidia and is part of their CUDA toolkit.
+## OpenBLAS: This is an open-source implementation of the BLAS library that aims to provide highly optimized code for various platforms. It includes support for multi-threading and can be compiled to use hardware-specific features for additional performance. OpenBLAS can run on many kinds of hardware, including CPUs from Intel, AMD, and ARM.
+## clBLAS:   This is an open-source implementation of the BLAS library that uses OpenCL, a framework for writing programs that execute across heterogeneous platforms consisting of CPUs, GPUs, and other processors. clBLAS is designed to take advantage of the parallel computing power of GPUs but can also run on any hardware that supports OpenCL. This includes hardware from different vendors like Nvidia, AMD, and Intel.
+# BUILD_TYPE=openblas
+
+## Uncomment and set to true to enable rebuilding from source
+# REBUILD=true
+
+## Enable go tags, available: stablediffusion, tts
+## stablediffusion: image generation with stablediffusion
+## tts: enables text-to-speech with go-piper 
+## (requires REBUILD=true)
+#
+# GO_TAGS=stablediffusion
+
+## Path where to store generated images
+# IMAGE_PATH=/tmp
+
+## Specify a default upload limit in MB (whisper)
+# UPLOAD_LIMIT
+
+## List of external GRPC backends (note on the container image this variable is already set to use extra backends available in extra/)
+# EXTERNAL_GRPC_BACKENDS=my-backend:127.0.0.1:9000,my-backend2:/usr/bin/backend.py
+
+### Advanced settings ###
+### These are not used by LocalAI itself, but by other components in the stack ###
+##
+### Preload libraries
+# LD_PRELOAD=
+
+### Huggingface cache for models
+# HUGGINGFACE_HUB_CACHE=/usr/local/huggingface
+
+### Python backends GRPC max workers
+### Default number of workers for GRPC Python backends.
+### This actually controls whether a backend can process multiple requests or not.
+# PYTHON_GRPC_MAX_WORKERS=1
+
+### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
+# LLAMACPP_PARALLEL=1
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.sh text eol=lf
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -0,0 +1,5 @@
+# These are supported funding model platforms
+
+github: [mudler]
+custom: 
+- https://www.buymeacoffee.com/mudler
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,31 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug
+assignees: mudler
+
+---
+
+<!-- Thanks for helping us to improve LocalAI! We welcome all bug reports. Please fill out each area of the template so we can better help you. Comments like this will be hidden when you post but you can delete them if you wish. -->
+
+**LocalAI version:**
+<!-- Container Image or LocalAI tag/commit -->
+
+**Environment, CPU architecture, OS, and Version:**
+<!-- Provide the output from "uname -a", HW specs, if it's a VM  -->
+
+**Describe the bug**
+<!-- A clear and concise description of what the bug is. -->
+
+**To Reproduce**
+<!-- Steps to reproduce the behavior, including the LocalAI command used, if any -->
+
+**Expected behavior**
+<!-- A clear and concise description of what you expected to happen. -->
+
+**Logs**
+<!-- If applicable, add logs while running LocalAI in debug mode (`--debug` or `DEBUG=true`) to help explain your problem.  -->
+
+**Additional context**
+<!-- Add any other context about the problem here. -->
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,8 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Community Support
+    url: https://github.com/go-skynet/LocalAI/discussions
+    about: Please ask and answer questions here.
+  - name: Discord
+    url: https://discord.gg/uJAeKSAGDy
+    about: Join our community on Discord!
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,22 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: enhancement
+assignees: mudler
+
+---
+
+<!-- Thanks for helping us to improve LocalAI! We welcome all feature requests. Please fill out each area of the template so we can better help you. Comments like this will be hidden when you post but you can delete them if you wish. -->
+
+**Is your feature request related to a problem? Please describe.**
+<!-- A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]  -->
+
+**Describe the solution you'd like**
+<!-- A clear and concise description of what you want to happen.  -->
+
+**Describe alternatives you've considered**
+<!-- A clear and concise description of any alternative solutions or features you've considered.  -->
+
+**Additional context**
+<!-- Add any other context or screenshots about the feature request here. -->
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,31 @@
+**Description**
+
+This PR fixes #
+
+**Notes for Reviewers**
+
+
+**[Signed commits](../CONTRIBUTING.md#signing-off-on-commits-developer-certificate-of-origin)**
+- [ ] Yes, I signed my commits.
+ 
+<!--
+Thank you for contributing to LocalAI! 
+
+Contributing Conventions
+-------------------------
+
+The draft above helps to give a quick overview of your PR.
+
+Remember to remove this comment and to at least:
+
+1. Include descriptive PR titles with [<component-name>] prepended. We use [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/).
+2. Build and test your changes before submitting a PR (`make build`). 
+3. Sign your commits
+4. **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below).
+5. **X/Twitter handle:** we announce bigger features on X/Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out!
+
+By following the community's contribution conventions upfront, the review process will 
+be accelerated and your PR merged more quickly.
+
+If no one reviews your PR within a few days, please @-mention @mudler.
+-->
--- a/.github/bump_deps.sh
+++ b/.github/bump_deps.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -xe
+REPO=$1
+BRANCH=$2
+VAR=$3
+# Resolve the head commit SHA of $BRANCH; -f makes curl exit non-zero on HTTP errors so `set -e` aborts before the Makefile is touched.
+LAST_COMMIT=$(curl -sf -H "Accept: application/vnd.github.VERSION.sha" "https://api.github.com/repos/$REPO/commits/$BRANCH")
+# Pin the matching `$VAR?=` assignment in the Makefile to that commit.
+sed -i Makefile -e "s/$VAR?=.*/$VAR?=$LAST_COMMIT/"
--- a/.github/release.yml
+++ b/.github/release.yml
@@ -0,0 +1,24 @@
+# .github/release.yml
+
+changelog:
+  exclude:
+    labels:
+      - ignore-for-release
+  categories:
+    - title: Breaking Changes 🛠
+      labels:
+        - Semver-Major
+        - breaking-change
+    - title: "Bug fixes :bug:"
+      labels:
+        - bug
+    - title: Exciting New Features 🎉
+      labels:
+        - Semver-Minor
+        - enhancement
+    - title: 👒 Dependencies
+      labels:
+        - dependencies
+    - title: Other Changes
+      labels:
+        - "*"
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -0,0 +1,18 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 45
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 10
+# Issues with these labels will never be considered stale
+exemptLabels:
+  - issue/willfix
+# Label to use when marking an issue as stale
+staleLabel: issue/stale
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Thank you
+  for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: >
+  This issue is being automatically closed due to inactivity.
+  However, you may choose to reopen this issue.
--- a/.github/workflows/bump_deps.yaml
+++ b/.github/workflows/bump_deps.yaml
@@ -0,0 +1,63 @@
+name: Bump dependencies
+on:
+  schedule:
+    - cron: 0 20 * * *
+  workflow_dispatch:
+jobs:
+  bump:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - repository: "go-skynet/go-llama.cpp"
+            variable: "GOLLAMA_VERSION"
+            branch: "master"
+          - repository: "ggerganov/llama.cpp"
+            variable: "CPPLLAMA_VERSION"
+            branch: "master"
+          - repository: "go-skynet/go-ggml-transformers.cpp"
+            variable: "GOGGMLTRANSFORMERS_VERSION"
+            branch: "master"
+          - repository: "donomii/go-rwkv.cpp"
+            variable: "RWKV_VERSION"
+            branch: "main"
+          - repository: "ggerganov/whisper.cpp"
+            variable: "WHISPER_CPP_VERSION"
+            branch: "master"
+          - repository: "go-skynet/go-bert.cpp"
+            variable: "BERT_VERSION"
+            branch: "master"
+          - repository: "go-skynet/bloomz.cpp"
+            variable: "BLOOMZ_VERSION"
+            branch: "main"
+          - repository: "nomic-ai/gpt4all"
+            variable: "GPT4ALL_VERSION"
+            branch: "main"
+          - repository: "mudler/go-ggllm.cpp"
+            variable: "GOGGLLM_VERSION"
+            branch: "master"
+          - repository: "mudler/go-stable-diffusion"
+            variable: "STABLEDIFFUSION_VERSION"
+            branch: "master"
+          - repository: "mudler/go-piper"
+            variable: "PIPER_VERSION"
+            branch: "master"
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Bump dependencies 🔧
+        run: |
+          bash .github/bump_deps.sh ${{ matrix.repository }} ${{ matrix.branch }} ${{ matrix.variable }}
+      - name: Create Pull Request
+        uses: peter-evans/create-pull-request@v5
+        with:
+          token: ${{ secrets.UPDATE_BOT_TOKEN }}
+          push-to-fork: ci-forks/LocalAI
+          commit-message: ':arrow_up: Update ${{ matrix.repository }}'
+          title: ':arrow_up: Update ${{ matrix.repository }}'
+          branch: "update/${{ matrix.variable }}"
+          body: Bump of ${{ matrix.repository }} version
+          signoff: true
+
+
+
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -9,36 +9,115 @@ on:
    tags:
      - '*'

+concurrency:
+  group: ci-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
 jobs:
-  docker:
-    runs-on: ubuntu-latest
+  image-build:
+    strategy:
+      matrix:
+        include:
+          - build-type: ''
+            #platforms: 'linux/amd64,linux/arm64'
+            platforms: 'linux/amd64'
+            tag-latest: 'auto'
+            tag-suffix: ''
+            ffmpeg: ''
+          - build-type: ''
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-ffmpeg'
+            ffmpeg: 'true'
+          - build-type: 'cublas'
+            cuda-major-version: 11
+            cuda-minor-version: 7
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda11'
+            ffmpeg: ''
+          - build-type: 'cublas'
+            cuda-major-version: 12
+            cuda-minor-version: 1
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12'
+            ffmpeg: ''
+          - build-type: 'cublas'
+            cuda-major-version: 11
+            cuda-minor-version: 7
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda11-ffmpeg'
+            ffmpeg: 'true'
+          - build-type: 'cublas'
+            cuda-major-version: 12
+            cuda-minor-version: 1
+            platforms: 'linux/amd64'
+            tag-latest: 'false'
+            tag-suffix: '-cublas-cuda12-ffmpeg'
+            ffmpeg: 'true'
+
+    runs-on: arc-runner-set 
    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-
-      - name: Prepare
-        id: prep
+      - name: Force Install GIT latest
        run: |
-          DOCKER_IMAGE=quay.io/go-skynet/local-ai
-          VERSION=master
-          SHORTREF=${GITHUB_SHA::8}
-
-          # If this is git tag, use the tag name as a docker tag
-          if [[ $GITHUB_REF == refs/tags/* ]]; then
-            VERSION=${GITHUB_REF#refs/tags/}
-          fi
-          TAGS="${DOCKER_IMAGE}:${VERSION},${DOCKER_IMAGE}:${SHORTREF}"
-
-          # If the VERSION looks like a version number, assume that
-          # this is the most recent version of the image and also
-          # tag it 'latest'.
-          if [[ $VERSION =~ ^v[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
-            TAGS="$TAGS,${DOCKER_IMAGE}:latest"
-          fi
-
-          # Set output parameters.
-          echo ::set-output name=tags::${TAGS}
-          echo ::set-output name=docker_image::${DOCKER_IMAGE}
+          sudo apt-get update \
+          && sudo apt-get install -y software-properties-common \
+          && sudo apt-get update \
+          && sudo add-apt-repository -y ppa:git-core/ppa \
+          && sudo apt-get update \
+          && sudo apt-get install -y git
+      - name: Checkout
+        uses: actions/checkout@v4
+      # - name: Release space from worker
+      #   run: |
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     df -h
+      #     echo
+      #     sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+      #     sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+      #     sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+      #     sudo rm -rf /usr/local/lib/android
+      #     sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+      #     sudo rm -rf /usr/share/dotnet
+      #     sudo apt-get remove -y '^mono-.*' || true
+      #     sudo apt-get remove -y '^ghc-.*' || true
+      #     sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+      #     sudo apt-get remove -y 'php.*' || true
+      #     sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+      #     sudo apt-get remove -y '^google-.*' || true
+      #     sudo apt-get remove -y azure-cli || true
+      #     sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+      #     sudo apt-get remove -y '^gfortran-.*' || true
+      #     sudo apt-get remove -y microsoft-edge-stable || true
+      #     sudo apt-get remove -y firefox || true
+      #     sudo apt-get remove -y powershell || true
+      #     sudo apt-get remove -y r-base-core || true
+      #     sudo apt-get autoremove -y
+      #     sudo apt-get clean
+      #     echo
+      #     echo "Listing top largest packages"
+      #     pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+      #     head -n 30 <<< "${pkgs}"
+      #     echo
+      #     sudo rm -rfv build || true
+      #     df -h
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: quay.io/go-skynet/local-ai
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{raw}}
+            type=sha
+          flavor: |
+            latest=${{ matrix.tag-latest }}
+            suffix=${{ matrix.tag-suffix }}

      - name: Set up QEMU
        uses: docker/setup-qemu-action@master
@@ -51,28 +130,24 @@ jobs:

      - name: Login to DockerHub
        if: github.event_name != 'pull_request'
-        uses: docker/login-action@v2
+        uses: docker/login-action@v3
        with:
          registry: quay.io
-          username: ${{ secrets.QUAY_USERNAME }}
-          password: ${{ secrets.QUAY_PASSWORD }}
-      - name: Build
-        if: github.event_name != 'pull_request'
-        uses: docker/build-push-action@v4
+          username: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
+          password: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v5
        with:
          builder: ${{ steps.buildx.outputs.name }}
+          build-args: |
+            BUILD_TYPE=${{ matrix.build-type }}
+            CUDA_MAJOR_VERSION=${{ matrix.cuda-major-version }}
+            CUDA_MINOR_VERSION=${{ matrix.cuda-minor-version }}
+            FFMPEG=${{ matrix.ffmpeg }}
          context: .
          file: ./Dockerfile
-          platforms: linux/amd64,linux/arm64
-          push: true
-          tags: ${{ steps.prep.outputs.tags }}
-      - name: Build PRs
-        if: github.event_name == 'pull_request'
-        uses: docker/build-push-action@v4
-        with:
-          builder: ${{ steps.buildx.outputs.name }}
-          context: .
-          file: ./Dockerfile
-          platforms: linux/amd64
-          push: false
-          tags: ${{ steps.prep.outputs.tags }}
+          platforms: ${{ matrix.platforms }}
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -0,0 +1,99 @@
+name: Build and Release
+
+on: push
+
+permissions:
+  contents: write
+
+jobs:
+  build-linux:
+    strategy:
+      matrix:
+        include:
+          - build: 'avx2'
+            defines: ''
+          - build: 'avx'
+            defines: '-DLLAMA_AVX2=OFF'
+          - build: 'avx512'
+            defines: '-DLLAMA_AVX512=ON'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential ffmpeg
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install
+
+      - name: Build
+        id: build
+        env:
+          CMAKE_ARGS: "${{ matrix.defines }}"
+          BUILD_ID: "${{ matrix.build }}"
+        run: |
+          STATIC=true make dist
+      - uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.build }}
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v1
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
+
+  build-macOS:
+    strategy:
+      matrix:
+        include:
+          - build: 'avx2'
+            defines: ''
+          - build: 'avx'
+            defines: '-DLLAMA_AVX2=OFF'
+          - build: 'avx512'
+            defines: '-DLLAMA_AVX512=ON'
+    runs-on: macOS-latest
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+      - uses: actions/setup-go@v4
+        with:
+          go-version: '>=1.21.0'
+      - name: Dependencies
+        run: |
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && cd ../../.. && rm -rf grpc # go back to the checkout root first: from cmake/build the relative path "grpc" does not exist, so the cleanup was a no-op
+      - name: Build
+        id: build
+        env:
+          CMAKE_ARGS: "${{ matrix.defines }}"
+          BUILD_ID: "${{ matrix.build }}"
+        run: |
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
+          make dist
+      - uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.build }}
+          path: release/
+      - name: Release
+        uses: softprops/action-gh-release@v1
+        if: startsWith(github.ref, 'refs/tags/')
+        with:
+          files: |
+            release/*
--- a/.github/workflows/release.yml.disabled
+++ b/.github/workflows/release.yml.disabled
@@ -1,26 +0,0 @@
-name: goreleaser
-
-on:
-  push:
-    tags:
-      - 'v*'
-
-jobs:
-  goreleaser:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-      - name: Set up Go
-        uses: actions/setup-go@v3
-        with:
-          go-version: 1.18
-      - name: Run GoReleaser
-        uses: goreleaser/goreleaser-action@v4
-        with:
-          version: latest
-          args: release --clean
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/test-gpu.yml
+++ b/.github/workflows/test-gpu.yml
@@ -0,0 +1,63 @@
+---
+name: 'GPU tests'
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+    tags:
+      - '*'
+
+concurrency:
+  group: ci-gpu-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true
+
+jobs:
+  ubuntu-latest:
+    runs-on: gpu
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+        with: 
+          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo DEBIAN_FRONTEND=noninteractive apt-get install -y make wget
+      - name: Build
+        run: |
+          if [ ! -e /run/systemd/system ]; then
+            sudo mkdir /run/systemd/system
+          fi
+          sudo mkdir -p /host/tests/${{ github.head_ref || github.ref }}
+          sudo chmod -R 777 /host/tests/${{ github.head_ref || github.ref }}
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            BUILD_TYPE=cublas \
+            prepare-e2e run-e2e-image test-e2e
+      - name: Release space from worker ♻
+        if: always()
+        run: |
+          sudo rm -rf build || true
+          sudo rm -rf bin || true
+          sudo rm -rf dist || true
+          sudo docker logs $(sudo docker ps -q --filter ancestor=localai-tests) > logs.txt || true # best-effort: with no matching container `docker logs` fails, and must not abort the cleanup below
+          sudo cat logs.txt || true
+          sudo rm -rf logs.txt
+          make clean || true
+          make \
+            TEST_DIR="/host/tests/${{ github.head_ref || github.ref }}" \
+            teardown-e2e || true
+          sudo rm -rf /host/tests/${{ github.head_ref || github.ref }} || true
+          docker system prune -f -a --volumes || true
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -9,36 +9,118 @@ on:
    tags:
      - '*'

-jobs:
-  ubuntu-latest:
-    runs-on: ubuntu-latest
+concurrency:
+  group: ci-tests-${{ github.head_ref || github.ref }}-${{ github.repository }}
+  cancel-in-progress: true

+jobs:
+  tests-linux:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
    steps:
+      - name: Release space from worker
+        run: |
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          df -h
+          echo
+          sudo apt-get remove -y '^llvm-.*|^libllvm.*' || true
+          sudo apt-get remove --auto-remove android-sdk-platform-tools || true
+          sudo apt-get purge --auto-remove android-sdk-platform-tools || true
+          sudo rm -rf /usr/local/lib/android
+          sudo apt-get remove -y '^dotnet-.*|^aspnetcore-.*' || true
+          sudo rm -rf /usr/share/dotnet
+          sudo apt-get remove -y '^mono-.*' || true
+          sudo apt-get remove -y '^ghc-.*' || true
+          sudo apt-get remove -y '.*jdk.*|.*jre.*' || true
+          sudo apt-get remove -y 'php.*' || true
+          sudo apt-get remove -y hhvm powershell firefox monodoc-manual msbuild || true
+          sudo apt-get remove -y '^google-.*' || true
+          sudo apt-get remove -y azure-cli || true
+          sudo apt-get remove -y '^mongo.*-.*|^postgresql-.*|^mysql-.*|^mssql-.*' || true
+          sudo apt-get remove -y '^gfortran-.*' || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          echo
+          echo "Listing top largest packages"
+          pkgs=$(dpkg-query -Wf '${Installed-Size}\t${Package}\t${Status}\n' | awk '$NF == "installed"{print $1 "\t" $2}' | sort -nr)
+          head -n 30 <<< "${pkgs}"
+          echo
+          sudo rm -rfv build || true
+          df -h
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential ffmpeg
+          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+              gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+             echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | \
+             sudo tee /etc/apt/sources.list.d/conda.list && \
+             sudo apt-get update && \
+             sudo apt-get install -y conda # the repo entry above is written exactly once (tee without -a); writing it and then appending it again duplicated the apt source
+          sudo apt-get install -y ca-certificates cmake curl patch
+          sudo apt-get install -y libopencv-dev && sudo ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2
+          
+          sudo rm -rfv /usr/bin/conda || true
+          PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface
+
+          # Pre-build piper before we start tests in order to have shared libraries in place
+          make go-piper && \
+          GO_TAGS="tts" make -C go-piper piper.o && \
+          sudo cp -rfv go-piper/piper/build/pi/lib/. /usr/lib/ && \
+
+          # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn)
+          GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && sudo make -j12 install
      - name: Test
        run: |
-          make test
+          GO_TAGS="stablediffusion tts" make test

-  macOS-latest:
+  tests-apple:
    runs-on: macOS-latest
-
+    strategy:
+      matrix:
+        go-version: ['1.21.x']
    steps:
      - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
        with: 
          submodules: true
-
+      - name: Setup Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v4
+        with:
+          go-version: ${{ matrix.go-version }}
+      # You can test your matrix by printing the current Go version
+      - name: Display Go version
+        run: go version
      - name: Dependencies
        run: |
-          brew update
-          brew install sdl2
+          git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+              cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+                -DgRPC_BUILD_TESTS=OFF \
+                ../.. && make -j12 install && cd ../../.. && rm -rf grpc # go back to the checkout root first: from cmake/build the relative path "grpc" does not exist, so the cleanup was a no-op
      - name: Test
        run: |
-          make test
+          export C_INCLUDE_PATH=/usr/local/include
+          export CPLUS_INCLUDE_PATH=/usr/local/include
+          CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make test
--- a/.gitignore
+++ b/.gitignore
@@ -1,15 +1,45 @@
 # go-llama build artifacts
 go-llama
-go-gpt4all-j
+go-llama-stable
+/gpt4all
+go-stable-diffusion
+go-piper
+/go-bert
+go-ggllm
+/piper
+__pycache__/
+*.a
+get-sources
+/backend/cpp/llama/grpc-server
+/backend/cpp/llama/llama.cpp
+
+go-ggml-transformers
 go-gpt2
+go-rwkv
+whisper.cpp
+/bloomz
+go-bert

 # LocalAI build binary
 LocalAI
 local-ai
 # prevent above rules from omitting the helm chart
 !charts/*
+# prevent above rules from omitting the api/localai folder
+!api/localai

 # Ignore models
-models/*.bin
-models/ggml-*
-test-models/
+models/*
+test-models/
+test-dir/
+
+release/
+
+# just in case
+.DS_Store
+.idea
+
+# Generated during build
+backend-assets/
+prepare
+/ggml-metal.metal
--- a/.goreleaser.yaml
+++ b/.goreleaser.yaml
@@ -1,15 +0,0 @@
-# Make sure to check the documentation at http://goreleaser.com
-project_name: local-ai
-builds:
-  - ldflags:
-      - -w -s
-    env:
-      - CGO_ENABLED=0
-    goos:
-      - linux
-      - darwin
-      - windows
-    goarch:
-      - amd64
-      - arm64
-    binary: '{{ .ProjectName }}'
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -2,7 +2,20 @@
    "version": "0.2.0",
    "configurations": [
        {
-            "name": "Launch Go",
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "cwd": "${workspaceFolder}/examples/langchain-chroma",
+            "env": {
+                "OPENAI_API_BASE": "http://localhost:8080/v1",
+                "OPENAI_API_KEY": "abc"
+            }
+        },
+        {
+            "name": "Launch LocalAI API",
            "type": "go",
            "request": "launch",
            "mode": "debug",
@@ -11,8 +24,8 @@
                "api"
            ],
            "env": {
-                "C_INCLUDE_PATH": "/workspace/go-llama:/workspace/go-gpt4all-j:/workspace/go-gpt2",
-                "LIBRARY_PATH": "/workspace/go-llama:/workspace/go-gpt4all-j:/workspace/go-gpt2",
+                "C_INCLUDE_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
+                "LIBRARY_PATH": "${workspaceFolder}/go-llama:${workspaceFolder}/go-stable-diffusion/:${workspaceFolder}/gpt4all/gpt4all-bindings/golang/:${workspaceFolder}/go-gpt2:${workspaceFolder}/go-rwkv:${workspaceFolder}/whisper.cpp:${workspaceFolder}/go-bert:${workspaceFolder}/bloomz",
                "DEBUG": "true"
            }
        }
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -0,0 +1,72 @@
+# Contributing to LocalAI
+
+Thank you for your interest in contributing to LocalAI! We appreciate your time and effort in helping to improve our project. Before you get started, please take a moment to review these guidelines.
+
+## Table of Contents
+
+- [Getting Started](#getting-started)
+  - [Prerequisites](#prerequisites)
+  - [Setting up the Development Environment](#setting-up-the-development-environment-and-running-localai-in-the-local-environment)
+- [Contributing](#contributing)
+  - [Submitting an Issue](#submitting-an-issue)
+  - [Creating a Pull Request (PR)](#creating-a-pull-request-pr)
+- [Coding Guidelines](#coding-guidelines)
+- [Testing](#testing)
+- [Documentation](#documentation)
+- [Community and Communication](#community-and-communication)
+
+
+
+## Getting Started
+
+### Prerequisites
+
+- Golang [1.21]
+- Git
+- macOS/Linux
+
+### Setting up the Development Environment and running localAI in the local environment
+
+1. Clone the repository: `git clone https://github.com/go-skynet/LocalAI.git`
+2. Navigate to the project directory: `cd LocalAI`
+3. Install the required dependencies: `make prepare`
+4. Run LocalAI: `make run`
+
+## Contributing
+
+We welcome contributions from everyone! To get started, follow these steps:
+
+### Submitting an Issue
+
+If you find a bug, have a feature request, or encounter any issues, please check the [issue tracker](https://github.com/go-skynet/LocalAI/issues) to see if a similar issue has already been reported. If not, feel free to [create a new issue](https://github.com/go-skynet/LocalAI/issues/new) and provide as much detail as possible.
+
+### Creating a Pull Request (PR)
+
+1. Fork the repository.
+2. Create a new branch with a descriptive name: `git checkout -b [branch name]`
+3. Make your changes and commit them.
+4. Push the changes to your fork: `git push origin [branch name]`
+5. Create a new pull request from your branch to the main project's `main` or `master` branch.
+6. Provide a clear description of your changes in the pull request.
+7. Make any requested changes during the review process.
+8. Once your PR is approved, it will be merged into the main project.
+
+## Coding Guidelines
+
+- No specific coding guidelines at the moment. Please make sure the code can be tested. Popular lint tools such as [`golangci-lint`](https://golangci-lint.run) can help you here.
+
+## Testing
+
+`make test` does not cover every model yet. Please be sure to add a test case for any new feature or for the parts you changed.
+
+## Documentation
+
+- Contributions to the documentation are welcome. Please open a new PR in the official documentation repo [localai-website](https://github.com/go-skynet/localai-website).
+
+## Community and Communication
+
+- You can reach out via the GitHub issue tracker.
+- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
+- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)
+
+---
--- a/209
+++ b/209
@@ -1,13 +1,206 @@
-ARG GO_VERSION=1.20
-ARG DEBIAN_VERSION=11
-ARG BUILD_TYPE=
+ARG GO_VERSION=1.21-bullseye
+ARG IMAGE_TYPE=extras
+# extras or core
+
+
+FROM golang:$GO_VERSION as requirements-core
+
+ARG BUILD_TYPE
+ARG CUDA_MAJOR_VERSION=11
+ARG CUDA_MINOR_VERSION=7
+ARG TARGETARCH
+ARG TARGETVARIANT
+
+ENV BUILD_TYPE=${BUILD_TYPE}
+ENV EXTERNAL_GRPC_BACKENDS="huggingface-embeddings:/build/extra/grpc/huggingface/run.sh,autogptq:/build/extra/grpc/autogptq/run.sh,bark:/build/extra/grpc/bark/run.sh,diffusers:/build/extra/grpc/diffusers/run.sh,exllama:/build/extra/grpc/exllama/run.sh,vall-e-x:/build/extra/grpc/vall-e-x/run.sh,vllm:/build/extra/grpc/vllm/run.sh"
+ENV GALLERIES='[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'
+ARG GO_TAGS="stablediffusion tts"
+
+RUN apt-get update && \
+    apt-get install -y ca-certificates curl patch pip cmake && apt-get clean
+
+
+COPY --chmod=644 custom-ca-certs/* /usr/local/share/ca-certificates/
+RUN update-ca-certificates
+
+# Use the variables in subsequent instructions
+RUN echo "Target Architecture: $TARGETARCH"
+RUN echo "Target Variant: $TARGETVARIANT"
+
+# CuBLAS requirements
+RUN if [ "${BUILD_TYPE}" = "cublas" ]; then \
+    apt-get install -y software-properties-common && \
+    apt-add-repository contrib && \
+    curl -O https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    rm -f cuda-keyring_1.0-1_all.deb && \
+    apt-get update && \
+    apt-get install -y cuda-nvcc-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcublas-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusparse-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION} libcusolver-dev-${CUDA_MAJOR_VERSION}-${CUDA_MINOR_VERSION}  && apt-get clean \
+    ; fi
+ENV PATH /usr/local/cuda/bin:${PATH}
+
+# OpenBLAS requirements and stable diffusion
+RUN apt-get install -y \
+    libopenblas-dev \
+    libopencv-dev \ 
+    && apt-get clean
+
+# Set up OpenCV
+RUN ln -s /usr/include/opencv4/opencv2 /usr/include/opencv2

-FROM golang:$GO_VERSION as builder
 WORKDIR /build
-RUN apt-get update && apt-get install -y cmake
+
+RUN test -n "$TARGETARCH" \
+    || (echo 'warn: missing $TARGETARCH, either set this `ARG` manually, or run using `docker buildkit`')
+
+# Extras requirements
+FROM requirements-core as requirements-extras
+
+RUN curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
+    install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
+    gpg --keyring /usr/share/keyrings/conda-archive-keyring.gpg --no-default-keyring --fingerprint 34161F5BF5EB1D4BFBBB8F0A8AEB4F8B29D82806 && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" > /etc/apt/sources.list.d/conda.list && \
+    echo "deb [arch=amd64 signed-by=/usr/share/keyrings/conda-archive-keyring.gpg] https://repo.anaconda.com/pkgs/misc/debrepo/conda stable main" | tee -a /etc/apt/sources.list.d/conda.list && \
+    apt-get update && \
+    apt-get install -y conda
+
+COPY extra/requirements.txt /build/extra/requirements.txt
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN pip install --upgrade pip
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+#RUN if [ "${TARGETARCH}" = "amd64" ]; then \
+#        pip install git+https://github.com/suno-ai/bark.git diffusers invisible_watermark transformers accelerate safetensors;\
+#    fi
+#RUN if [ "${BUILD_TYPE}" = "cublas" ] && [ "${TARGETARCH}" = "amd64" ]; then \
+#        pip install torch vllm && pip install auto-gptq https://github.com/jllllll/exllama/releases/download/0.0.10/exllama-0.0.10+cu${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}-cp39-cp39-linux_x86_64.whl;\
+ #   fi
+#RUN pip install -r /build/extra/requirements.txt && rm -rf /build/extra/requirements.txt
+
+# Vall-e-X
+RUN git clone https://github.com/Plachtaa/VALL-E-X.git /usr/lib/vall-e-x && cd /usr/lib/vall-e-x && pip install -r requirements.txt
+
+# \
+#    ; fi
+
+###################################
+###################################
+
+FROM requirements-${IMAGE_TYPE} as builder
+
+ARG GO_TAGS="stablediffusion tts"
+ARG GRPC_BACKENDS
+ARG BUILD_GRPC=true
+ENV GRPC_BACKENDS=${GRPC_BACKENDS}
+ENV GO_TAGS=${GO_TAGS}
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
+ENV NVIDIA_VISIBLE_DEVICES=all
+
+WORKDIR /build
+
+COPY Makefile .
+RUN make get-sources
+COPY go.mod .
+RUN make prepare
 COPY . .
+COPY .git .
+
+# stablediffusion does not tolerate a newer version of abseil, build it first
+RUN GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build
+
+RUN if [ "${BUILD_GRPC}" = "true" ]; then \
+    git clone --recurse-submodules -b v1.58.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc && \
+    cd grpc && mkdir -p cmake/build && cd cmake/build && cmake -DgRPC_INSTALL=ON \
+      -DgRPC_BUILD_TESTS=OFF \
+       ../.. && make -j12 install && cd ../../.. && rm -rf grpc \
+    ; fi
+
+# Rebuild with the default backends
 RUN make build

-FROM debian:$DEBIAN_VERSION
-COPY --from=builder /build/local-ai /usr/bin/local-ai
-ENTRYPOINT [ "/usr/bin/local-ai" ]
+RUN if [ ! -d "/build/go-piper/piper/build/pi/lib/" ]; then \
+    mkdir -p /build/go-piper/piper/build/pi/lib/ && \
+    touch /build/go-piper/piper/build/pi/lib/keep \
+    ; fi
+
+###################################
+###################################
+
+FROM requirements-${IMAGE_TYPE}
+
+ARG FFMPEG
+ARG BUILD_TYPE
+ARG TARGETARCH
+ARG IMAGE_TYPE=extras
+
+ENV BUILD_TYPE=${BUILD_TYPE}
+ENV REBUILD=false
+ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
+
+ARG CUDA_MAJOR_VERSION=11
+ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
+ENV NVIDIA_REQUIRE_CUDA="cuda>=${CUDA_MAJOR_VERSION}.0"
+ENV NVIDIA_VISIBLE_DEVICES=all
+
+# Add FFmpeg
+RUN if [ "${FFMPEG}" = "true" ]; then \
+    apt-get install -y ffmpeg \
+    ; fi
+
+WORKDIR /build
+
+# we start fresh & re-copy all assets because `make build` does not clean up nicely after itself
+# so when `entrypoint.sh` runs `make build` again (which it does by default), the build would fail
+# see https://github.com/go-skynet/LocalAI/pull/658#discussion_r1241971626 and
+# https://github.com/go-skynet/LocalAI/pull/434
+COPY . .
+RUN make prepare-sources
+
+# Copy the binary
+COPY --from=builder /build/local-ai ./
+
+# Copy shared libraries for piper
+COPY --from=builder /build/go-piper/piper/build/pi/lib/* /usr/lib/
+
+# do not let stablediffusion rebuild (requires an older version of absl)
+COPY --from=builder /build/backend-assets/grpc/stablediffusion ./backend-assets/grpc/stablediffusion
+
+## Duplicated from Makefile to avoid having a big layer that's hard to push
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/autogptq \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/bark \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/diffusers \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/vllm \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/huggingface \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/vall-e-x \
+    ; fi
+RUN if [ "${IMAGE_TYPE}" = "extras" ]; then \
+	PATH=$PATH:/opt/conda/bin make -C extra/grpc/exllama \
+    ; fi
+
+# Copy VALLE-X as it's not a real "lib"
+RUN if [ -d /usr/lib/vall-e-x ]; then \
+    cp -rfv /usr/lib/vall-e-x/* ./ ; \ 
+    fi
+
+# we also copy exllama libs over to resolve exllama import error
+RUN if [ -d /usr/local/lib/python3.9/dist-packages/exllama ]; then \
+        cp -rfv /usr/local/lib/python3.9/dist-packages/exllama extra/grpc/exllama/;\
+    fi
+
+# Define the health check command
+HEALTHCHECK --interval=1m --timeout=10m --retries=10 \
+  CMD curl -f $HEALTHCHECK_ENDPOINT || exit 1
+
+EXPOSE 8080
+ENTRYPOINT [ "/build/entrypoint.sh" ]
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 go-skynet authors
+Copyright (c) 2023 Ettore Di Giacinto

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/521
+++ b/521
@@ -2,109 +2,361 @@ GOCMD=go
 GOTEST=$(GOCMD) test
 GOVET=$(GOCMD) vet
 BINARY_NAME=local-ai
-# renovate: datasource=github-tags depName=go-skynet/go-llama.cpp
-GOLLAMA_VERSION?=llama.cpp-25d7abb
-# renovate: datasource=git-refs packageNameTemplate=https://github.com/go-skynet/go-gpt4all-j.cpp currentValueTemplate=master depNameTemplate=go-gpt4all-j.cpp
-GOGPT4ALLJ_VERSION?=1f7bff57f66cb7062e40d0ac3abd2217815e5109
-# renovate: datasource=git-refs packageNameTemplate=https://github.com/go-skynet/go-gpt2.cpp currentValueTemplate=master depNameTemplate=go-gpt2.cpp
-GOGPT2_VERSION?=245a5bfe6708ab80dc5c733dcdbfbe3cfd2acdaa

+# llama.cpp versions
+GOLLAMA_VERSION?=aeba71ee842819da681ea537e78846dc75949ac0
+
+GOLLAMA_STABLE_VERSION?=50cee7712066d9e38306eccadcfbb44ea87df4b7
+
+CPPLLAMA_VERSION?=a75fa576abba9d37f463580c379e4bbf1e1ad03c
+
+# gpt4all version
+GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
+GPT4ALL_VERSION?=27a8b020c36b0df8f8b82a252d261cda47cf44b8
+
+# go-ggml-transformers version
+GOGGMLTRANSFORMERS_VERSION?=ffb09d7dd71e2cbc6c5d7d05357d230eea6f369a
+
+# go-rwkv version
+RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
+RWKV_VERSION?=c898cd0f62df8f2a7830e53d1d513bef4f6f792b
+
+# whisper.cpp version
+WHISPER_CPP_VERSION?=85ed71aaec8e0612a84c0b67804bde75aa75a273
+
+# bert.cpp version
+BERT_VERSION?=6abe312cded14042f6b7c3cd8edf082713334a4d
+
+# go-piper version
+PIPER_VERSION?=736f6fb639ab8e3397356e48eeb6bdcb9da88a78
+
+# stablediffusion version
+STABLEDIFFUSION_VERSION?=d89260f598afb809279bc72aa0107b4292587632
+
+export BUILD_TYPE?=
+export STABLE_BUILD_TYPE?=$(BUILD_TYPE)
+export CMAKE_ARGS?=
+CGO_LDFLAGS?=
+CUDA_LIBPATH?=/usr/local/cuda/lib64/
+GO_TAGS?=
+BUILD_ID?=git
+
+TEST_DIR=/tmp/test
+
+RANDOM := $(shell bash -c 'echo $$RANDOM')
+
+VERSION?=$(shell git describe --always --tags || echo "dev" )
+# go tool nm ./local-ai | grep Commit
+LD_FLAGS?=
+override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Version=$(VERSION)"
+override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"
+
+OPTIONAL_TARGETS?=
+
+OS := $(shell uname -s)
+ARCH := $(shell uname -m)
 GREEN  := $(shell tput -Txterm setaf 2)
 YELLOW := $(shell tput -Txterm setaf 3)
 WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)

-C_INCLUDE_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2
-LIBRARY_PATH=$(shell pwd)/go-llama:$(shell pwd)/go-gpt4all-j:$(shell pwd)/go-gpt2
+# Default Docker bridge IP
+E2E_BRIDGE_IP?=172.17.0.1

-# Use this if you want to set the default behavior
-ifndef BUILD_TYPE
-	BUILD_TYPE:=default
+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
 endif

-ifeq ($(BUILD_TYPE), "generic")
-	GENERIC_PREFIX:=generic-
-else
-	GENERIC_PREFIX:=
+ifeq ($(UNAME_S),Darwin)
+	CGO_LDFLAGS += -lcblas -framework Accelerate
+ifneq ($(BUILD_TYPE),metal)
+    # explicit disable metal if on Darwin and metal is disabled
+	CMAKE_ARGS+=-DLLAMA_METAL=OFF
+endif
+endif
+
+ifeq ($(BUILD_TYPE),openblas)
+	CGO_LDFLAGS+=-lopenblas
+endif
+
+ifeq ($(BUILD_TYPE),cublas)
+	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
+	export LLAMA_CUBLAS=1
+endif
+
+ifeq ($(BUILD_TYPE),hipblas)
+	ROCM_HOME ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	# Llama-stable has no hipblas support, so override it here.
+	export STABLE_BUILD_TYPE=
+	GPU_TARGETS ?= gfx900,gfx90a,gfx1030,gfx1031,gfx1100
+	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link
+endif
+
+ifeq ($(BUILD_TYPE),metal)
+	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+	export LLAMA_METAL=1
+endif
+
+ifeq ($(BUILD_TYPE),clblas)
+	CGO_LDFLAGS+=-lOpenCL -lclblast
+endif
+
+# glibc-static or glibc-devel-static required ('override' is needed: LD_FLAGS is already set with override above, so a plain assignment would be ignored)
+ifeq ($(STATIC),true)
+	override LD_FLAGS += -linkmode external -extldflags -static
+endif
+
+ifeq ($(findstring stablediffusion,$(GO_TAGS)),stablediffusion)
+#	OPTIONAL_TARGETS+=go-stable-diffusion/libstablediffusion.a
+	OPTIONAL_GRPC+=backend-assets/grpc/stablediffusion
+endif
+
+ifeq ($(findstring tts,$(GO_TAGS)),tts)
+#	OPTIONAL_TARGETS+=go-piper/libpiper_binding.a
+#	OPTIONAL_TARGETS+=backend-assets/espeak-ng-data
+	PIPER_CGO_CXXFLAGS+=-I$(shell pwd)/go-piper/piper/src/cpp -I$(shell pwd)/go-piper/piper/build/fi/include -I$(shell pwd)/go-piper/piper/build/pi/include -I$(shell pwd)/go-piper/piper/build/si/include
+ 	PIPER_CGO_LDFLAGS+=-L$(shell pwd)/go-piper/piper/build/fi/lib -L$(shell pwd)/go-piper/piper/build/pi/lib -L$(shell pwd)/go-piper/piper/build/si/lib -lfmt -lspdlog
+	OPTIONAL_GRPC+=backend-assets/grpc/piper
+endif
+
+ALL_GRPC_BACKENDS=backend-assets/grpc/langchain-huggingface backend-assets/grpc/falcon-ggml backend-assets/grpc/bert-embeddings backend-assets/grpc/llama backend-assets/grpc/llama-cpp backend-assets/grpc/llama-stable backend-assets/grpc/gpt4all backend-assets/grpc/dolly backend-assets/grpc/gpt2 backend-assets/grpc/gptj backend-assets/grpc/gptneox backend-assets/grpc/mpt backend-assets/grpc/replit backend-assets/grpc/starcoder backend-assets/grpc/rwkv backend-assets/grpc/whisper $(OPTIONAL_GRPC)
+GRPC_BACKENDS?=$(ALL_GRPC_BACKENDS) $(OPTIONAL_GRPC)
+
+# If empty, then we build all
+ifeq ($(GRPC_BACKENDS),)
+	GRPC_BACKENDS=$(ALL_GRPC_BACKENDS)
 endif

 .PHONY: all test build vendor

 all: help

-## Build:
+## GPT4ALL
+gpt4all:
+	git clone --recurse-submodules $(GPT4ALL_REPO) gpt4all
+	cd gpt4all && git checkout -b build $(GPT4ALL_VERSION) && git submodule update --init --recursive --depth 1

-build: prepare ## Build the project
-	$(info ${GREEN}I local-ai build info:${RESET})
-	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
-	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) build -o $(BINARY_NAME) ./
+## go-piper
+go-piper:
+	git clone --recurse-submodules https://github.com/mudler/go-piper go-piper
+	cd go-piper && git checkout -b build $(PIPER_VERSION) && git submodule update --init --recursive --depth 1

-generic-build: ## Build the project using generic
-	BUILD_TYPE="generic" $(MAKE) build
+## BERT embeddings
+go-bert:
+	git clone --recurse-submodules https://github.com/go-skynet/go-bert.cpp go-bert
+	cd go-bert && git checkout -b build $(BERT_VERSION) && git submodule update --init --recursive --depth 1

-## GPT4ALL-J
-go-gpt4all-j:
-	git clone --recurse-submodules https://github.com/go-skynet/go-gpt4all-j.cpp go-gpt4all-j
-	cd go-gpt4all-j && git checkout -b build $(GOGPT4ALLJ_VERSION)
-	# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
-	@find ./go-gpt4all-j -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
-	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
-	@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gptj_/g' {} +
-	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
-	@find ./go-gpt4all-j -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gptj_/g' {} +
-	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/json_/json_gptj_/g' {} +
-	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/void replace/void json_gptj_replace/g' {} +
-	@find ./go-gpt4all-j -type f -name "*.cpp" -exec sed -i'' -e 's/::replace/::json_gptj_replace/g' {} +
+## stable diffusion
+go-stable-diffusion:
+	git clone --recurse-submodules https://github.com/mudler/go-stable-diffusion go-stable-diffusion
+	cd go-stable-diffusion && git checkout -b build $(STABLEDIFFUSION_VERSION) && git submodule update --init --recursive --depth 1

-go-gpt4all-j/libgptj.a: go-gpt4all-j
-	$(MAKE) -C go-gpt4all-j $(GENERIC_PREFIX)libgptj.a
+go-stable-diffusion/libstablediffusion.a:
+	$(MAKE) -C go-stable-diffusion libstablediffusion.a

-# CEREBRAS GPT
-go-gpt2:
-	git clone --recurse-submodules https://github.com/go-skynet/go-gpt2.cpp go-gpt2
-	cd go-gpt2 && git checkout -b build $(GOGPT2_VERSION)
-	# This is hackish, but needed as both go-llama and go-gpt4allj have their own version of ggml..
-	@find ./go-gpt2 -type f -name "*.c" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
-	@find ./go-gpt2 -type f -name "*.cpp" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
-	@find ./go-gpt2 -type f -name "*.h" -exec sed -i'' -e 's/ggml_/ggml_gpt2_/g' {} +
-	@find ./go-gpt2 -type f -name "*.cpp" -exec sed -i'' -e 's/gpt_/gpt2_/g' {} +
-	@find ./go-gpt2 -type f -name "*.h" -exec sed -i'' -e 's/gpt_/gpt2_/g' {} +
-	@find ./go-gpt2 -type f -name "*.cpp" -exec sed -i'' -e 's/json_/json_gpt2_/g' {} +
+## RWKV
+go-rwkv:
+	git clone --recurse-submodules $(RWKV_REPO) go-rwkv
+	cd go-rwkv && git checkout -b build $(RWKV_VERSION) && git submodule update --init --recursive --depth 1

-go-gpt2/libgpt2.a: go-gpt2
-	$(MAKE) -C go-gpt2 $(GENERIC_PREFIX)libgpt2.a
-	
+go-rwkv/librwkv.a: go-rwkv
+	cd go-rwkv && cd rwkv.cpp &&	cmake . -DRWKV_BUILD_SHARED_LIBRARY=OFF &&	cmake --build . && 	cp librwkv.a ..
+
+go-bert/libgobert.a: go-bert
+	$(MAKE) -C go-bert libgobert.a
+
+backend-assets/gpt4all: gpt4all/gpt4all-bindings/golang/libgpt4all.a
+	mkdir -p backend-assets/gpt4all
+	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.so backend-assets/gpt4all/ || true
+	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dylib backend-assets/gpt4all/ || true
+	@cp gpt4all/gpt4all-bindings/golang/buildllm/*.dll backend-assets/gpt4all/ || true
+
+backend-assets/espeak-ng-data: go-piper
+	mkdir -p backend-assets/espeak-ng-data
+	$(MAKE) -C go-piper piper.o
+	@cp -rf go-piper/piper/build/pi/share/espeak-ng-data/. backend-assets/espeak-ng-data
+
+gpt4all/gpt4all-bindings/golang/libgpt4all.a: gpt4all
+	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ libgpt4all.a
+
+## CEREBRAS GPT
+go-ggml-transformers:
+	git clone --recurse-submodules https://github.com/go-skynet/go-ggml-transformers.cpp go-ggml-transformers
+	cd go-ggml-transformers && git checkout -b build $(GOGGMLTRANSFORMERS_VERSION) && git submodule update --init --recursive --depth 1
+
+go-ggml-transformers/libtransformers.a: go-ggml-transformers
+	$(MAKE) -C go-ggml-transformers BUILD_TYPE=$(BUILD_TYPE) libtransformers.a
+
+whisper.cpp:
+	git clone https://github.com/ggerganov/whisper.cpp.git
+	cd whisper.cpp && git checkout -b build $(WHISPER_CPP_VERSION) && git submodule update --init --recursive --depth 1
+
+whisper.cpp/libwhisper.a: whisper.cpp
+	cd whisper.cpp && make libwhisper.a

 go-llama:
-	git clone -b $(GOLLAMA_VERSION) --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama
+	cd go-llama && git checkout -b build $(GOLLAMA_VERSION) && git submodule update --init --recursive --depth 1
+
+go-llama-stable:
+	git clone --recurse-submodules https://github.com/go-skynet/go-llama.cpp go-llama-stable
+	cd go-llama-stable && git checkout -b build $(GOLLAMA_STABLE_VERSION) && git submodule update --init --recursive --depth 1

 go-llama/libbinding.a: go-llama
-	$(MAKE) -C go-llama $(GENERIC_PREFIX)libbinding.a
+	$(MAKE) -C go-llama BUILD_TYPE=$(BUILD_TYPE) libbinding.a
+
+go-llama-stable/libbinding.a: go-llama-stable
+	$(MAKE) -C go-llama-stable BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
+
+go-piper/libpiper_binding.a: go-piper
+	$(MAKE) -C go-piper libpiper_binding.a example/main
+
+get-sources: go-llama go-llama-stable go-ggml-transformers gpt4all go-piper go-rwkv whisper.cpp go-bert go-stable-diffusion
+	touch $@

 replace:
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt4all-j.cpp=$(shell pwd)/go-gpt4all-j
-	$(GOCMD) mod edit -replace github.com/go-skynet/go-gpt2.cpp=$(shell pwd)/go-gpt2
+	$(GOCMD) mod edit -replace github.com/nomic-ai/gpt4all/gpt4all-bindings/golang=$(shell pwd)/gpt4all/gpt4all-bindings/golang
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-ggml-transformers.cpp=$(shell pwd)/go-ggml-transformers
+	$(GOCMD) mod edit -replace github.com/donomii/go-rwkv.cpp=$(shell pwd)/go-rwkv
+	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(shell pwd)/whisper.cpp
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(shell pwd)/go-bert
+	$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(shell pwd)/go-stable-diffusion
+	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(shell pwd)/go-piper

-prepare: go-llama/libbinding.a go-gpt4all-j/libgptj.a go-gpt2/libgpt2.a replace
+prepare-sources: get-sources replace
+	$(GOCMD) mod download
+
+## GENERIC
+rebuild: ## Rebuilds the project
+	$(GOCMD) clean -cache
+	$(MAKE) -C go-llama clean
+	$(MAKE) -C go-llama-stable clean
+	$(MAKE) -C gpt4all/gpt4all-bindings/golang/ clean
+	$(MAKE) -C go-ggml-transformers clean
+	$(MAKE) -C go-rwkv clean
+	$(MAKE) -C whisper.cpp clean
+	$(MAKE) -C go-stable-diffusion clean
+	$(MAKE) -C go-bert clean
+	$(MAKE) -C go-piper clean
+	$(MAKE) build
+
+prepare: prepare-sources $(OPTIONAL_TARGETS)
+	touch $@

 clean: ## Remove build related file
-	rm -fr ./go-llama
-	rm -rf ./go-gpt4all-j
+	$(GOCMD) clean -cache
+	rm -f prepare
+	rm -rf ./go-llama
+	rm -rf ./gpt4all
+	rm -rf ./go-llama-stable
 	rm -rf ./go-gpt2
+	rm -rf ./go-stable-diffusion
+	rm -rf ./go-ggml-transformers
+	rm -rf ./backend-assets
+	rm -rf ./go-rwkv
+	rm -rf ./go-bert
+	rm -rf ./whisper.cpp
+	rm -rf ./go-piper
 	rm -rf $(BINARY_NAME)
+	rm -rf release/
+	rm -rf ./backend/cpp/grpc/grpc_repo
+	rm -rf ./backend/cpp/grpc/build
+	rm -rf ./backend/cpp/grpc/installed_packages
+	$(MAKE) -C backend/cpp/llama clean

-## Run:
-run: prepare
-	C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} $(GOCMD) run ./main.go
+## Build:
+
+build: grpcs prepare ## Build the project
+	$(info ${GREEN}I local-ai build info:${RESET})
+	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
+	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
+	$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
+
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./
+
+dist: build
+	mkdir -p release
+	cp $(BINARY_NAME) release/$(BINARY_NAME)-$(BUILD_ID)-$(OS)-$(ARCH)
+
+## Run
+run: prepare ## run local-ai
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./

 test-models/testmodel:
 	mkdir test-models
-	wget https://huggingface.co/concedo/cerebras-111M-ggml/resolve/main/cerberas-111m-q4_0.bin -O test-models/testmodel
+	mkdir test-dir
+	wget -q https://huggingface.co/nnakasato/ggml-model-test/resolve/main/ggml-model-q4.bin -O test-models/testmodel
+	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
+	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
+	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
+	wget -q https://huggingface.co/mudler/rwkv-4-raven-1.5B-ggml/resolve/main/RWKV-4-Raven-1B5-v11-Eng99%2525-Other1%2525-20230425-ctx4096_Q4_0.bin -O test-models/rwkv
+	wget -q https://raw.githubusercontent.com/saharNooby/rwkv.cpp/5eb8f09c146ea8124633ab041d9ea0b1f1db4459/rwkv/20B_tokenizer.json -O test-models/rwkv.tokenizer.json
+	cp tests/models_fixtures/* test-models

-test: prepare test-models/testmodel
-	@C_INCLUDE_PATH=${C_INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} MODELS_PATH=$(abspath ./)/test-models $(GOCMD) test -v ./...
+prepare-test: grpcs
+	cp -rf backend-assets api
+	cp tests/models_fixtures/* test-models
+
+test: prepare test-models/testmodel grpcs
+	@echo 'Running tests'
+	export GO_TAGS="tts stablediffusion"
+	$(MAKE) prepare-test
+	HUGGINGFACE_GRPC=$(abspath ./)/extra/grpc/huggingface/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!gpt4all && !llama && !llama-gguf"  --flake-attempts 5 --fail-fast -v -r ./api ./pkg
+	$(MAKE) test-gpt4all
+	$(MAKE) test-llama
+	$(MAKE) test-llama-gguf
+	$(MAKE) test-tts
+	$(MAKE) test-stablediffusion
+
+prepare-e2e:
+	mkdir -p $(TEST_DIR)
+	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
+	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
+	docker build --build-arg BUILD_GRPC=true --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=11 --build-arg CUDA_MINOR_VERSION=7 --build-arg FFMPEG=true -t localai-tests .
+
+run-e2e-image:
+	ls -liah $(abspath ./tests/e2e-fixtures)
+	docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests
+
+test-e2e:
+	@echo 'Running e2e tests'
+	BUILD_TYPE=$(BUILD_TYPE) \
+	LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
+
+teardown-e2e:
+	rm -rf $(TEST_DIR) || true
+	docker stop $$(docker ps -q --filter ancestor=localai-tests)
+
+test-gpt4all: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="gpt4all" --flake-attempts 5 -v -r ./api ./pkg
+
+test-llama: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r ./api ./pkg
+
+test-llama-gguf: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r ./api ./pkg
+
+test-tts: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r ./api ./pkg
+
+test-stablediffusion: prepare-test
+	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r ./api ./pkg
+
+test-container:
+	docker build --target requirements-core -t local-ai-test-container .
+	docker run -ti --rm --entrypoint /bin/bash -v $(abspath ./):/build local-ai-test-container

 ## Help:
 help: ## Show this help.
@@ -117,3 +369,142 @@ help: ## Show this help.
 		if (/^[a-zA-Z_-]+:.*?##.*$$/) {printf "    ${YELLOW}%-20s${GREEN}%s${RESET}\n", $$1, $$2} \
 		else if (/^## .*$$/) {printf "  ${CYAN}%s${RESET}\n", substr($$1,4)} \
 		}' $(MAKEFILE_LIST)
+
+protogen: protogen-go protogen-python
+
+protogen-go:
+	protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative \
+    pkg/grpc/proto/backend.proto
+
+protogen-python:
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/huggingface/ --grpc_python_out=extra/grpc/huggingface/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/autogptq/ --grpc_python_out=extra/grpc/autogptq/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/exllama/ --grpc_python_out=extra/grpc/exllama/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/bark/ --grpc_python_out=extra/grpc/bark/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/diffusers/ --grpc_python_out=extra/grpc/diffusers/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vall-e-x/ --grpc_python_out=extra/grpc/vall-e-x/ pkg/grpc/proto/backend.proto
+	python3 -m grpc_tools.protoc -Ipkg/grpc/proto/ --python_out=extra/grpc/vllm/ --grpc_python_out=extra/grpc/vllm/ pkg/grpc/proto/backend.proto
+
+## GRPC
+# Note: it is duplicated in the Dockerfile
+prepare-extra-conda-environments:
+	$(MAKE) -C extra/grpc/autogptq
+	$(MAKE) -C extra/grpc/bark
+	$(MAKE) -C extra/grpc/diffusers
+	$(MAKE) -C extra/grpc/vllm
+	$(MAKE) -C extra/grpc/huggingface
+	$(MAKE) -C extra/grpc/vall-e-x
+	$(MAKE) -C extra/grpc/exllama
+
+
+backend-assets/grpc:
+	mkdir -p backend-assets/grpc
+
+backend-assets/grpc/llama: backend-assets/grpc go-llama/libbinding.a
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama LIBRARY_PATH=$(shell pwd)/go-llama \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama ./cmd/grpc/llama/
+# TODO: every binary should have its own folder instead, so can have different metal implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp go-llama/build/bin/ggml-metal.metal backend-assets/grpc/
+endif
+
+## BACKEND CPP LLAMA START
+# Sets the variables in case it has to build the gRPC locally.
+INSTALLED_PACKAGES=$(CURDIR)/backend/cpp/grpc/installed_packages
+INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
+ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
+                 -DProtobuf_DIR=${INSTALLED_LIB_CMAKE}/protobuf \
+                 -Dutf8_range_DIR=${INSTALLED_LIB_CMAKE}/utf8_range \
+                 -DgRPC_DIR=${INSTALLED_LIB_CMAKE}/grpc \
+                 -DCMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES=${INSTALLED_PACKAGES}/include
+
+backend/cpp/llama/grpc-server: # define BUILD_GRPC_FOR_BACKEND_LLAMA to build gRPC into INSTALLED_PACKAGES first and compile against it
+ifdef BUILD_GRPC_FOR_BACKEND_LLAMA
+	backend/cpp/grpc/script/build_grpc.sh ${INSTALLED_PACKAGES}
+	export _PROTOBUF_PROTOC=${INSTALLED_PACKAGES}/bin/protoc && \
+	export _GRPC_CPP_PLUGIN_EXECUTABLE=${INSTALLED_PACKAGES}/bin/grpc_cpp_plugin && \
+	export PATH=${PATH}:${INSTALLED_PACKAGES}/bin && \
+	CMAKE_ARGS="${ADDED_CMAKE_ARGS}" LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+else
+	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
+	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama grpc-server
+endif
+## BACKEND CPP LLAMA END
+		
+##
+backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/grpc-server
+	cp -rfv backend/cpp/llama/grpc-server backend-assets/grpc/llama-cpp
+# TODO: every binary should have its own folder instead, so can have different metal implementations
+ifeq ($(BUILD_TYPE),metal)
+	cp backend/cpp/llama/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
+endif
+
+backend-assets/grpc/llama-stable: backend-assets/grpc go-llama-stable/libbinding.a # headers and library both come from go-llama-stable
+	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(shell pwd)/go-llama-stable
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-llama-stable LIBRARY_PATH=$(shell pwd)/go-llama-stable \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-stable ./cmd/grpc/llama-stable/
+
+backend-assets/grpc/gpt4all: backend-assets/grpc backend-assets/gpt4all gpt4all/gpt4all-bindings/golang/libgpt4all.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(shell pwd)/gpt4all/gpt4all-bindings/golang/ \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./cmd/grpc/gpt4all/
+
+backend-assets/grpc/dolly: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/dolly ./cmd/grpc/dolly/
+
+backend-assets/grpc/gpt2: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt2 ./cmd/grpc/gpt2/
+
+backend-assets/grpc/gptj: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptj ./cmd/grpc/gptj/
+
+backend-assets/grpc/gptneox: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gptneox ./cmd/grpc/gptneox/
+
+backend-assets/grpc/mpt: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/mpt ./cmd/grpc/mpt/
+
+backend-assets/grpc/replit: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/replit ./cmd/grpc/replit/
+
+backend-assets/grpc/falcon-ggml: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/falcon-ggml ./cmd/grpc/falcon-ggml/
+
+backend-assets/grpc/starcoder: backend-assets/grpc go-ggml-transformers/libtransformers.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-ggml-transformers LIBRARY_PATH=$(shell pwd)/go-ggml-transformers \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/starcoder ./cmd/grpc/starcoder/
+
+backend-assets/grpc/rwkv: backend-assets/grpc go-rwkv/librwkv.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-rwkv LIBRARY_PATH=$(shell pwd)/go-rwkv \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./cmd/grpc/rwkv/
+
+backend-assets/grpc/bert-embeddings: backend-assets/grpc go-bert/libgobert.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-bert LIBRARY_PATH=$(shell pwd)/go-bert \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./cmd/grpc/bert-embeddings/
+
+backend-assets/grpc/langchain-huggingface: backend-assets/grpc
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/langchain-huggingface ./cmd/grpc/langchain-huggingface/
+
+backend-assets/grpc/stablediffusion: backend-assets/grpc
+	if [ ! -f backend-assets/grpc/stablediffusion ]; then \
+		$(MAKE) go-stable-diffusion/libstablediffusion.a; \
+		CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/go-stable-diffusion/ LIBRARY_PATH=$(shell pwd)/go-stable-diffusion/ \
+		$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./cmd/grpc/stablediffusion/; \
+	fi
+
+backend-assets/grpc/piper: backend-assets/grpc backend-assets/espeak-ng-data go-piper/libpiper_binding.a
+	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(shell pwd)/go-piper \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./cmd/grpc/piper/
+
+backend-assets/grpc/whisper: backend-assets/grpc whisper.cpp/libwhisper.a
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(shell pwd)/whisper.cpp LIBRARY_PATH=$(shell pwd)/whisper.cpp \
+	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./cmd/grpc/whisper/
+
+grpcs: prepare $(GRPC_BACKENDS)
--- a/README.md
+++ b/README.md
@@ -1,358 +1,179 @@
 <h1 align="center">
  <br>
-  <img height="300" src="https://user-images.githubusercontent.com/2420543/233147843-88697415-6dbf-4368-a862-ab217f9f7342.jpeg"> <br>
+  <img height="300" src="https://github.com/go-skynet/LocalAI/assets/2420543/0966aa2a-166e-4f99-a3e5-6c915fc997dd"> <br>
    LocalAI
 <br>
 </h1>

-> :warning: This project has been renamed from `llama-cli` to `LocalAI` to reflect the fact that we are focusing on a fast drop-in OpenAI API rather on the CLI interface. We think that there are already many projects that can be used as a CLI interface already, for instance  [llama.cpp](https://github.com/ggerganov/llama.cpp) and [gpt4all](https://github.com/nomic-ai/gpt4all). If you are were using `llama-cli` for CLI interactions and want to keep using it, use older versions or please open up an issue - contributions are welcome!
+<p align="center">
+<a href="https://github.com/go-skynet/LocalAI/fork" target="blank">
+<img src="https://img.shields.io/github/forks/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI forks"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/stargazers" target="blank">
+<img src="https://img.shields.io/github/stars/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI stars"/>
+</a>
+<a href="https://github.com/go-skynet/LocalAI/pulls" target="blank">
+<img src="https://img.shields.io/github/issues-pr/go-skynet/LocalAI?style=for-the-badge" alt="LocalAI pull-requests"/>
+</a>
+<a href='https://github.com/go-skynet/LocalAI/releases'>
+<img src='https://img.shields.io/github/release/go-skynet/LocalAI?&label=Latest&style=for-the-badge'>
+</a>
+</p>
+
+> :bulb: Get help - [❓FAQ](https://localai.io/faq/) [💭Discussions](https://github.com/go-skynet/LocalAI/discussions) [:speech_balloon: Discord](https://discord.gg/uJAeKSAGDy) [:book: Documentation website](https://localai.io/)
+> 
+> [💻 Quickstart](https://localai.io/basics/getting_started/) [📣 News](https://localai.io/basics/news/) [ 🛫 Examples ](https://github.com/go-skynet/LocalAI/tree/master/examples/) [ 🖼️ Models ](https://localai.io/models/)


-[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml) [![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)
+[![tests](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/test.yml)[![Build and Release](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/release.yaml)[![build container images](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/image.yml)[![Bump dependencies](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml/badge.svg)](https://github.com/go-skynet/LocalAI/actions/workflows/bump_deps.yaml)[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/localai)](https://artifacthub.io/packages/search?repo=localai)

-[![](https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted)](https://discord.gg/uJAeKSAGDy) 
+**LocalAI** is a drop-in replacement REST API that's compatible with OpenAI API specifications for local inferencing. It allows you to run LLMs (and other models) locally or on-prem with consumer-grade hardware, supporting multiple model families that are compatible with the ggml format, PyTorch and more. It does not require a GPU.

-**LocalAI** is a straightforward, drop-in replacement API compatible with OpenAI for local CPU inferencing, based on [llama.cpp](https://github.com/ggerganov/llama.cpp), [gpt4all](https://github.com/nomic-ai/gpt4all) and [ggml](https://github.com/ggerganov/ggml), including support GPT4ALL-J which is Apache 2.0 Licensed and can be used for commercial purposes.
+<p align="center"><b>Follow LocalAI </b></p>

- OpenAI compatible API
- Supports multiple-models
- Once loaded the first time, it keep models loaded in memory for faster inference
- Support for prompt templates
- Doesn't shell-out, but uses C bindings for a faster inference and better performance. Uses [go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) and [go-gpt4all-j.cpp](https://github.com/go-skynet/go-gpt4all-j.cpp).
+<p align="center">
+<a href="https://twitter.com/LocalAI_API" target="blank">
+<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
+</a>
+<a href="https://discord.gg/uJAeKSAGDy" target="blank">
+<img src="https://dcbadge.vercel.app/api/server/uJAeKSAGDy?style=flat-square&theme=default-inverted" alt="Join LocalAI Discord Community"/>
+</a>

-Reddit post: https://www.reddit.com/r/selfhosted/comments/12w4p2f/localai_openai_compatible_api_to_run_llm_models/
+<p align="center"><b>Connect with the Creator </b></p>

-## Model compatibility
+<p align="center">
+<a href="https://twitter.com/mudler_it" target="blank">
+<img src="https://img.shields.io/twitter/follow/mudler_it?label=Follow: mudler_it&style=social" alt="Follow mudler_it"/>
+</a>
+<a href='https://github.com/mudler'>
+<img alt="Follow on Github" src="https://img.shields.io/badge/Follow-mudler-black?logo=github&link=https%3A%2F%2Fgithub.com%2Fmudler">
+</a>
+</p>

-It is compatible with the models supported by [llama.cpp](https://github.com/ggerganov/llama.cpp) supports also [GPT4ALL-J](https://github.com/nomic-ai/gpt4all) and [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml).
+<p align="center"><b>Share LocalAI Repository</b></p>

-Tested with:
- Vicuna
- Alpaca
- [GPT4ALL](https://github.com/nomic-ai/gpt4all)
- [GPT4ALL-J](https://gpt4all.io/models/ggml-gpt4all-j.bin)
- Koala
- [cerebras-GPT with ggml](https://huggingface.co/lxe/Cerebras-GPT-2.7B-Alpaca-SP-ggml)
+<p align="center">

-It should also be compatible with StableLM and GPTNeoX ggml models (untested)
+<a href="https://twitter.com/intent/tweet?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI&hashtags=LocalAI,AI" target="blank">
+<img src="https://img.shields.io/twitter/follow/_LocalAI?label=Share Repo on Twitter&style=social" alt="Follow _LocalAI"/></a> 
+<a href="https://t.me/share/url?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.&url=https://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Telegram&logo=Telegram&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Telegram"/></a>
+<a href="https://api.whatsapp.com/send?text=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%20https://github.com/go-skynet/LocalAI"><img src="https://img.shields.io/twitter/url?label=whatsapp&logo=whatsapp&style=social&url=https://github.com/go-skynet/LocalAI" /></a> <a href="https://www.reddit.com/submit?url=https://github.com/go-skynet/LocalAI&title=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.
+" target="blank">
+<img src="https://img.shields.io/twitter/url?label=Reddit&logo=Reddit&style=social&url=https://github.com/go-skynet/LocalAI" alt="Share on Reddit"/>
+</a> <a href="mailto:?subject=Check%20this%20GitHub%20repository%20out.%20LocalAI%20-%20Let%27s%20you%20easily%20run%20LLM%20locally.%3A%0Ahttps://github.com/go-skynet/LocalAI" target="_blank"><img src="https://img.shields.io/twitter/url?label=Gmail&logo=Gmail&style=social&url=https://github.com/go-skynet/LocalAI"/></a> <a href="https://www.buymeacoffee.com/mudler" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/default-orange.png" alt="Buy Me A Coffee" height="23" width="100" style="border-radius:1px"></a>

-Note: You might need to convert older models to the new format, see [here](https://github.com/ggerganov/llama.cpp#using-gpt4all) for instance to run `gpt4all`.
+</p>

-## Usage
+<hr>

-> `LocalAI` comes by default as a container image. You can check out all the available images with corresponding tags [here](https://quay.io/repository/go-skynet/local-ai?tab=tags&tag=latest).
+In a nutshell:

-The easiest way to run LocalAI is by using `docker-compose`:
+- Local, OpenAI drop-in alternative REST API. You own your data.
+- NO GPU required. NO Internet access is required either
+  - Optionally, GPU acceleration is available for `llama.cpp`-compatible LLMs. See also the [build section](https://localai.io/basics/build/index.html).
+- Supports multiple models
+- 🏃 Once loaded the first time, it keeps models loaded in memory for faster inference
+- ⚡ Doesn't shell-out, but uses C++ bindings for a faster inference and better performance.

-```bash
+LocalAI was created by [Ettore Di Giacinto](https://github.com/mudler/) and is a community-driven project, focused on making AI accessible to anyone. Contributions, feedback and PRs are welcome!

-git clone https://github.com/go-skynet/LocalAI
+Note that this started just as a [fun weekend project](https://localai.io/#backstory) in order to try to create the necessary pieces for a full AI assistant like `ChatGPT`: the community is growing fast and we are working hard to make it better and more stable. If you want to help, please consider contributing (see below)!

-cd LocalAI
+## 🔥🔥 [Hot topics / Roadmap](https://localai.io/#-hot-topics--roadmap)

-# copy your models to models/
-cp your-model.bin models/
+## 🚀 [Features](https://localai.io/features/)

-# (optional) Edit the .env file to set things like context size and threads
-# vim .env
+- 📖 [Text generation with GPTs](https://localai.io/features/text-generation/) (`llama.cpp`, `gpt4all.cpp`, ... [:book: and more](https://localai.io/model-compatibility/index.html#model-compatibility-table))
+- 🗣 [Text to Audio](https://localai.io/features/text-to-audio/)
+- 🔈 [Audio to Text](https://localai.io/features/audio-to-text/) (Audio transcription with `whisper.cpp`)
+- 🎨 [Image generation with stable diffusion](https://localai.io/features/image-generation)
+- 🔥 [OpenAI functions](https://localai.io/features/openai-functions/) 🆕
+- 🧠 [Embeddings generation for vector databases](https://localai.io/features/embeddings/)
+- ✍️ [Constrained grammars](https://localai.io/features/constrained_grammars/)
+- 🖼️ [Download models directly from Hugging Face](https://localai.io/models/)

-# start with docker-compose
-docker-compose up -d --build

-# Now API is accessible at localhost:8080
-curl http://localhost:8080/v1/models
-# {"object":"list","data":[{"id":"your-model.bin","object":"model"}]}
+## :book: 🎥 [Media, Blogs, Social](https://localai.io/basics/news/#media-blogs-social)

-curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
-     "model": "your-model.bin",            
-     "prompt": "A long time ago in a galaxy far, far away",
-     "temperature": 0.7
-   }'
-```
+- [Create a slackbot for teams and OSS projects that answer to documentation](https://mudler.pm/posts/smart-slackbot-for-teams/)
+- [LocalAI meets k8sgpt](https://www.youtube.com/watch?v=PKrDNuJ_dfE)
+- [Question Answering on Documents locally with LangChain, LocalAI, Chroma, and GPT4All](https://mudler.pm/posts/localai-question-answering/)
+- [Tutorial to use k8sgpt with LocalAI](https://medium.com/@tyler_97636/k8sgpt-localai-unlock-kubernetes-superpowers-for-free-584790de9b65)

-### Example: Use GPT4ALL-J model
+## 💻 Usage

-<details>
+Check out the [Getting started](https://localai.io/basics/getting_started/index.html) section in our documentation.

-```bash
-# Clone LocalAI
-git clone https://github.com/go-skynet/LocalAI
+### 💡 Example: Use Luna-AI Llama model

-cd LocalAI
+See the [documentation](https://localai.io/basics/getting_started)

-# Download gpt4all-j to models/
-wget https://gpt4all.io/models/ggml-gpt4all-j.bin -O models/ggml-gpt4all-j
+### 🔗 Resources

-# Use a template from the examples
-cp -rf prompt-templates/ggml-gpt4all-j.tmpl models/
+- [How to build locally](https://localai.io/basics/build/index.html)
+- [How to install in Kubernetes](https://localai.io/basics/getting_started/index.html#run-localai-in-kubernetes)
+- [Projects integrating LocalAI](https://localai.io/integrations/)
+- [How tos section](https://localai.io/howtos/) (curated by our community)
+  
+## Citation

-# (optional) Edit the .env file to set things like context size and threads
-# vim .env
-
-# start with docker-compose
-docker-compose up -d --build
-
-# Now API is accessible at localhost:8080
-curl http://localhost:8080/v1/models
-# {"object":"list","data":[{"id":"ggml-gpt4all-j","object":"model"}]}
-
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-gpt4all-j",
-     "messages": [{"role": "user", "content": "How are you?"}],
-     "temperature": 0.9 
-   }'
-
-# {"model":"ggml-gpt4all-j","choices":[{"message":{"role":"assistant","content":"I'm doing well, thanks. How about you?"}}]}
-```
-</details>
-
-## Prompt templates 
-
-The API doesn't inject a default prompt for talking to the model. You have to use a prompt similar to what's described in the standford-alpaca docs: https://github.com/tatsu-lab/stanford_alpaca#data-release.
-
-<details>
-You can use a default template for every model present in your model path, by creating a corresponding file with the `.tmpl` suffix next to your model. For instance, if the model is called `foo.bin`, you can create a sibiling file, `foo.bin.tmpl` which will be used as a default prompt, for instance this can be used with alpaca:
+If you utilize this repository or its data in a downstream project, please consider citing it with:

 ```
-Below is an instruction that describes a task. Write a response that appropriately completes the request.
-
-### Instruction:
-{{.Input}}
-
-### Response:
+@misc{localai,
+  author = {Ettore Di Giacinto},
+  title = {LocalAI: The free, Open source OpenAI alternative},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/go-skynet/LocalAI}}}
 ```

-See the [prompt-templates](https://github.com/go-skynet/LocalAI/tree/master/prompt-templates) directory in this repository for templates for most popular models.
+## ❤️ Sponsors

-</details>
+> Do you find LocalAI useful?

-## API
+Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.

-`LocalAI` provides an API for running text generation as a service, that follows the OpenAI reference and can be used as a drop-in. The models once loaded the first time will be kept in memory.
+A huge thank you to our generous sponsors who support this project:

-<details>
-Example of starting the API with `docker`:
+| ![Spectro Cloud logo_600x600px_transparent bg](https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512) | 
+|:-----------------------------------------------:|
+|  [Spectro Cloud](https://www.spectrocloud.com/)  |  
+|  Spectro Cloud kindly supports LocalAI by providing GPU and computing resources to run tests on Lambda Labs!  |

-```bash
-docker run -p 8080:8080 -ti --rm quay.io/go-skynet/local-ai:latest --models-path /path/to/models --context-size 700 --threads 4
-```
+And a huge shout-out to individuals sponsoring the project by donating hardware or backing the project. 

-And you'll see:
-```
-┌───────────────────────────────────────────────────┐ 
-│                   Fiber v2.42.0                   │ 
-│               http://127.0.0.1:8080               │ 
-│       (bound on host 0.0.0.0 and port 8080)       │ 
-│                                                   │ 
-│ Handlers ............. 1  Processes ........... 1 │ 
-│ Prefork ....... Disabled  PID ................. 1 │ 
-└───────────────────────────────────────────────────┘ 
-```
+- [Sponsor list](https://github.com/sponsors/mudler)
+- JDAM00 (donating HW for the CI)

-You can control the API server options with command line arguments:
+## 🌟 Star history

-```
-local-api --models-path <model_path> [--address <address>] [--threads <num_threads>]
-```
+[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)

-The API takes takes the following parameters:
+## 📖 License

-| Parameter    | Environment Variable | Default Value | Description                            |
-| ------------ | -------------------- | ------------- | -------------------------------------- |
-| models-path        | MODELS_PATH           |               | The path where you have models (ending with `.bin`).      |
-| threads      | THREADS              | Number of Physical cores     | The number of threads to use for text generation. |
-| address      | ADDRESS              | :8080         | The address and port to listen on. |
-| context-size | CONTEXT_SIZE         | 512           | Default token context size. |
-| debug | DEBUG         | false           | Enable debug mode. |
+LocalAI is a community-driven project created by [Ettore Di Giacinto](https://github.com/mudler/).

-Once the server is running, you can start making requests to it using HTTP, using the OpenAI API. 
+MIT - Author Ettore Di Giacinto

-</details>
+## 🙇 Acknowledgements

-### Supported OpenAI API endpoints
-
-You can check out the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create). 
-
-Following the list of endpoints/parameters supported. 
-
-Note:
-
- You can also specify the model a part of the OpenAI token.
- If only one model is available, the API will use it for all the requests.
-
-#### Chat completions
-
-<details>
-For example, to generate a chat completion, you can send a POST request to the `/v1/chat/completions` endpoint with the instruction as the request body:
-
-```
-curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-koala-7b-model-q4_0-r2.bin",
-     "messages": [{"role": "user", "content": "Say this is a test!"}],
-     "temperature": 0.7
-   }'
-```
-
-Available additional parameters: `top_p`, `top_k`, `max_tokens`
-</details>
-
-#### Completions
-
-<details>
-For example, to generate a completion, you can send a POST request to the `/v1/completions` endpoint with the instruction as the request body:
-```
-curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
-     "model": "ggml-koala-7b-model-q4_0-r2.bin",
-     "prompt": "A long time ago in a galaxy far, far away",
-     "temperature": 0.7
-   }'
-```
-
-Available additional parameters: `top_p`, `top_k`, `max_tokens`
-
-</details>
-
-#### List models
-
-<details>
-You can list all the models available with:
-
-```
-curl http://localhost:8080/v1/models
-```
-
-</details>
-
-## Using other models
-
-gpt4all (https://github.com/nomic-ai/gpt4all) works as well, however the original model needs to be converted (same applies for old alpaca models, too):
-
-```bash
-wget -O tokenizer.model https://huggingface.co/decapoda-research/llama-30b-hf/resolve/main/tokenizer.model
-mkdir models
-cp gpt4all.. models/
-git clone https://gist.github.com/eiz/828bddec6162a023114ce19146cb2b82
-pip install sentencepiece
-python 828bddec6162a023114ce19146cb2b82/gistfile1.txt models tokenizer.model
-# There will be a new model with the ".tmp" extension, you have to use that one!
-```
-
-
-## Helm Chart Installation (run LocalAI in Kubernetes)
-The local-ai Helm chart supports two options for the LocalAI server's models directory:
-1. Basic deployment with no persistent volume. You must manually update the Deployment to configure your own models directory.
-
-    Install the chart with `.Values.deployment.volumes.enabled == false` and `.Values.dataVolume.enabled == false`.
-
-2. Advanced, two-phase deployment to provision the models directory using a DataVolume. Requires [Containerized Data Importer CDI](https://github.com/kubevirt/containerized-data-importer) to be pre-installed in your cluster.
-
-    First, install the chart with `.Values.deployment.volumes.enabled == false` and `.Values.dataVolume.enabled == true`:
-    ```bash
-    helm install local-ai charts/local-ai -n local-ai --create-namespace
-    ```
-    Wait for CDI to create an importer Pod for the DataVolume and for the importer pod to finish provisioning the model archive inside the PV.
-
-    Once the PV is provisioned and the importer Pod removed, set `.Values.deployment.volumes.enabled == true` and `.Values.dataVolume.enabled == false` and upgrade the chart:
-    ```bash
-    helm upgrade local-ai -n local-ai charts/local-ai
-    ```
-    This will update the local-ai Deployment to mount the PV that was provisioned by the DataVolume.
-
-## Windows compatibility
-
-It should work, however you need to make sure you give enough resources to the container. See https://github.com/go-skynet/LocalAI/issues/2
-
-## Build locally
-
-Pre-built images might fit well for most of the modern hardware, however you can and might need to build the images manually.
-
-In order to build the `LocalAI` container image locally you can use `docker`:
-
-```
-# build the image
-docker build -t LocalAI .
-docker run LocalAI
-```
-
-Or build the binary with `make`:
-
-```
-make build
-```
-
-## Frequently asked questions
-
-Here are answers to some of the most common questions.
-
-
-### How do I get models? 
-
-<details>
-
-Most ggml-based models should work, but newer models may require additions to the API. If a model doesn't work, please feel free to open up issues. However, be cautious about downloading models from the internet and directly onto your machine, as there may be security vulnerabilities in lama.cpp or ggml that could be maliciously exploited. Some models can be found on Hugging Face: https://huggingface.co/models?search=ggml, or models from gpt4all should also work: https://github.com/nomic-ai/gpt4all.
-
-</details>
-
-### What's the difference with Serge, or XXX?
-
-
-<details>
-
-LocalAI is a multi-model solution that doesn't focus on a specific model type (e.g., llama.cpp or alpaca.cpp), and it handles all of these internally for faster inference,  easy to set up locally and deploy to Kubernetes.
-
-</details>
-
-
-### Can I use it with a Discord bot, or XXX?
-
-<details>
-
-Yes! If the client uses OpenAI and supports setting a different base URL to send requests to, you can use the LocalAI endpoint. This allows to use this with every application that was supposed to work with OpenAI, but without changing the application!
-
-</details>
-
-
-### Can this leverage GPUs? 
-
-<details>
-
-Not currently, as ggml doesn't support GPUs yet: https://github.com/ggerganov/llama.cpp/discussions/915.
-
-</details>
-
-### Where is the webUI? 
-
-<details> 
-We are working on to have a good out of the box experience - however as LocalAI is an API you can already plug it into existing projects that provides are UI interfaces to OpenAI's APIs. There are several already on github, and should be compatible with LocalAI already (as it mimics the OpenAI API)
-
-</details>
-
-### Does it work with AutoGPT? 
-
-<details>
-
-AutoGPT currently doesn't allow to set a different API URL, but there is a PR open for it, so this should be possible soon!
-
-</details>
-
-
-## Short-term roadmap
-
- [x] Mimic OpenAI API (https://github.com/go-skynet/LocalAI/issues/10)
- [ ] Binary releases (https://github.com/go-skynet/LocalAI/issues/6)
- [ ] Upstream our golang bindings to llama.cpp (https://github.com/ggerganov/llama.cpp/issues/351)
- [x] Multi-model support
- [ ] Have a webUI!
- [ ] Allow configuration of defaults for models.
- [ ] Enable automatic downloading of models from a curated gallery, with only free-licensed models.
-
-## License
-
-MIT
-
-## Acknowledgements
+LocalAI couldn't have been built without the help of great software already available from the community. Thank you!

 - [llama.cpp](https://github.com/ggerganov/llama.cpp)
 - https://github.com/tatsu-lab/stanford_alpaca
 - https://github.com/cornelk/llama-go for the initial ideas
- https://github.com/antimatter15/alpaca.cpp for the light model version (this is compatible and tested only with that checkpoint model!)
+- https://github.com/antimatter15/alpaca.cpp
+- https://github.com/EdVince/Stable-Diffusion-NCNN
+- https://github.com/ggerganov/whisper.cpp
+- https://github.com/saharNooby/rwkv.cpp
+- https://github.com/rhasspy/piper
+- https://github.com/cmp-nct/ggllm.cpp
+
+## 🤗 Contributors
+
+This is a community project, a special thanks to our contributors! 🤗
+<a href="https://github.com/go-skynet/LocalAI/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=go-skynet/LocalAI" />
+</a>
--- a/api/api.go
+++ b/api/api.go
@@ -1,400 +1,98 @@
 package api

 import (
-	"encoding/json"
 	"errors"
 	"fmt"
 	"strings"
-	"sync"

-	model "github.com/go-skynet/LocalAI/pkg/model"
-	gpt2 "github.com/go-skynet/go-gpt2.cpp"
-	gptj "github.com/go-skynet/go-gpt4all-j.cpp"
-	llama "github.com/go-skynet/go-llama.cpp"
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/localai"
+	"github.com/go-skynet/LocalAI/api/openai"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/go-skynet/LocalAI/internal"
+	"github.com/go-skynet/LocalAI/metrics"
+	"github.com/go-skynet/LocalAI/pkg/assets"
+
 	"github.com/gofiber/fiber/v2"
 	"github.com/gofiber/fiber/v2/middleware/cors"
+	"github.com/gofiber/fiber/v2/middleware/logger"
 	"github.com/gofiber/fiber/v2/middleware/recover"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 )

-// APIError provides error information returned by the OpenAI API.
-type APIError struct {
-	Code    any     `json:"code,omitempty"`
-	Message string  `json:"message"`
-	Param   *string `json:"param,omitempty"`
-	Type    string  `json:"type"`
-}
+func Startup(opts ...options.AppOption) (*options.Option, *config.ConfigLoader, error) {
+	options := options.NewOptions(opts...)

-type ErrorResponse struct {
-	Error *APIError `json:"error,omitempty"`
-}
-
-type OpenAIResponse struct {
-	Created int      `json:"created,omitempty"`
-	Object  string   `json:"chat.completion,omitempty"`
-	ID      string   `json:"id,omitempty"`
-	Model   string   `json:"model,omitempty"`
-	Choices []Choice `json:"choices,omitempty"`
-}
-
-type Choice struct {
-	Index        int      `json:"index,omitempty"`
-	FinishReason string   `json:"finish_reason,omitempty"`
-	Message      *Message `json:"message,omitempty"`
-	Text         string   `json:"text,omitempty"`
-}
-
-type Message struct {
-	Role    string `json:"role,omitempty"`
-	Content string `json:"content,omitempty"`
-}
-
-type OpenAIModel struct {
-	ID     string `json:"id"`
-	Object string `json:"object"`
-}
-
-type OpenAIRequest struct {
-	Model string `json:"model"`
-
-	// Prompt is read only by completion API calls
-	Prompt string `json:"prompt"`
-
-	Stop string `json:"stop"`
-
-	// Messages is read only by chat/completion API calls
-	Messages []Message `json:"messages"`
-
-	Echo bool `json:"echo"`
-	// Common options between all the API calls
-	TopP        float64 `json:"top_p"`
-	TopK        int     `json:"top_k"`
-	Temperature float64 `json:"temperature"`
-	Maxtokens   int     `json:"max_tokens"`
-
-	N int `json:"n"`
-
-	// Custom parameters - not present in the OpenAI API
-	Batch         int     `json:"batch"`
-	F16           bool    `json:"f16kv"`
-	IgnoreEOS     bool    `json:"ignore_eos"`
-	RepeatPenalty float64 `json:"repeat_penalty"`
-	Keep          int     `json:"n_keep"`
-
-	Seed int `json:"seed"`
-}
-
-// https://platform.openai.com/docs/api-reference/completions
-func openAIEndpoint(chat, debug bool, loader *model.ModelLoader, threads, ctx int, f16 bool, mutexMap *sync.Mutex, mutexes map[string]*sync.Mutex) func(c *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		var err error
-		var model *llama.LLama
-		var gptModel *gptj.GPTJ
-		var gpt2Model *gpt2.GPT2
-		var stableLMModel *gpt2.StableLM
-
-		input := new(OpenAIRequest)
-		// Get input data from the request body
-		if err := c.BodyParser(input); err != nil {
-			return err
-		}
-		modelFile := input.Model
-		received, _ := json.Marshal(input)
-
-		log.Debug().Msgf("Request received: %s", string(received))
-
-		// Set model from bearer token, if available
-		bearer := strings.TrimLeft(c.Get("authorization"), "Bearer ")
-		bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
-
-		// If no model was specified, take the first available
-		if modelFile == "" {
-			models, _ := loader.ListModels()
-			if len(models) > 0 {
-				modelFile = models[0]
-				log.Debug().Msgf("No model specified, using: %s", modelFile)
-			}
-		}
-
-		// If no model is found or specified, we bail out
-		if modelFile == "" && !bearerExists {
-			return fmt.Errorf("no model specified")
-		}
-
-		// If a model is found in bearer token takes precedence
-		if bearerExists {
-			log.Debug().Msgf("Using model from bearer token: %s", bearer)
-			modelFile = bearer
-		}
-
-		// Try to load the model
-		var llamaerr, gpt2err, gptjerr, stableerr error
-		llamaOpts := []llama.ModelOption{}
-		if ctx != 0 {
-			llamaOpts = append(llamaOpts, llama.SetContext(ctx))
-		}
-		if f16 {
-			llamaOpts = append(llamaOpts, llama.EnableF16Memory)
-		}
-
-		// TODO: this is ugly, better identifying the model somehow! however, it is a good stab for a first implementation..
-		model, llamaerr = loader.LoadLLaMAModel(modelFile, llamaOpts...)
-		if llamaerr != nil {
-			gptModel, gptjerr = loader.LoadGPTJModel(modelFile)
-			if gptjerr != nil {
-				gpt2Model, gpt2err = loader.LoadGPT2Model(modelFile)
-				if gpt2err != nil {
-					stableLMModel, stableerr = loader.LoadStableLMModel(modelFile)
-					if stableerr != nil {
-						return fmt.Errorf("llama: %s gpt: %s gpt2: %s stableLM: %s", llamaerr.Error(), gptjerr.Error(), gpt2err.Error(), stableerr.Error()) // llama failed first, so we want to catch both errors
-					}
-				}
-			}
-		}
-
-		// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-		mutexMap.Lock()
-		l, ok := mutexes[modelFile]
-		if !ok {
-			m := &sync.Mutex{}
-			mutexes[modelFile] = m
-			l = m
-		}
-		mutexMap.Unlock()
-		l.Lock()
-		defer l.Unlock()
-
-		// Set the parameters for the language model prediction
-		topP := input.TopP
-		if topP == 0 {
-			topP = 0.7
-		}
-		topK := input.TopK
-		if topK == 0 {
-			topK = 80
-		}
-
-		temperature := input.Temperature
-		if temperature == 0 {
-			temperature = 0.9
-		}
-
-		tokens := input.Maxtokens
-		if tokens == 0 {
-			tokens = 512
-		}
-
-		predInput := input.Prompt
-		if chat {
-			mess := []string{}
-			// TODO: encode roles
-			for _, i := range input.Messages {
-				mess = append(mess, i.Content)
-			}
-
-			predInput = strings.Join(mess, "\n")
-		}
-
-		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
-		templatedInput, err := loader.TemplatePrefix(modelFile, struct {
-			Input string
-		}{Input: predInput})
-		if err == nil {
-			predInput = templatedInput
-			log.Debug().Msgf("Template found, input modified to: %s", predInput)
-		}
-
-		result := []Choice{}
-
-		n := input.N
-
-		if input.N == 0 {
-			n = 1
-		}
-
-		var predFunc func() (string, error)
-		switch {
-		case stableLMModel != nil:
-			predFunc = func() (string, error) {
-				// Generate the prediction using the language model
-				predictOptions := []gpt2.PredictOption{
-					gpt2.SetTemperature(temperature),
-					gpt2.SetTopP(topP),
-					gpt2.SetTopK(topK),
-					gpt2.SetTokens(tokens),
-					gpt2.SetThreads(threads),
-				}
-
-				if input.Batch != 0 {
-					predictOptions = append(predictOptions, gpt2.SetBatch(input.Batch))
-				}
-
-				if input.Seed != 0 {
-					predictOptions = append(predictOptions, gpt2.SetSeed(input.Seed))
-				}
-
-				return stableLMModel.Predict(
-					predInput,
-					predictOptions...,
-				)
-			}
-		case gpt2Model != nil:
-			predFunc = func() (string, error) {
-				// Generate the prediction using the language model
-				predictOptions := []gpt2.PredictOption{
-					gpt2.SetTemperature(temperature),
-					gpt2.SetTopP(topP),
-					gpt2.SetTopK(topK),
-					gpt2.SetTokens(tokens),
-					gpt2.SetThreads(threads),
-				}
-
-				if input.Batch != 0 {
-					predictOptions = append(predictOptions, gpt2.SetBatch(input.Batch))
-				}
-
-				if input.Seed != 0 {
-					predictOptions = append(predictOptions, gpt2.SetSeed(input.Seed))
-				}
-
-				return gpt2Model.Predict(
-					predInput,
-					predictOptions...,
-				)
-			}
-		case gptModel != nil:
-			predFunc = func() (string, error) {
-				// Generate the prediction using the language model
-				predictOptions := []gptj.PredictOption{
-					gptj.SetTemperature(temperature),
-					gptj.SetTopP(topP),
-					gptj.SetTopK(topK),
-					gptj.SetTokens(tokens),
-					gptj.SetThreads(threads),
-				}
-
-				if input.Batch != 0 {
-					predictOptions = append(predictOptions, gptj.SetBatch(input.Batch))
-				}
-
-				if input.Seed != 0 {
-					predictOptions = append(predictOptions, gptj.SetSeed(input.Seed))
-				}
-
-				return gptModel.Predict(
-					predInput,
-					predictOptions...,
-				)
-			}
-		case model != nil:
-			predFunc = func() (string, error) {
-				// Generate the prediction using the language model
-				predictOptions := []llama.PredictOption{
-					llama.SetTemperature(temperature),
-					llama.SetTopP(topP),
-					llama.SetTopK(topK),
-					llama.SetTokens(tokens),
-					llama.SetThreads(threads),
-				}
-
-				if debug {
-					predictOptions = append(predictOptions, llama.Debug)
-				}
-
-				if input.Stop != "" {
-					predictOptions = append(predictOptions, llama.SetStopWords(input.Stop))
-				}
-
-				if input.RepeatPenalty != 0 {
-					predictOptions = append(predictOptions, llama.SetPenalty(input.RepeatPenalty))
-				}
-
-				if input.Keep != 0 {
-					predictOptions = append(predictOptions, llama.SetNKeep(input.Keep))
-				}
-
-				if input.Batch != 0 {
-					predictOptions = append(predictOptions, llama.SetBatch(input.Batch))
-				}
-
-				if input.F16 {
-					predictOptions = append(predictOptions, llama.EnableF16KV)
-				}
-
-				if input.IgnoreEOS {
-					predictOptions = append(predictOptions, llama.IgnoreEOS)
-				}
-
-				if input.Seed != 0 {
-					predictOptions = append(predictOptions, llama.SetSeed(input.Seed))
-				}
-
-				return model.Predict(
-					predInput,
-					predictOptions...,
-				)
-			}
-		}
-
-		for i := 0; i < n; i++ {
-			prediction, err := predFunc()
-			if err != nil {
-				return err
-			}
-
-			if input.Echo {
-				prediction = predInput + prediction
-			}
-
-			if chat {
-				result = append(result, Choice{Message: &Message{Role: "assistant", Content: prediction}})
-			} else {
-				result = append(result, Choice{Text: prediction})
-			}
-		}
-
-		jsonResult, _ := json.Marshal(result)
-		log.Debug().Msgf("Response: %s", jsonResult)
-
-		// Return the prediction in the response body
-		return c.JSON(OpenAIResponse{
-			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
-			Choices: result,
-		})
-	}
-}
-
-func listModels(loader *model.ModelLoader) func(ctx *fiber.Ctx) error {
-	return func(c *fiber.Ctx) error {
-		models, err := loader.ListModels()
-		if err != nil {
-			return err
-		}
-
-		dataModels := []OpenAIModel{}
-		for _, m := range models {
-			dataModels = append(dataModels, OpenAIModel{ID: m, Object: "model"})
-		}
-		return c.JSON(struct {
-			Object string        `json:"object"`
-			Data   []OpenAIModel `json:"data"`
-		}{
-			Object: "list",
-			Data:   dataModels,
-		})
-	}
-}
-
-func App(loader *model.ModelLoader, threads, ctxSize int, f16 bool, debug, disableMessage bool) *fiber.App {
 	zerolog.SetGlobalLevel(zerolog.InfoLevel)
-	if debug {
+	if options.Debug {
 		zerolog.SetGlobalLevel(zerolog.DebugLevel)
 	}

+	log.Info().Msgf("Starting LocalAI using %d threads, with models path: %s", options.Threads, options.Loader.ModelPath)
+	log.Info().Msgf("LocalAI version: %s", internal.PrintableVersion())
+
+	cl := config.NewConfigLoader()
+	if err := cl.LoadConfigs(options.Loader.ModelPath); err != nil {
+		log.Error().Msgf("error loading config files: %s", err.Error())
+	}
+
+	if options.ConfigFile != "" {
+		if err := cl.LoadConfigFile(options.ConfigFile); err != nil {
+			log.Error().Msgf("error loading config file: %s", err.Error())
+		}
+	}
+
+	if options.Debug {
+		for _, v := range cl.ListConfigs() {
+			cfg, _ := cl.GetConfig(v)
+			log.Debug().Msgf("Model: %s (config: %+v)", v, cfg)
+		}
+	}
+
+	if options.AssetsDestination != "" {
+		// Extract files from the embedded FS
+		err := assets.ExtractFiles(options.BackendAssets, options.AssetsDestination)
+		log.Debug().Msgf("Extracting backend assets files to %s", options.AssetsDestination)
+		if err != nil {
+			log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
+		}
+	}
+
+	if options.PreloadJSONModels != "" {
+		if err := localai.ApplyGalleryFromString(options.Loader.ModelPath, options.PreloadJSONModels, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	if options.PreloadModelsFromPath != "" {
+		if err := localai.ApplyGalleryFromFile(options.Loader.ModelPath, options.PreloadModelsFromPath, cl, options.Galleries); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	// turn off any process that was started by GRPC if the context is canceled
+	go func() {
+		<-options.Context.Done()
+		log.Debug().Msgf("Context canceled, shutting down")
+		options.Loader.StopAllGRPC()
+	}()
+
+	return options, cl, nil
+}
+
+func App(opts ...options.AppOption) (*fiber.App, error) {
+
+	options, cl, err := Startup(opts...)
+	if err != nil {
+		return nil, fmt.Errorf("failed basic startup tasks with error %s", err.Error())
+	}
+
 	// Return errors as JSON responses
 	app := fiber.New(fiber.Config{
-		DisableStartupMessage: disableMessage,
+		BodyLimit:             options.UploadLimitMB * 1024 * 1024, // request body limit in bytes, from UploadLimitMB (fiber's own default is 4MB)
+		DisableStartupMessage: options.DisableMessage,
 		// Override default error handler
 		ErrorHandler: func(ctx *fiber.Ctx, err error) error {
 			// Status code defaults to 500
@@ -408,30 +106,134 @@ func App(loader *model.ModelLoader, threads, ctxSize int, f16 bool, debug, disab

 			// Send custom error page
 			return ctx.Status(code).JSON(
-				ErrorResponse{
-					Error: &APIError{Message: err.Error(), Code: code},
+				schema.ErrorResponse{
+					Error: &schema.APIError{Message: err.Error(), Code: code},
 				},
 			)
 		},
 	})

+	if options.Debug {
+		app.Use(logger.New(logger.Config{
+			Format: "[${ip}]:${port} ${status} - ${method} ${path}\n",
+		}))
+	}
+
 	// Default middleware config
 	app.Use(recover.New())
-	app.Use(cors.New())
+	if options.Metrics != nil {
+		app.Use(metrics.APIMiddleware(options.Metrics))
+	}

-	// This is still needed, see: https://github.com/ggerganov/llama.cpp/discussions/784
-	mu := map[string]*sync.Mutex{}
-	var mumutex = &sync.Mutex{}
+	// Auth middleware checking if API key is valid. If no API key is set, no auth is required.
+	auth := func(c *fiber.Ctx) error {
+		if len(options.ApiKeys) > 0 {
+			authHeader := c.Get("Authorization")
+			if authHeader == "" {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Authorization header missing"})
+			}
+			authHeaderParts := strings.Split(authHeader, " ")
+			if len(authHeaderParts) != 2 || authHeaderParts[0] != "Bearer" {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid Authorization header format"})
+			}
+
+			apiKey := authHeaderParts[1]
+			validApiKey := false
+			for _, key := range options.ApiKeys {
+				if apiKey == key {
+					validApiKey = true
+				}
+			}
+			if !validApiKey {
+				return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{"message": "Invalid API key"})
+			}
+		}
+		return c.Next()
+	}
+
+	if options.CORS {
+		var c func(ctx *fiber.Ctx) error
+		if options.CORSAllowOrigins == "" {
+			c = cors.New()
+		} else {
+			c = cors.New(cors.Config{AllowOrigins: options.CORSAllowOrigins})
+		}
+
+		app.Use(c)
+	}
+
+	// LocalAI API endpoints
+	galleryService := localai.NewGalleryService(options.Loader.ModelPath)
+	galleryService.Start(options.Context, cl)
+
+	app.Get("/version", auth, func(c *fiber.Ctx) error {
+		return c.JSON(struct {
+			Version string `json:"version"`
+		}{Version: internal.PrintableVersion()})
+	})
+
+	modelGalleryService := localai.CreateModelGalleryService(options.Galleries, options.Loader.ModelPath, galleryService)
+	app.Post("/models/apply", auth, modelGalleryService.ApplyModelGalleryEndpoint())
+	app.Get("/models/available", auth, modelGalleryService.ListModelFromGalleryEndpoint())
+	app.Get("/models/galleries", auth, modelGalleryService.ListModelGalleriesEndpoint())
+	app.Post("/models/galleries", auth, modelGalleryService.AddModelGalleryEndpoint())
+	app.Delete("/models/galleries", auth, modelGalleryService.RemoveModelGalleryEndpoint())
+	app.Get("/models/jobs/:uuid", auth, modelGalleryService.GetOpStatusEndpoint())
+	app.Get("/models/jobs", auth, modelGalleryService.GetAllStatusEndpoint())

 	// openAI compatible API endpoint
-	app.Post("/v1/chat/completions", openAIEndpoint(true, debug, loader, threads, ctxSize, f16, mumutex, mu))
-	app.Post("/chat/completions", openAIEndpoint(true, debug, loader, threads, ctxSize, f16, mumutex, mu))

-	app.Post("/v1/completions", openAIEndpoint(false, debug, loader, threads, ctxSize, f16, mumutex, mu))
-	app.Post("/completions", openAIEndpoint(false, debug, loader, threads, ctxSize, f16, mumutex, mu))
+	// chat
+	app.Post("/v1/chat/completions", auth, openai.ChatEndpoint(cl, options))
+	app.Post("/chat/completions", auth, openai.ChatEndpoint(cl, options))

-	app.Get("/v1/models", listModels(loader))
-	app.Get("/models", listModels(loader))
+	// edit
+	app.Post("/v1/edits", auth, openai.EditEndpoint(cl, options))
+	app.Post("/edits", auth, openai.EditEndpoint(cl, options))

-	return app
+	// completion
+	app.Post("/v1/completions", auth, openai.CompletionEndpoint(cl, options))
+	app.Post("/completions", auth, openai.CompletionEndpoint(cl, options))
+	app.Post("/v1/engines/:model/completions", auth, openai.CompletionEndpoint(cl, options))
+
+	// embeddings
+	app.Post("/v1/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
+	app.Post("/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
+	app.Post("/v1/engines/:model/embeddings", auth, openai.EmbeddingsEndpoint(cl, options))
+
+	// audio
+	app.Post("/v1/audio/transcriptions", auth, openai.TranscriptEndpoint(cl, options))
+	app.Post("/tts", auth, localai.TTSEndpoint(cl, options))
+
+	// images
+	app.Post("/v1/images/generations", auth, openai.ImageEndpoint(cl, options))
+
+	if options.ImageDir != "" {
+		app.Static("/generated-images", options.ImageDir)
+	}
+
+	if options.AudioDir != "" {
+		app.Static("/generated-audio", options.AudioDir)
+	}
+
+	ok := func(c *fiber.Ctx) error {
+		return c.SendStatus(200)
+	}
+
+	// Kubernetes health checks
+	app.Get("/healthz", ok)
+	app.Get("/readyz", ok)
+
+	// Experimental Backend Statistics Module
+	backendMonitor := localai.NewBackendMonitor(cl, options) // Split out for now
+	app.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitor))
+	app.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitor))
+
+	// models
+	app.Get("/v1/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
+	app.Get("/models", auth, openai.ListModelsEndpoint(options.Loader, cl))
+
+	app.Get("/metrics", metrics.MetricsHandler())
+
+	return app, nil
 }
--- a/api/api_test.go
+++ b/api/api_test.go
@@ -1,32 +1,616 @@
 package api_test

 import (
+	"bytes"
 	"context"
+	"embed"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
 	"os"
+	"path/filepath"
+	"runtime"

 	. "github.com/go-skynet/LocalAI/api"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/metrics"
+	"github.com/go-skynet/LocalAI/pkg/gallery"
 	"github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	"gopkg.in/yaml.v3"

+	openaigo "github.com/otiai10/openaigo"
 	"github.com/sashabaranov/go-openai"
+	"github.com/sashabaranov/go-openai/jsonschema"
 )

+type modelApplyRequest struct {
+	ID        string                 `json:"id"`
+	URL       string                 `json:"url"`
+	Name      string                 `json:"name"`
+	Overrides map[string]interface{} `json:"overrides"`
+}
+
+func getModelStatus(url string) (response map[string]interface{}) {
+	// Perform the HTTP GET request
+	resp, err := http.Get(url)
+	if err != nil {
+		fmt.Println("Error creating request:", err)
+		return
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		fmt.Println("Error reading response body:", err)
+		return
+	}
+
+	// Unmarshal the response into a map[string]interface{}
+	err = json.Unmarshal(body, &response)
+	if err != nil {
+		fmt.Println("Error unmarshaling JSON response:", err)
+		return
+	}
+	return
+}
+
+func getModels(url string) (response []gallery.GalleryModel) {
+	utils.GetURI(url, func(url string, i []byte) error {
+		// Unmarshal the JSON payload into the response slice
+		return json.Unmarshal(i, &response)
+	})
+	return
+}
+
+func postModelApplyRequest(url string, request modelApplyRequest) (response map[string]interface{}) {
+
+	//url := "http://localhost:AI/models/apply"
+
+	// Create the request payload
+
+	payload, err := json.Marshal(request)
+	if err != nil {
+		fmt.Println("Error marshaling JSON:", err)
+		return
+	}
+
+	// Create the HTTP request
+	req, err := http.NewRequest("POST", url, bytes.NewBuffer(payload))
+	if err != nil {
+		fmt.Println("Error creating request:", err)
+		return
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	// Make the request
+	client := &http.Client{}
+	resp, err := client.Do(req)
+	if err != nil {
+		fmt.Println("Error making request:", err)
+		return
+	}
+	defer resp.Body.Close()
+
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		fmt.Println("Error reading response body:", err)
+		return
+	}
+
+	// Unmarshal the response into a map[string]interface{}
+	err = json.Unmarshal(body, &response)
+	if err != nil {
+		fmt.Println("Error unmarshaling JSON response:", err)
+		return
+	}
+	return
+}
+
+//go:embed backend-assets/*
+var backendAssets embed.FS
+
 var _ = Describe("API test", func() {

 	var app *fiber.App
 	var modelLoader *model.ModelLoader
 	var client *openai.Client
-	Context("API query", func() {
+	var client2 *openaigo.Client
+	var c context.Context
+	var cancel context.CancelFunc
+	var tmpdir string
+
+	commonOpts := []options.AppOption{
+		options.WithDebug(true),
+		options.WithDisableMessage(true),
+	}
+
+	Context("API with ephemeral models", func() {
 		BeforeEach(func() {
-			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
-			app = App(modelLoader, 1, 512, false, false, true)
+			var err error
+			tmpdir, err = os.MkdirTemp("", "")
+			Expect(err).ToNot(HaveOccurred())
+
+			modelLoader = model.NewModelLoader(tmpdir)
+			c, cancel = context.WithCancel(context.Background())
+
+			g := []gallery.GalleryModel{
+				{
+					Name: "bert",
+					URL:  "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
+				},
+				{
+					Name:            "bert2",
+					URL:             "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
+					Overrides:       map[string]interface{}{"foo": "bar"},
+					AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml"}},
+				},
+			}
+			out, err := yaml.Marshal(g)
+			Expect(err).ToNot(HaveOccurred())
+			err = os.WriteFile(filepath.Join(tmpdir, "gallery_simple.yaml"), out, 0644)
+			Expect(err).ToNot(HaveOccurred())
+
+			galleries := []gallery.Gallery{
+				{
+					Name: "test",
+					URL:  "file://" + filepath.Join(tmpdir, "gallery_simple.yaml"),
+				},
+			}
+
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
+			app, err = App(
+				append(commonOpts,
+					options.WithMetrics(metricsService),
+					options.WithContext(c),
+					options.WithGalleries(galleries),
+					options.WithModelLoader(modelLoader), options.WithBackendAssets(backendAssets), options.WithBackendAssetsOutput(tmpdir))...)
+			Expect(err).ToNot(HaveOccurred())
 			go app.Listen("127.0.0.1:9090")

 			defaultConfig := openai.DefaultConfig("")
 			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"

+			client2 = openaigo.NewClient("")
+			client2.BaseURL = defaultConfig.BaseURL
+
+			// Wait for API to be ready
+			client = openai.NewClientWithConfig(defaultConfig)
+			Eventually(func() error {
+				_, err := client.ListModels(context.TODO())
+				return err
+			}, "2m").ShouldNot(HaveOccurred())
+		})
+
+		AfterEach(func() {
+			cancel()
+			app.Shutdown()
+			os.RemoveAll(tmpdir)
+		})
+
+		Context("Applying models", func() {
+			It("applies models from a gallery", func() {
+
+				models := getModels("http://127.0.0.1:9090/models/available")
+				Expect(len(models)).To(Equal(2), fmt.Sprint(models))
+				Expect(models[0].Installed).To(BeFalse(), fmt.Sprint(models))
+				Expect(models[1].Installed).To(BeFalse(), fmt.Sprint(models))
+
+				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+					ID: "test@bert2",
+				})
+
+				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+				uuid := response["uuid"].(string)
+				resp := map[string]interface{}{}
+				Eventually(func() bool {
+					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+					fmt.Println(response)
+					resp = response
+					return response["processed"].(bool)
+				}, "360s", "10s").Should(Equal(true))
+				Expect(resp["message"]).ToNot(ContainSubstring("error"))
+
+				dat, err := os.ReadFile(filepath.Join(tmpdir, "bert2.yaml"))
+				Expect(err).ToNot(HaveOccurred())
+
+				_, err = os.ReadFile(filepath.Join(tmpdir, "foo.yaml"))
+				Expect(err).ToNot(HaveOccurred())
+
+				content := map[string]interface{}{}
+				err = yaml.Unmarshal(dat, &content)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(content["backend"]).To(Equal("bert-embeddings"))
+				Expect(content["foo"]).To(Equal("bar"))
+
+				models = getModels("http://127.0.0.1:9090/models/available")
+				Expect(len(models)).To(Equal(2), fmt.Sprint(models))
+				Expect(models[0].Name).To(Or(Equal("bert"), Equal("bert2")))
+				Expect(models[1].Name).To(Or(Equal("bert"), Equal("bert2")))
+				for _, m := range models {
+					if m.Name == "bert2" {
+						Expect(m.Installed).To(BeTrue())
+					} else {
+						Expect(m.Installed).To(BeFalse())
+					}
+				}
+			})
+			It("overrides models", func() {
+				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+					URL:  "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
+					Name: "bert",
+					Overrides: map[string]interface{}{
+						"backend": "llama",
+					},
+				})
+
+				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+				uuid := response["uuid"].(string)
+
+				Eventually(func() bool {
+					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+					return response["processed"].(bool)
+				}, "360s", "10s").Should(Equal(true))
+
+				dat, err := os.ReadFile(filepath.Join(tmpdir, "bert.yaml"))
+				Expect(err).ToNot(HaveOccurred())
+
+				content := map[string]interface{}{}
+				err = yaml.Unmarshal(dat, &content)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(content["backend"]).To(Equal("llama"))
+			})
+			It("apply models without overrides", func() {
+				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+					URL:       "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
+					Name:      "bert",
+					Overrides: map[string]interface{}{},
+				})
+
+				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+				uuid := response["uuid"].(string)
+
+				Eventually(func() bool {
+					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+					return response["processed"].(bool)
+				}, "360s", "10s").Should(Equal(true))
+
+				dat, err := os.ReadFile(filepath.Join(tmpdir, "bert.yaml"))
+				Expect(err).ToNot(HaveOccurred())
+
+				content := map[string]interface{}{}
+				err = yaml.Unmarshal(dat, &content)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(content["backend"]).To(Equal("bert-embeddings"))
+			})
+
+			It("runs openllama", Label("llama"), func() {
+				if runtime.GOOS != "linux" {
+					Skip("test supported only on linux")
+				}
+				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
+					Name:      "openllama_3b",
+					Overrides: map[string]interface{}{"backend": "llama-stable", "mmap": true, "f16": true, "context_size": 128},
+				})
+
+				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+				uuid := response["uuid"].(string)
+
+				Eventually(func() bool {
+					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+					return response["processed"].(bool)
+				}, "360s", "10s").Should(Equal(true))
+
+				By("testing completion")
+				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices)).To(Equal(1))
+				Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
+
+				By("testing functions")
+				resp2, err := client.CreateChatCompletion(
+					context.TODO(),
+					openai.ChatCompletionRequest{
+						Model: "openllama_3b",
+						Messages: []openai.ChatCompletionMessage{
+							{
+								Role:    "user",
+								Content: "What is the weather like in San Francisco (celsius)?",
+							},
+						},
+						Functions: []openai.FunctionDefinition{
+							openai.FunctionDefinition{
+								Name:        "get_current_weather",
+								Description: "Get the current weather",
+								Parameters: jsonschema.Definition{
+									Type: jsonschema.Object,
+									Properties: map[string]jsonschema.Definition{
+										"location": {
+											Type:        jsonschema.String,
+											Description: "The city and state, e.g. San Francisco, CA",
+										},
+										"unit": {
+											Type: jsonschema.String,
+											Enum: []string{"celcius", "fahrenheit"},
+										},
+									},
+									Required: []string{"location"},
+								},
+							},
+						},
+					})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp2.Choices)).To(Equal(1))
+				Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
+				Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
+
+				var res map[string]string
+				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(res["location"]).To(Equal("San Francisco, California, United States"), fmt.Sprint(res))
+				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
+				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+			})
+
+			// Installs codellama from the model gallery on the llama backend, then
+			// checks a plain chat completion and an OpenAI-style function call.
+			It("runs openllama gguf", Label("llama-gguf"), func() {
+				if runtime.GOOS != "linux" {
+					Skip("test supported only on linux")
+				}
+				modelName := "codellama"
+				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+					URL:       "github:go-skynet/model-gallery/codellama-7b-instruct.yaml",
+					Name:      modelName,
+					Overrides: map[string]interface{}{"backend": "llama", "mmap": true, "f16": true, "context_size": 128},
+				})
+
+				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+				uuid := response["uuid"].(string)
+
+				// Poll the install job until it reports completion.
+				Eventually(func() bool {
+					status := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+					// Comma-ok assertion: a status without a boolean "processed"
+					// field counts as "not done yet" instead of panicking and
+					// aborting the poll.
+					processed, _ := status["processed"].(bool)
+					return processed
+				}, "360s", "10s").Should(Equal(true))
+
+				By("testing chat")
+				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{Model: modelName, Messages: []openai.ChatCompletionMessage{
+					{
+						Role:    "user",
+						Content: "How much is 2+2?",
+					},
+				}})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices)).To(Equal(1))
+				Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("4"), ContainSubstring("four")))
+
+				By("testing functions")
+				resp2, err := client.CreateChatCompletion(
+					context.TODO(),
+					openai.ChatCompletionRequest{
+						Model: modelName,
+						Messages: []openai.ChatCompletionMessage{
+							{
+								Role:    "user",
+								Content: "What is the weather like in San Francisco (celsius)?",
+							},
+						},
+						Functions: []openai.FunctionDefinition{
+							{
+								Name:        "get_current_weather",
+								Description: "Get the current weather",
+								Parameters: jsonschema.Definition{
+									Type: jsonschema.Object,
+									Properties: map[string]jsonschema.Definition{
+										"location": {
+											Type:        jsonschema.String,
+											Description: "The city and state, e.g. San Francisco, CA",
+										},
+										"unit": {
+											Type: jsonschema.String,
+											Enum: []string{"celcius", "fahrenheit"},
+										},
+									},
+									Required: []string{"location"},
+								},
+							},
+						},
+					})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp2.Choices)).To(Equal(1))
+				Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
+				Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
+
+				// The function-call arguments arrive as a JSON object encoded in a string.
+				var res map[string]string
+				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(res["location"]).To(Equal("San Francisco"), fmt.Sprint(res))
+				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
+				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
+			})
+
+			// Installs gpt4all-j from the model gallery and checks that a chat
+			// completion comes back with the expected wording.
+			It("runs gpt4all", Label("gpt4all"), func() {
+				if runtime.GOOS != "linux" {
+					Skip("test supported only on linux")
+				}
+
+				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+					URL:  "github:go-skynet/model-gallery/gpt4all-j.yaml",
+					Name: "gpt4all-j",
+				})
+
+				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+				uuid := response["uuid"].(string)
+
+				// Poll the install job until it reports completion.
+				Eventually(func() bool {
+					status := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+					// Comma-ok assertion: a status without a boolean "processed"
+					// field counts as "not done yet" instead of panicking and
+					// aborting the poll.
+					processed, _ := status["processed"].(bool)
+					return processed
+				}, "960s", "10s").Should(Equal(true))
+
+				resp, err := client.CreateChatCompletion(context.TODO(), openai.ChatCompletionRequest{
+					Model: "gpt4all-j",
+					Messages: []openai.ChatCompletionMessage{
+						{Role: "user", Content: "How are you?"},
+					},
+				})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices)).To(Equal(1))
+				Expect(resp.Choices[0].Message.Content).To(ContainSubstring("well"))
+			})
+
+		})
+	})
+
+	// Specs that install models from the public model gallery into a throwaway
+	// directory and then exercise them through the HTTP API.
+	Context("Model gallery", func() {
+		BeforeEach(func() {
+			var err error
+			// Every spec gets its own scratch directory; it doubles as the model
+			// path, the audio/image output dir and the backend assets output.
+			tmpdir, err = os.MkdirTemp("", "")
+			Expect(err).ToNot(HaveOccurred())
+
+			modelLoader = model.NewModelLoader(tmpdir)
+			c, cancel = context.WithCancel(context.Background())
+
+			galleries := []gallery.Gallery{
+				{
+					Name: "model-gallery",
+					URL:  "https://raw.githubusercontent.com/go-skynet/model-gallery/main/index.yaml",
+				},
+			}
+
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
+			app, err = App(
+				append(commonOpts,
+					options.WithContext(c),
+					options.WithMetrics(metricsService),
+					options.WithAudioDir(tmpdir),
+					options.WithImageDir(tmpdir),
+					options.WithGalleries(galleries),
+					options.WithModelLoader(modelLoader),
+					options.WithBackendAssets(backendAssets),
+					options.WithBackendAssetsOutput(tmpdir))...,
+			)
+			Expect(err).ToNot(HaveOccurred())
+			go app.Listen("127.0.0.1:9090")
+
+			defaultConfig := openai.DefaultConfig("")
+			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
+
+			client2 = openaigo.NewClient("")
+			client2.BaseURL = defaultConfig.BaseURL
+
+			// Wait for API to be ready
+			client = openai.NewClientWithConfig(defaultConfig)
+			Eventually(func() error {
+				_, err := client.ListModels(context.TODO())
+				return err
+			}, "2m").ShouldNot(HaveOccurred())
+		})
+
+		AfterEach(func() {
+			// Cancel the app context, stop the server, then drop the scratch dir.
+			cancel()
+			app.Shutdown()
+			os.RemoveAll(tmpdir)
+		})
+		It("installs and is capable to run tts", Label("tts"), func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+
+			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+				ID: "model-gallery@voice-en-us-kathleen-low",
+			})
+
+			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+			uuid := response["uuid"].(string)
+
+			Eventually(func() bool {
+				status := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+				fmt.Println(status)
+				// Comma-ok assertion: a status without a boolean "processed"
+				// field counts as "not done yet" instead of panicking.
+				processed, _ := status["processed"].(bool)
+				return processed
+			}, "360s", "10s").Should(Equal(true))
+
+			// An HTTP Post to the /tts endpoint should return a wav audio file
+			resp, err := http.Post("http://127.0.0.1:9090/tts", "application/json", bytes.NewBuffer([]byte(`{"input": "Hello world", "model": "en-us-kathleen-low.onnx"}`)))
+			Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
+			// Release the connection even when a later expectation fails.
+			defer resp.Body.Close()
+			dat, err := io.ReadAll(resp.Body)
+			Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
+
+			Expect(resp.StatusCode).To(Equal(200), fmt.Sprint(string(dat)))
+			Expect(resp.Header.Get("Content-Type")).To(Equal("audio/x-wav"))
+		})
+		It("installs and is capable to generate images", Label("stablediffusion"), func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+
+			response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
+				ID: "model-gallery@stablediffusion",
+				Overrides: map[string]interface{}{
+					"parameters": map[string]interface{}{"model": "stablediffusion_assets"},
+				},
+			})
+
+			Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
+
+			uuid := response["uuid"].(string)
+
+			Eventually(func() bool {
+				status := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
+				fmt.Println(status)
+				// Comma-ok assertion: a status without a boolean "processed"
+				// field counts as "not done yet" instead of panicking.
+				processed, _ := status["processed"].(bool)
+				return processed
+			}, "360s", "10s").Should(Equal(true))
+
+			resp, err := http.Post(
+				"http://127.0.0.1:9090/v1/images/generations",
+				"application/json",
+				bytes.NewBuffer([]byte(`{
+					 			"prompt": "floating hair, portrait, ((loli)), ((one girl)), cute face, hidden hands, asymmetrical bangs, beautiful detailed eyes, eye shadow, hair ornament, ribbons, bowties, buttons, pleated skirt, (((masterpiece))), ((best quality)), colorful|((part of the head)), ((((mutated hands and fingers)))), deformed, blurry, bad anatomy, disfigured, poorly drawn face, mutation, mutated, extra limb, ugly, poorly drawn hands, missing limb, blurry, floating limbs, disconnected limbs, malformed hands, blur, out of focus, long neck, long body, Octane renderer, lowres, bad anatomy, bad hands, text",
+								"mode": 2,  "seed":9000,
+					 			"size": "256x256", "n":2}`)))
+			// The response should contain an URL
+			Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp))
+			// Release the connection even when a later expectation fails.
+			defer resp.Body.Close()
+			dat, err := io.ReadAll(resp.Body)
+			Expect(err).ToNot(HaveOccurred(), string(dat))
+			Expect(string(dat)).To(ContainSubstring("http://127.0.0.1:9090/"), string(dat))
+			Expect(string(dat)).To(ContainSubstring(".png"), string(dat))
+
+		})
+	})
+
+	Context("API query", func() {
+		BeforeEach(func() {
+			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
+			c, cancel = context.WithCancel(context.Background())
+
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
+			app, err = App(
+				append(commonOpts,
+					options.WithExternalBackend("huggingface", os.Getenv("HUGGINGFACE_GRPC")),
+					options.WithContext(c),
+					options.WithModelLoader(modelLoader),
+					options.WithMetrics(metricsService),
+				)...)
+			Expect(err).ToNot(HaveOccurred())
+			go app.Listen("127.0.0.1:9090")
+
+			defaultConfig := openai.DefaultConfig("")
+			defaultConfig.BaseURL = "http://127.0.0.1:9090/v1"
+
+			client2 = openaigo.NewClient("")
+			client2.BaseURL = defaultConfig.BaseURL
+
 			// Wait for API to be ready
 			client = openai.NewClientWithConfig(defaultConfig)
 			Eventually(func() error {
@@ -35,13 +619,13 @@ var _ = Describe("API test", func() {
 			}, "2m").ShouldNot(HaveOccurred())
 		})
 		AfterEach(func() {
+			cancel()
 			app.Shutdown()
 		})
 		It("returns the models list", func() {
 			models, err := client.ListModels(context.TODO())
 			Expect(err).ToNot(HaveOccurred())
-			Expect(len(models.Models)).To(Equal(1))
-			Expect(models.Models[0].ID).To(Equal("testmodel"))
+			Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
 		})
 		It("can generate completions", func() {
 			resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel", Prompt: "abcdedfghikl"})
@@ -49,10 +633,228 @@ var _ = Describe("API test", func() {
 			Expect(len(resp.Choices)).To(Equal(1))
 			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
 		})
+
+		// A chat completion against "testmodel" yields exactly one non-empty choice.
+		It("can generate chat completions ", func() {
+			req := openai.ChatCompletionRequest{
+				Model: "testmodel",
+				Messages: []openai.ChatCompletionMessage{
+					{Role: "user", Content: "abcdedfghikl"},
+				},
+			}
+			resp, err := client.CreateChatCompletion(context.TODO(), req)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
+		})
+
+		// A text completion against the "gpt4all" model yields exactly one
+		// non-empty choice.
+		It("can generate completions from model configs", func() {
+			req := openai.CompletionRequest{
+				Model:  "gpt4all",
+				Prompt: "abcdedfghikl",
+			}
+			resp, err := client.CreateCompletion(context.TODO(), req)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
+		})
+
+		// A chat completion against the "gpt4all-2" model yields exactly one
+		// non-empty choice.
+		It("can generate chat completions from model configs", func() {
+			req := openai.ChatCompletionRequest{
+				Model: "gpt4all-2",
+				Messages: []openai.ChatCompletionMessage{
+					{Role: "user", Content: "abcdedfghikl"},
+				},
+			}
+			resp, err := client.CreateChatCompletion(context.TODO(), req)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
+		})
+
 		It("returns errors", func() {
+			backends := len(model.AutoLoadBackends) + 1 // +1 for huggingface
 			_, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "foomodel", Prompt: "abcdedfghikl"})
 			Expect(err).To(HaveOccurred())
-			Expect(err.Error()).To(ContainSubstring("error, status code: 500, message: llama: model does not exist"))
+			Expect(err.Error()).To(ContainSubstring(fmt.Sprintf("error, status code: 500, message: could not load model - all backends returned error: %d errors occurred:", backends)))
+		})
+		// Sends $TEST_DIR/audio.wav to the transcription endpoint and checks the
+		// opening words of the recognised text.
+		It("transcribes audio", func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+			audioPath := filepath.Join(os.Getenv("TEST_DIR"), "audio.wav")
+			req := openai.AudioRequest{
+				Model:    openai.Whisper1,
+				FilePath: audioPath,
+			}
+			resp, err := client.CreateTranscription(context.Background(), req)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(resp.Text).To(ContainSubstring("This is the Micro Machine Man presenting"))
+		})
+
+		// Embeddings must have 384 dimensions and be deterministic: embedding
+		// "sun" alone must equal its embedding from the batched request.
+		It("calculate embeddings", func() {
+			if runtime.GOOS != "linux" {
+				Skip("test supported only on linux")
+			}
+			resp, err := client.CreateEmbeddings(
+				context.Background(),
+				openai.EmbeddingRequest{
+					Model: openai.AdaEmbeddingV2,
+					Input: []string{"sun", "cat"},
+				},
+			)
+			// No extra description argument: Gomega requires it to be a string
+			// or func() string, so passing the error itself would panic exactly
+			// when the expectation fails.
+			Expect(err).ToNot(HaveOccurred())
+			// Guard the indexing below so a short reply fails cleanly.
+			Expect(len(resp.Data)).To(BeNumerically(">=", 2))
+			Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
+			Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))
+
+			sunEmbedding := resp.Data[0].Embedding
+			resp2, err := client.CreateEmbeddings(
+				context.Background(),
+				openai.EmbeddingRequest{
+					Model: openai.AdaEmbeddingV2,
+					Input: []string{"sun"},
+				},
+			)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp2.Data)).To(BeNumerically(">=", 1))
+			Expect(resp2.Data[0].Embedding).To(Equal(sunEmbedding))
+		})
+
+		// Specs for the "huggingface" external backend registered in BeforeEach.
+		Context("External gRPC calls", func() {
+			It("calculate embeddings with huggingface", func() {
+				if runtime.GOOS != "linux" {
+					Skip("test supported only on linux")
+				}
+
+				// Batched request: two inputs, two 384-dimensional vectors.
+				pairReq := openai.EmbeddingRequest{
+					Model: openai.AdaCodeSearchCode,
+					Input: []string{"sun", "cat"},
+				}
+				resp, err := client.CreateEmbeddings(context.Background(), pairReq)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
+				Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))
+
+				sunEmbedding := resp.Data[0].Embedding
+
+				// Single request: "sun" must match its batched embedding and
+				// differ from the one computed for "cat".
+				singleReq := openai.EmbeddingRequest{
+					Model: openai.AdaCodeSearchCode,
+					Input: []string{"sun"},
+				}
+				resp2, err := client.CreateEmbeddings(context.Background(), singleReq)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(resp2.Data[0].Embedding).To(Equal(sunEmbedding))
+				Expect(resp2.Data[0].Embedding).ToNot(Equal(resp.Data[1].Embedding))
+			})
+		})
+
+		Context("backends", func() {
+			It("runs rwkv completion", func() {
+				if runtime.GOOS != "linux" {
+					Skip("test supported only on linux")
+				}
+				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,"})
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices) > 0).To(BeTrue())
+				Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
+
+				stream, err := client.CreateCompletionStream(context.TODO(), openai.CompletionRequest{
+					Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,", Stream: true,
+				})
+				Expect(err).ToNot(HaveOccurred())
+				defer stream.Close()
+
+				tokens := 0
+				text := ""
+				for {
+					response, err := stream.Recv()
+					if errors.Is(err, io.EOF) {
+						break
+					}
+
+					Expect(err).ToNot(HaveOccurred())
+					text += response.Choices[0].Text
+					tokens++
+				}
+				Expect(text).ToNot(BeEmpty())
+				Expect(text).To(ContainSubstring("five"))
+				Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
+			})
+			// Checks the rwkv_test model with a blocking chat completion and a
+			// streamed one; the stream must arrive in more than one chunk.
+			It("runs rwkv chat completion", func() {
+				if runtime.GOOS != "linux" {
+					Skip("test supported only on linux")
+				}
+				// The same request is used for the blocking and the streaming call.
+				req := openai.ChatCompletionRequest{
+					Model: "rwkv_test",
+					Messages: []openai.ChatCompletionMessage{
+						{Content: "Can you count up to five?", Role: "user"},
+					},
+				}
+
+				resp, err := client.CreateChatCompletion(context.TODO(), req)
+				Expect(err).ToNot(HaveOccurred())
+				Expect(len(resp.Choices) > 0).To(BeTrue())
+				Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
+
+				stream, err := client.CreateChatCompletionStream(context.TODO(), req)
+				Expect(err).ToNot(HaveOccurred())
+				defer stream.Close()
+
+				// Drain the stream until EOF, counting chunks and joining the deltas.
+				chunks := 0
+				collected := ""
+				for {
+					chunk, recvErr := stream.Recv()
+					if errors.Is(recvErr, io.EOF) {
+						break
+					}
+
+					Expect(recvErr).ToNot(HaveOccurred())
+					collected += chunk.Choices[0].Delta.Content
+					chunks++
+				}
+				Expect(collected).ToNot(BeEmpty())
+				Expect(collected).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
+
+				Expect(chunks).ToNot(Or(Equal(1), Equal(0)))
+			})
 		})
 	})
+
+	// Specs that start the API with the config file named by $CONFIG_FILE and
+	// query the "list1" / "list2" models.
+	Context("Config file", func() {
+		BeforeEach(func() {
+			modelLoader = model.NewModelLoader(os.Getenv("MODELS_PATH"))
+			c, cancel = context.WithCancel(context.Background())
+
+			metricsService, err := metrics.SetupMetrics()
+			Expect(err).ToNot(HaveOccurred())
+
+			appOpts := append(commonOpts,
+				options.WithContext(c),
+				options.WithMetrics(metricsService),
+				options.WithModelLoader(modelLoader),
+				options.WithConfigFile(os.Getenv("CONFIG_FILE")),
+			)
+			app, err = App(appOpts...)
+			Expect(err).ToNot(HaveOccurred())
+			go app.Listen("127.0.0.1:9090")
+
+			// Both API clients target the same local endpoint.
+			clientCfg := openai.DefaultConfig("")
+			clientCfg.BaseURL = "http://127.0.0.1:9090/v1"
+			client2 = openaigo.NewClient("")
+			client2.BaseURL = clientCfg.BaseURL
+			client = openai.NewClientWithConfig(clientCfg)
+
+			// Block until the server answers a model listing.
+			Eventually(func() error {
+				_, listErr := client.ListModels(context.TODO())
+				return listErr
+			}, "2m").ShouldNot(HaveOccurred())
+		})
+		AfterEach(func() {
+			cancel()
+			app.Shutdown()
+		})
+		It("can generate chat completions from config file (list1)", func() {
+			req := openai.ChatCompletionRequest{
+				Model:    "list1",
+				Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}},
+			}
+			resp, err := client.CreateChatCompletion(context.TODO(), req)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
+		})
+		It("can generate chat completions from config file (list2)", func() {
+			req := openai.ChatCompletionRequest{
+				Model:    "list2",
+				Messages: []openai.ChatCompletionMessage{{Role: "user", Content: "abcdedfghikl"}},
+			}
+			resp, err := client.CreateChatCompletion(context.TODO(), req)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Message.Content).ToNot(BeEmpty())
+		})
+		It("can generate edit completions from config file", func() {
+			editReq := openaigo.EditCreateRequestBody{
+				Model:       "list2",
+				Instruction: "foo",
+				Input:       "bar",
+			}
+			resp, err := client2.CreateEdit(context.Background(), editReq)
+			Expect(err).ToNot(HaveOccurred())
+			Expect(len(resp.Choices)).To(Equal(1))
+			Expect(resp.Choices[0].Text).ToNot(BeEmpty())
+		})
+
+	})
 })
--- a/api/backend/embeddings.go
+++ b/api/backend/embeddings.go
@@ -0,0 +1,92 @@
+package backend
+
+import (
+	"fmt"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/pkg/grpc"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+)
+
+// ModelEmbedding loads the model described by c and returns a closure that
+// computes an embedding when invoked.
+//
+// When tokens is non-empty the embedding is computed from those token ids,
+// otherwise from the string s. An error is returned immediately when the
+// model configuration has embeddings disabled or the model cannot be loaded.
+// The closure strips trailing zero values from the vector before returning it.
+func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, c config.Config, o *options.Option) (func() ([]float32, error), error) {
+	if !c.Embeddings {
+		return nil, fmt.Errorf("endpoint disabled for this model by API configuration")
+	}
+
+	loadOpts := modelOpts(c, o, []model.Option{
+		model.WithLoadGRPCLoadModelOpts(gRPCModelOpts(c)),
+		model.WithThreads(uint32(c.Threads)),
+		model.WithAssetDir(o.AssetsDestination),
+		model.WithModel(c.Model),
+		model.WithContext(o.Context),
+	})
+
+	// With an explicit backend load exactly that one; otherwise let the
+	// loader try the available backends.
+	var (
+		backend interface{}
+		err     error
+	)
+	if c.Backend != "" {
+		loadOpts = append(loadOpts, model.WithBackendString(c.Backend))
+		backend, err = loader.BackendLoader(loadOpts...)
+	} else {
+		backend, err = loader.GreedyLoader(loadOpts...)
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	var compute func() ([]float32, error)
+	if client, ok := backend.(*grpc.Client); ok {
+		compute = func() ([]float32, error) {
+			req := gRPCPredictOpts(c, loader.ModelPath)
+			if len(tokens) > 0 {
+				// Token ids take precedence over the raw string.
+				ids := make([]int32, len(tokens))
+				for i, t := range tokens {
+					ids[i] = int32(t)
+				}
+				req.EmbeddingTokens = ids
+			} else {
+				req.Embeddings = s
+			}
+
+			res, err := client.Embeddings(o.Context, req)
+			if err != nil {
+				return nil, err
+			}
+			return res.Embeddings, nil
+		}
+	} else {
+		compute = func() ([]float32, error) {
+			return nil, fmt.Errorf("embeddings not supported by the backend")
+		}
+	}
+
+	return func() ([]float32, error) {
+		vec, err := compute()
+		if err != nil {
+			return vec, err
+		}
+		// Remove trailing 0s
+		end := len(vec)
+		for end > 0 && vec[end-1] == 0.0 {
+			end--
+		}
+		return vec[:end], nil
+	}, nil
+}
--- a/api/backend/image.go
+++ b/api/backend/image.go
@@ -0,0 +1,60 @@
+package backend
+
+import (
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+)
+
+// ImageGeneration loads the image backend configured in c and returns a
+// closure that issues one GenerateImage request with the given geometry,
+// mode, step count, seed, prompts and source/destination paths.
+//
+// Loading happens eagerly, so a load failure is returned here; the closure
+// only reports the error of the generation call itself.
+func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, c config.Config, o *options.Option) (func() error, error) {
+	// Backend load-time options, mostly taken from the diffusers section of
+	// the model configuration.
+	loadModelOpts := &proto.ModelOptions{
+		CUDA:          c.Diffusers.CUDA,
+		SchedulerType: c.Diffusers.SchedulerType,
+		PipelineType:  c.Diffusers.PipelineType,
+		CFGScale:      c.Diffusers.CFGScale,
+		LoraAdapter:   c.LoraAdapter,
+		LoraScale:     c.LoraScale,
+		LoraBase:      c.LoraBase,
+		IMG2IMG:       c.Diffusers.IMG2IMG,
+		CLIPModel:     c.Diffusers.ClipModel,
+		CLIPSubfolder: c.Diffusers.ClipSubFolder,
+		CLIPSkip:      int32(c.Diffusers.ClipSkip),
+	}
+
+	loaderOpts := modelOpts(c, o, []model.Option{
+		model.WithBackendString(c.Backend),
+		model.WithAssetDir(o.AssetsDestination),
+		model.WithThreads(uint32(c.Threads)),
+		model.WithContext(o.Context),
+		model.WithModel(c.Model),
+		model.WithLoadGRPCLoadModelOpts(loadModelOpts),
+	})
+
+	inferenceModel, err := loader.BackendLoader(loaderOpts...)
+	if err != nil {
+		return nil, err
+	}
+
+	return func() error {
+		req := &proto.GenerateImageRequest{
+			Height:           int32(height),
+			Width:            int32(width),
+			Mode:             int32(mode),
+			Step:             int32(step),
+			Seed:             int32(seed),
+			CLIPSkip:         int32(c.Diffusers.ClipSkip),
+			PositivePrompt:   positive_prompt,
+			NegativePrompt:   negative_prompt,
+			Dst:              dst,
+			Src:              src,
+			EnableParameters: c.Diffusers.EnableParameters,
+		}
+		_, genErr := inferenceModel.GenerateImage(o.Context, req)
+		return genErr
+	}, nil
+}
--- a/api/backend/llm.go
+++ b/api/backend/llm.go
@@ -0,0 +1,164 @@
+package backend
+
+import (
+	"context"
+	"os"
+	"regexp"
+	"strings"
+	"sync"
+	"unicode/utf8"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/pkg/gallery"
+	"github.com/go-skynet/LocalAI/pkg/grpc"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/utils"
+)
+
+type LLMResponse struct {
+	Response string // should this be []byte?
+	Usage    TokenUsage
+}
+
+type TokenUsage struct {
+	Prompt     int
+	Completion int
+}
+
+// ModelInference loads the model described by c and returns a closure that
+// runs one prediction for prompt s (plus optional images).
+//
+// If o.AutoloadGalleries is set and the model file cannot be stat'ed, the
+// model is first installed from the configured galleries. With c.Backend
+// empty the loader tries the available backends, otherwise only the named
+// one is loaded.
+//
+// When tokenCallback is non-nil (or the per-model "usage" feature flag is
+// enabled) the prediction is streamed and tokenCallback is invoked once per
+// decoded rune; otherwise a single blocking Predict call is made. The value
+// returned by tokenCallback is not used to stop the stream.
+func ModelInference(ctx context.Context, s string, images []string, loader *model.ModelLoader, c config.Config, o *options.Option, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+	modelFile := c.Model
+
+	grpcOpts := gRPCModelOpts(c)
+
+	var inferenceModel *grpc.Client
+	var err error
+
+	opts := modelOpts(c, o, []model.Option{
+		model.WithLoadGRPCLoadModelOpts(grpcOpts),
+		model.WithThreads(uint32(c.Threads)), // some models uses this to allocate threads during startup
+		model.WithAssetDir(o.AssetsDestination),
+		model.WithModel(modelFile),
+		model.WithContext(o.Context),
+	})
+
+	if c.Backend != "" {
+		opts = append(opts, model.WithBackendString(c.Backend))
+	}
+
+	// Check if the modelFile exists, if it doesn't try to load it from the gallery
+	// NOTE(review): modelFile is stat'ed as given, not joined with
+	// loader.ModelPath — confirm this is the intended location to probe.
+	if o.AutoloadGalleries { // experimental
+		if _, err := os.Stat(modelFile); os.IsNotExist(err) {
+			utils.ResetDownloadTimers()
+			// if we failed to load the model, we try to download it
+			err := gallery.InstallModelFromGalleryByName(o.Galleries, modelFile, loader.ModelPath, gallery.GalleryModel{}, utils.DisplayDownloadFunction)
+			if err != nil {
+				return nil, err
+			}
+		}
+	}
+
+	if c.Backend == "" {
+		inferenceModel, err = loader.GreedyLoader(opts...)
+	} else {
+		inferenceModel, err = loader.BackendLoader(opts...)
+	}
+
+	if err != nil {
+		return nil, err
+	}
+
+	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
+	fn := func() (LLMResponse, error) {
+		opts := gRPCPredictOpts(c, loader.ModelPath)
+		opts.Prompt = s
+		opts.Images = images
+
+		tokenUsage := TokenUsage{}
+
+		// check the per-model feature flag for usage, since tokenCallback may have a cost.
+		// Defaults to off as for now it is still experimental
+		if c.FeatureFlag.Enabled("usage") {
+			userTokenCallback := tokenCallback
+			if userTokenCallback == nil {
+				userTokenCallback = func(token string, usage TokenUsage) bool {
+					return true
+				}
+			}
+
+			promptInfo, pErr := inferenceModel.TokenizeString(ctx, opts)
+			if pErr == nil && promptInfo.Length > 0 {
+				tokenUsage.Prompt = int(promptInfo.Length)
+			}
+
+			// NOTE(review): Completion is incremented once per callback, and
+			// the stream loop below calls back once per decoded rune rather
+			// than once per backend token — confirm that is the intended unit.
+			tokenCallback = func(token string, usage TokenUsage) bool {
+				tokenUsage.Completion++
+				return userTokenCallback(token, tokenUsage)
+			}
+		}
+
+		if tokenCallback == nil {
+			// TODO: Is the chicken bit the only way to get here? is that acceptable?
+			reply, err := inferenceModel.Predict(ctx, opts)
+			if err != nil {
+				return LLMResponse{}, err
+			}
+			return LLMResponse{
+				Response: string(reply.Message),
+				Usage:    tokenUsage,
+			}, nil
+		}
+
+		var ss strings.Builder
+
+		// Bytes received from the backend that do not yet form a whole rune.
+		var partialRune []byte
+		err := inferenceModel.PredictStream(ctx, opts, func(chars []byte) {
+			partialRune = append(partialRune, chars...)
+
+			// utf8.FullRune is false only for a truncated multi-byte sequence,
+			// in which case we wait for more bytes. Comparing the decoded rune
+			// with utf8.RuneError cannot tell "truncated" apart from a genuine
+			// U+FFFD or a single invalid byte, and would stop all further
+			// output; those now decode (invalid bytes as U+FFFD) and the
+			// stream keeps flowing.
+			for len(partialRune) > 0 && utf8.FullRune(partialRune) {
+				r, size := utf8.DecodeRune(partialRune)
+
+				tokenCallback(string(r), tokenUsage)
+				ss.WriteRune(r)
+
+				partialRune = partialRune[size:]
+			}
+		})
+		return LLMResponse{
+			Response: ss.String(),
+			Usage:    tokenUsage,
+		}, err
+	}
+
+	return fn, nil
+}
+
+var cutstrings map[string]*regexp.Regexp = make(map[string]*regexp.Regexp)
+var mu sync.Mutex = sync.Mutex{}
+
+// Finetune post-processes a raw prediction according to the model config:
+//   - with Echo set, the input is prepended to the prediction;
+//   - every Cutstrings entry is compiled as a regular expression and all of
+//     its matches are removed;
+//   - for every TrimSpace entry, that prefix is stripped (if present) and the
+//     result is trimmed of surrounding whitespace.
+//
+// An invalid Cutstrings pattern panics (regexp.MustCompile).
+func Finetune(config config.Config, input, prediction string) string {
+	if config.Echo {
+		prediction = input + prediction
+	}
+
+	for _, c := range config.Cutstrings {
+		prediction = cutstringRegexp(c).ReplaceAllString(prediction, "")
+	}
+
+	for _, c := range config.TrimSpace {
+		prediction = strings.TrimSpace(strings.TrimPrefix(prediction, c))
+	}
+	return prediction
+
+}
+
+// cutstringRegexp returns the compiled form of pattern, compiling and caching
+// it in cutstrings on first use. The unlock is deferred so that a panic from
+// regexp.MustCompile on an invalid pattern cannot leave mu locked and block
+// every later Finetune call.
+func cutstringRegexp(pattern string) *regexp.Regexp {
+	mu.Lock()
+	defer mu.Unlock()
+
+	if reg, ok := cutstrings[pattern]; ok {
+		return reg
+	}
+	reg := regexp.MustCompile(pattern)
+	cutstrings[pattern] = reg
+	return reg
+}
--- a/api/backend/options.go
+++ b/api/backend/options.go
@@ -0,0 +1,122 @@
+package backend
+
+import (
+	"os"
+	"path/filepath"
+
+	pb "github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+)
+
+// modelOpts extends opts with the loader options derived from the model
+// config c and the application options o: the single-active-backend switch,
+// the gRPC attempt count and delay (only when non-zero), and one entry per
+// registered external gRPC backend.
+func modelOpts(c config.Config, o *options.Option, opts []model.Option) []model.Option {
+	if o.SingleBackend {
+		opts = append(opts, model.WithSingleActiveBackend())
+	}
+
+	// Zero means "not configured": leave the loader defaults untouched.
+	if attempts := c.GRPC.Attempts; attempts != 0 {
+		opts = append(opts, model.WithGRPCAttempts(attempts))
+	}
+	if delay := c.GRPC.AttemptsSleepTime; delay != 0 {
+		opts = append(opts, model.WithGRPCAttemptsDelay(delay))
+	}
+
+	for name, uri := range o.ExternalGRPCBackends {
+		opts = append(opts, model.WithExternalBackend(name, uri))
+	}
+
+	return opts
+}
+
+// gRPCModelOpts translates the model configuration c into the load-time
+// options sent to a gRPC backend. The batch size falls back to 512 when the
+// configuration leaves it at zero.
+func gRPCModelOpts(c config.Config) *pb.ModelOptions {
+	batch := c.Batch
+	if batch == 0 {
+		batch = 512
+	}
+
+	modelOptions := &pb.ModelOptions{
+		// Context, seeding and batching
+		ContextSize: int32(c.ContextSize),
+		Seed:        int32(c.Seed),
+		NBatch:      int32(batch),
+		Threads:     int32(c.Threads),
+		Embeddings:  c.Embeddings,
+
+		// Memory and device placement
+		F16Memory:   c.F16,
+		MLock:       c.MMlock,
+		MMap:        c.MMap,
+		NUMA:        c.NUMA,
+		LowVRAM:     c.LowVRAM,
+		NGPULayers:  int32(c.NGPULayers),
+		MainGPU:     c.MainGPU,
+		TensorSplit: c.TensorSplit,
+		NoMulMatQ:   c.NoMulMatQ,
+
+		// Auxiliary models and files
+		DraftModel:   c.DraftModel,
+		AudioPath:    c.VallE.AudioPath,
+		Quantization: c.Quantization,
+		MMProj:       c.MMProj,
+
+		// LoRA
+		LoraAdapter: c.LoraAdapter,
+		LoraBase:    c.LoraBase,
+		LoraScale:   c.LoraScale,
+
+		// RoPE / YaRN and model hyper-parameters
+		RopeFreqBase:   c.RopeFreqBase,
+		RopeFreqScale:  c.RopeFreqScale,
+		YarnExtFactor:  c.YarnExtFactor,
+		YarnAttnFactor: c.YarnAttnFactor,
+		YarnBetaFast:   c.YarnBetaFast,
+		YarnBetaSlow:   c.YarnBetaSlow,
+		NGQA:           c.NGQA,
+		RMSNormEps:     c.RMSNormEps,
+
+		// AutoGPTQ
+		ModelBaseName:    c.AutoGPTQ.ModelBaseName,
+		Device:           c.AutoGPTQ.Device,
+		UseTriton:        c.AutoGPTQ.Triton,
+		UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
+
+		// RWKV
+		Tokenizer: c.Tokenizer,
+	}
+	return modelOptions
+}
+
+// gRPCPredictOpts translates the model configuration c into the per-request
+// prediction options sent to a gRPC backend.
+//
+// When c.PromptCachePath is set it is resolved relative to modelPath and its
+// parent directory is created on a best-effort basis (a creation error is
+// ignored here).
+func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
+	predictOptions := &pb.PredictOptions{
+		// Sampling
+		Temperature:       float32(c.Temperature),
+		TopP:              float32(c.TopP),
+		TopK:              int32(c.TopK),
+		TypicalP:          float32(c.TypicalP),
+		TailFreeSamplingZ: float32(c.TFZ),
+		FrequencyPenalty:  float32(c.FrequencyPenalty),
+		// NOTE(review): Repeat is filled from RepeatPenalty truncated to
+		// int32 — confirm this is the intended source field.
+		Repeat:      int32(c.RepeatPenalty),
+		Mirostat:    int32(c.LLMConfig.Mirostat),
+		MirostatETA: float32(c.LLMConfig.MirostatETA),
+		MirostatTAU: float32(c.LLMConfig.MirostatTAU),
+		Seed:        int32(c.Seed),
+
+		// Generation limits and prompt shaping
+		Tokens:              int32(c.Maxtokens),
+		NKeep:               int32(c.Keep),
+		NDraft:              c.NDraft,
+		StopPrompts:         c.StopWords,
+		IgnoreEOS:           c.IgnoreEOS,
+		Grammar:             c.Grammar,
+		NegativePrompt:      c.NegativePrompt,
+		NegativePromptScale: c.NegativePromptScale,
+
+		// Prompt cache (the path is filled in below)
+		PromptCacheAll: c.PromptCacheAll,
+		PromptCacheRO:  c.PromptCacheRO,
+
+		// Runtime and hardware
+		Threads:       int32(c.Threads),
+		Batch:         int32(c.Batch),
+		F16KV:         c.F16,
+		MLock:         c.MMlock,
+		MMap:          c.MMap,
+		MainGPU:       c.MainGPU,
+		TensorSplit:   c.TensorSplit,
+		RopeFreqBase:  c.RopeFreqBase,
+		RopeFreqScale: c.RopeFreqScale,
+
+		// Diagnostics
+		DebugMode: c.Debug,
+		Debug:     c.Debug,
+	}
+
+	if c.PromptCachePath != "" {
+		cacheFile := filepath.Join(modelPath, c.PromptCachePath)
+		// Best effort: the result of MkdirAll is deliberately not checked.
+		_ = os.MkdirAll(filepath.Dir(cacheFile), 0755)
+		predictOptions.PromptCachePath = cacheFile
+	}
+
+	return predictOptions
+}
--- a/api/backend/transcript.go
+++ b/api/backend/transcript.go
@@ -0,0 +1,39 @@
+package backend
+
+import (
+	"context"
+	"fmt"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/schema"
+
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+)
+
+// ModelTranscription loads the whisper backend for the model named in c and
+// asks it to transcribe the audio file at path audio in the given language.
+//
+// NOTE(review): the loader parameter is not used; the backend is loaded
+// through o.Loader and the request runs on context.Background() rather than
+// o.Context — confirm both are intended.
+func ModelTranscription(audio, language string, loader *model.ModelLoader, c config.Config, o *options.Option) (*schema.Result, error) {
+	loadOpts := modelOpts(c, o, []model.Option{
+		model.WithBackendString(model.WhisperBackend),
+		model.WithModel(c.Model),
+		model.WithContext(o.Context),
+		model.WithThreads(uint32(c.Threads)),
+		model.WithAssetDir(o.AssetsDestination),
+	})
+
+	whisperModel, err := o.Loader.BackendLoader(loadOpts...)
+	switch {
+	case err != nil:
+		return nil, err
+	case whisperModel == nil:
+		return nil, fmt.Errorf("could not load whisper model")
+	}
+
+	request := &proto.TranscriptRequest{
+		Dst:      audio,
+		Language: language,
+		Threads:  uint32(c.Threads),
+	}
+	return whisperModel.AudioTranscription(context.Background(), request)
+}
--- a/api/backend/tts.go
+++ b/api/backend/tts.go
@@ -0,0 +1,75 @@
+package backend
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	api_config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/utils"
+)
+
+// generateUniqueFileName returns a file name (not a full path) of the form
+// baseName+ext that does not currently exist inside dir. When the plain name
+// is taken it tries baseName_2, baseName_3, ... in order.
+//
+// If the existence check fails for any reason other than "file exists"
+// (for example dir is not a directory or is not readable) the current
+// candidate is returned immediately instead of probing forever; the caller's
+// subsequent write will surface the real error.
+//
+// The check is not atomic: another process may create the file between this
+// call and the caller's write.
+func generateUniqueFileName(dir, baseName, ext string) string {
+	counter := 1
+	fileName := baseName + ext
+
+	for {
+		_, err := os.Stat(filepath.Join(dir, fileName))
+		if err != nil {
+			// Not-exist means the name is free; any other error means we
+			// cannot tell, and retrying with a new suffix would not help.
+			return fileName
+		}
+
+		counter++
+		fileName = fmt.Sprintf("%s_%d%s", baseName, counter, ext)
+	}
+}
+
+// ModelTTS renders text to a WAV file inside o.AudioDir using the requested
+// backend (model.PiperBackend when backend is empty).
+//
+// It returns the path of the generated file, the backend result and any error.
+// When modelFile is non-empty it is resolved relative to o.Loader.ModelPath and
+// must stay inside that directory.
+//
+// NOTE(review): the loader parameter is not used; o.Loader is used instead.
+func ModelTTS(backend, text, modelFile string, loader *model.ModelLoader, o *options.Option) (string, *proto.Result, error) {
+	// modelFile may come straight from a request body, so validate it before
+	// the loader is asked to touch anything derived from it.
+	modelPath := ""
+	if modelFile != "" {
+		modelPath = filepath.Join(o.Loader.ModelPath, modelFile)
+		if err := utils.VerifyPath(modelPath, o.Loader.ModelPath); err != nil {
+			return "", nil, err
+		}
+	}
+
+	bb := backend
+	if bb == "" {
+		bb = model.PiperBackend
+	}
+	opts := modelOpts(api_config.Config{}, o, []model.Option{
+		model.WithBackendString(bb),
+		model.WithModel(modelFile),
+		model.WithContext(o.Context),
+		model.WithAssetDir(o.AssetsDestination),
+	})
+	piperModel, err := o.Loader.BackendLoader(opts...)
+	if err != nil {
+		return "", nil, err
+	}
+
+	if piperModel == nil {
+		return "", nil, fmt.Errorf("could not load piper model")
+	}
+
+	if err := os.MkdirAll(o.AudioDir, 0755); err != nil {
+		// %w keeps the underlying error inspectable with errors.Is/As.
+		return "", nil, fmt.Errorf("failed creating audio directory: %w", err)
+	}
+
+	fileName := generateUniqueFileName(o.AudioDir, "piper", ".wav")
+	filePath := filepath.Join(o.AudioDir, fileName)
+
+	res, err := piperModel.TTS(context.Background(), &proto.TTSRequest{
+		Text:  text,
+		Model: modelPath,
+		Dst:   filePath,
+	})
+
+	return filePath, res, err
+}
--- a/api/config/config.go
+++ b/api/config/config.go
@@ -0,0 +1,290 @@
+package api_config
+
+import (
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+	"strings"
+	"sync"
+
+	"gopkg.in/yaml.v3"
+)
+
+// Config is the per-model configuration as read from a YAML file.
+type Config struct {
+	// Prediction defaults, stored under the "parameters" YAML key.
+	PredictionOptions `yaml:"parameters"`
+	Name              string `yaml:"name"`
+
+	F16            bool              `yaml:"f16"`
+	Threads        int               `yaml:"threads"`
+	Debug          bool              `yaml:"debug"`
+	Roles          map[string]string `yaml:"roles"`
+	Embeddings     bool              `yaml:"embeddings"`
+	Backend        string            `yaml:"backend"`
+	TemplateConfig TemplateConfig    `yaml:"template"`
+
+	// The fields below are excluded from YAML (`yaml:"-"`).
+	PromptStrings, InputStrings                []string `yaml:"-"`
+	InputToken                                 [][]int  `yaml:"-"`
+	functionCallString, functionCallNameString string   `yaml:"-"`
+
+	FunctionsConfig Functions `yaml:"function"`
+
+	FeatureFlag FeatureFlag `yaml:"feature_flags"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early.
+	// LLM configs (GPT4ALL, Llama.cpp, ...)
+	LLMConfig `yaml:",inline"`
+
+	// AutoGPTQ specifics
+	AutoGPTQ AutoGPTQ `yaml:"autogptq"`
+
+	// Diffusers
+	Diffusers Diffusers `yaml:"diffusers"`
+
+	Step int `yaml:"step"`
+
+	// GRPC Options
+	GRPC GRPC `yaml:"grpc"`
+
+	// Vall-e-x
+	VallE VallE `yaml:"vall-e"`
+}
+
+// VallE holds the settings stored under the "vall-e" key of a model config.
+type VallE struct {
+	AudioPath string `yaml:"audio_path"`
+}
+
+// FeatureFlag maps a flag name to an optional boolean; a missing key and a
+// nil value are both treated as "disabled".
+type FeatureFlag map[string]*bool
+
+// Enabled reports whether flag s is present, non-nil and set to true.
+func (ff FeatureFlag) Enabled(s string) bool {
+	if flag, found := ff[s]; found && flag != nil {
+		return *flag
+	}
+	return false
+}
+
+// GRPC holds the settings stored under the "grpc" key of a model config.
+// NOTE(review): the unit of AttemptsSleepTime is not visible here — confirm
+// against the code that consumes it.
+type GRPC struct {
+	Attempts          int `yaml:"attempts"`
+	AttemptsSleepTime int `yaml:"attempts_sleep_time"`
+}
+
+// Diffusers holds the settings stored under the "diffusers" key of a model config.
+type Diffusers struct {
+	PipelineType     string  `yaml:"pipeline_type"`
+	SchedulerType    string  `yaml:"scheduler_type"`
+	CUDA             bool    `yaml:"cuda"`
+	EnableParameters string  `yaml:"enable_parameters"` // A list of comma separated parameters to specify
+	CFGScale         float32 `yaml:"cfg_scale"`         // Classifier-Free Guidance Scale
+	IMG2IMG          bool    `yaml:"img2img"`           // Image to Image Diffuser
+	ClipSkip         int     `yaml:"clip_skip"`         // Skip every N frames (NOTE(review): "frames" looks doubtful for a CLIP setting — confirm)
+	ClipModel        string  `yaml:"clip_model"`        // Clip model to use
+	ClipSubFolder    string  `yaml:"clip_subfolder"`    // Subfolder to use for clip model
+}
+
+// LLMConfig groups the LLM backend options of a model config. It is embedded
+// in Config with `yaml:",inline"`, so these keys live at the top level of the
+// model YAML rather than under a nested key.
+type LLMConfig struct {
+	SystemPrompt    string   `yaml:"system_prompt"`
+	TensorSplit     string   `yaml:"tensor_split"`
+	MainGPU         string   `yaml:"main_gpu"`
+	RMSNormEps      float32  `yaml:"rms_norm_eps"`
+	NGQA            int32    `yaml:"ngqa"`
+	PromptCachePath string   `yaml:"prompt_cache_path"`
+	PromptCacheAll  bool     `yaml:"prompt_cache_all"`
+	PromptCacheRO   bool     `yaml:"prompt_cache_ro"`
+	MirostatETA     float64  `yaml:"mirostat_eta"`
+	MirostatTAU     float64  `yaml:"mirostat_tau"`
+	Mirostat        int      `yaml:"mirostat"`
+	NGPULayers      int      `yaml:"gpu_layers"`
+	MMap            bool     `yaml:"mmap"`
+	MMlock          bool     `yaml:"mmlock"`
+	LowVRAM         bool     `yaml:"low_vram"`
+	Grammar         string   `yaml:"grammar"`
+	StopWords       []string `yaml:"stopwords"`
+	Cutstrings      []string `yaml:"cutstrings"`
+	TrimSpace       []string `yaml:"trimspace"`
+	ContextSize     int      `yaml:"context_size"`
+	NUMA            bool     `yaml:"numa"`
+	LoraAdapter     string   `yaml:"lora_adapter"`
+	LoraBase        string   `yaml:"lora_base"`
+	LoraScale       float32  `yaml:"lora_scale"`
+	NoMulMatQ       bool     `yaml:"no_mulmatq"`
+	DraftModel      string   `yaml:"draft_model"`
+	NDraft          int32    `yaml:"n_draft"`
+	Quantization    string   `yaml:"quantization"`
+	MMProj          string   `yaml:"mmproj"`
+
+	// RoPE scaling / YaRN parameters.
+	RopeScaling    string  `yaml:"rope_scaling"`
+	YarnExtFactor  float32 `yaml:"yarn_ext_factor"`
+	YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
+	YarnBetaFast   float32 `yaml:"yarn_beta_fast"`
+	YarnBetaSlow   float32 `yaml:"yarn_beta_slow"`
+}
+
+// AutoGPTQ holds the settings stored under the "autogptq" key of a model config.
+type AutoGPTQ struct {
+	ModelBaseName    string `yaml:"model_base_name"`
+	Device           string `yaml:"device"`
+	Triton           bool   `yaml:"triton"`
+	UseFastTokenizer bool   `yaml:"use_fast_tokenizer"`
+}
+
+// Functions holds the settings stored under the "function" key of a model config.
+type Functions struct {
+	DisableNoAction         bool   `yaml:"disable_no_action"`
+	NoActionFunctionName    string `yaml:"no_action_function_name"`
+	NoActionDescriptionName string `yaml:"no_action_description_name"`
+}
+
+// TemplateConfig holds the settings stored under the "template" key of a
+// model config, one entry per kind of request.
+type TemplateConfig struct {
+	Chat        string `yaml:"chat"`
+	ChatMessage string `yaml:"chat_message"`
+	Completion  string `yaml:"completion"`
+	Edit        string `yaml:"edit"`
+	Functions   string `yaml:"function"`
+}
+
+// ConfigLoader is a registry of model configs keyed by Config.Name. The
+// embedded mutex guards the configs map; every method that touches the map
+// takes it.
+type ConfigLoader struct {
+	configs map[string]Config
+	sync.Mutex
+}
+
+// SetFunctionCallString stores s as the function-call mode consulted by
+// ShouldUseFunctions (where the value "none" is significant).
+func (c *Config) SetFunctionCallString(s string) {
+	c.functionCallString = s
+}
+
+// SetFunctionCallNameString stores s as the name of the specific function to
+// call; it is returned by FunctionToCall.
+func (c *Config) SetFunctionCallNameString(s string) {
+	c.functionCallNameString = s
+}
+
+// ShouldUseFunctions reports whether function calling applies: it is true
+// unless the function-call mode is exactly "none" and no specific function
+// name has been set.
+//
+// NOTE(review): an empty mode therefore counts as "use functions" — confirm
+// that this is the intended default.
+func (c *Config) ShouldUseFunctions() bool {
+	if c.functionCallString != "none" {
+		return true
+	}
+	return c.ShouldCallSpecificFunction()
+}
+
+// ShouldCallSpecificFunction reports whether a specific function name has
+// been set via SetFunctionCallNameString.
+func (c *Config) ShouldCallSpecificFunction() bool {
+	return c.functionCallNameString != ""
+}
+
+// FunctionToCall returns the function name set via SetFunctionCallNameString
+// (empty when none was set).
+func (c *Config) FunctionToCall() string {
+	return c.functionCallNameString
+}
+
+// defaultPredictOptions returns the built-in sampling defaults for modelFile;
+// every field not set here keeps its zero value.
+func defaultPredictOptions(modelFile string) PredictionOptions {
+	var defaults PredictionOptions
+	defaults.Model = modelFile
+	defaults.Temperature = 0.9
+	defaults.TopP = 0.7
+	defaults.TopK = 80
+	defaults.Maxtokens = 512
+	return defaults
+}
+
+// DefaultConfig builds a Config for modelFile that carries only the default
+// prediction options.
+func DefaultConfig(modelFile string) *Config {
+	cfg := new(Config)
+	cfg.PredictionOptions = defaultPredictOptions(modelFile)
+	return cfg
+}
+
+// NewConfigLoader returns a ConfigLoader with an empty, ready-to-use registry.
+func NewConfigLoader() *ConfigLoader {
+	loader := new(ConfigLoader)
+	loader.configs = map[string]Config{}
+	return loader
+}
+// ReadConfigFile parses file as a YAML list of model configs and returns the
+// decoded entries. Read and decode failures are wrapped and returned.
+func ReadConfigFile(file string) ([]*Config, error) {
+	raw, err := os.ReadFile(file)
+	if err != nil {
+		return nil, fmt.Errorf("cannot read config file: %w", err)
+	}
+
+	// Start from an empty, non-nil slice so the result is never nil on success.
+	parsed := []*Config{}
+	if err = yaml.Unmarshal(raw, &parsed); err != nil {
+		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
+	}
+	return parsed, nil
+}
+
+// ReadConfig parses file as a single YAML model config. Read and decode
+// failures are wrapped and returned.
+func ReadConfig(file string) (*Config, error) {
+	raw, err := os.ReadFile(file)
+	if err != nil {
+		return nil, fmt.Errorf("cannot read config file: %w", err)
+	}
+
+	cfg := new(Config)
+	if err = yaml.Unmarshal(raw, cfg); err != nil {
+		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
+	}
+	return cfg, nil
+}
+
+// LoadConfigFile reads a YAML file containing a list of model configs and
+// registers each one under its Name, replacing any existing entry with the
+// same name. Null list entries are skipped.
+func (cm *ConfigLoader) LoadConfigFile(file string) error {
+	cm.Lock()
+	defer cm.Unlock()
+	c, err := ReadConfigFile(file)
+	if err != nil {
+		return fmt.Errorf("cannot load config file: %w", err)
+	}
+
+	for _, cc := range c {
+		// A null item in the YAML list decodes to a nil pointer; skip it
+		// rather than panic on the dereference below.
+		if cc == nil {
+			continue
+		}
+		cm.configs[cc.Name] = *cc
+	}
+	return nil
+}
+
+// LoadConfig reads a single-model YAML file and registers it under its Name,
+// replacing any existing entry with the same name.
+func (cm *ConfigLoader) LoadConfig(file string) error {
+	cm.Lock()
+	defer cm.Unlock()
+
+	cfg, err := ReadConfig(file)
+	if err == nil {
+		cm.configs[cfg.Name] = *cfg
+		return nil
+	}
+	return fmt.Errorf("cannot read config file: %w", err)
+}
+
+// GetConfig returns a copy of the config registered under m and whether such
+// an entry exists.
+func (cm *ConfigLoader) GetConfig(m string) (Config, bool) {
+	cm.Lock()
+	defer cm.Unlock()
+
+	cfg, found := cm.configs[m]
+	return cfg, found
+}
+
+// GetAllConfigs returns copies of all registered configs in map iteration
+// (i.e. unspecified) order. The result is nil when nothing is registered.
+func (cm *ConfigLoader) GetAllConfigs() []Config {
+	cm.Lock()
+	defer cm.Unlock()
+
+	var all []Config
+	for name := range cm.configs {
+		all = append(all, cm.configs[name])
+	}
+	return all
+}
+
+// ListConfigs returns the names of all registered configs in map iteration
+// (i.e. unspecified) order. The result is nil when nothing is registered.
+func (cm *ConfigLoader) ListConfigs() []string {
+	cm.Lock()
+	defer cm.Unlock()
+
+	var names []string
+	for name := range cm.configs {
+		names = append(names, name)
+	}
+	return names
+}
+
+// LoadConfigs scans the directory at path and registers every entry whose
+// name contains ".yaml" as a single-model config, keyed by its Name.
+//
+// Listing or stat failures are returned. Files that cannot be read or parsed
+// are silently skipped.
+func (cm *ConfigLoader) LoadConfigs(path string) error {
+	cm.Lock()
+	defer cm.Unlock()
+
+	dirEntries, err := os.ReadDir(path)
+	if err != nil {
+		return err
+	}
+
+	// Stat everything first so a failure aborts before any config is stored.
+	infos := make([]fs.FileInfo, 0, len(dirEntries))
+	for i := range dirEntries {
+		fi, err := dirEntries[i].Info()
+		if err != nil {
+			return err
+		}
+		infos = append(infos, fi)
+	}
+
+	for _, fi := range infos {
+		name := fi.Name()
+		// Only names containing ".yaml" are treated as configs; everything
+		// else (templates, .keep files, ...) is ignored.
+		if strings.Contains(name, ".yaml") {
+			if cfg, err := ReadConfig(filepath.Join(path, name)); err == nil {
+				cm.configs[cfg.Name] = *cfg
+			}
+		}
+	}
+
+	return nil
+}
--- a/api/config/config_test.go
+++ b/api/config/config_test.go
@@ -0,0 +1,56 @@
+package api_config_test
+
+import (
+	"os"
+
+	. "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/pkg/model"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for the config reading and loading helpers. They depend on the
+// CONFIG_FILE and MODELS_PATH environment variables.
+var _ = Describe("Test cases for config related functions", func() {
+
+	var (
+		configFile string
+	)
+
+	Context("Test Read configuration functions", func() {
+		configFile = os.Getenv("CONFIG_FILE")
+		It("Test ReadConfigFile", func() {
+			config, err := ReadConfigFile(configFile)
+			Expect(err).To(BeNil())
+			Expect(config).ToNot(BeNil())
+			// the file is expected to hold two configs, in this order
+			Expect(config[0].Name).To(Equal("list1"))
+			Expect(config[1].Name).To(Equal("list2"))
+		})
+
+		It("Test LoadConfigs", func() {
+			cm := NewConfigLoader()
+			opts := options.NewOptions()
+			modelLoader := model.NewModelLoader(os.Getenv("MODELS_PATH"))
+			options.WithModelLoader(modelLoader)(opts)
+
+			err := cm.LoadConfigs(opts.Loader.ModelPath)
+			Expect(err).To(BeNil())
+			Expect(cm.ListConfigs()).ToNot(BeNil())
+
+			// the loaded configs should include the "gpt4all" model
+			Expect(cm.ListConfigs()).To(ContainElements("gpt4all"))
+
+			// the loaded configs should include the "gpt4all-2" model
+			Expect(cm.ListConfigs()).To(ContainElements("gpt4all-2"))
+
+			// the loaded configs should include the "text-embedding-ada-002" model
+			Expect(cm.ListConfigs()).To(ContainElements("text-embedding-ada-002"))
+
+			// the loaded configs should include the "rwkv_test" model
+			Expect(cm.ListConfigs()).To(ContainElements("rwkv_test"))
+
+			// the loaded configs should include the "whisper-1" model
+			Expect(cm.ListConfigs()).To(ContainElements("whisper-1"))
+		})
+	})
+})
--- a/api/config/prediction.go
+++ b/api/config/prediction.go
@@ -0,0 +1,50 @@
+package api_config
+
+// PredictionOptions collects the per-request prediction parameters. Most
+// fields carry both json and yaml tags; Language, N and Echo carry only a
+// json tag.
+type PredictionOptions struct {
+
+	// Also part of the OpenAI official spec
+	Model string `json:"model" yaml:"model"`
+
+	// Also part of the OpenAI official spec
+	Language string `json:"language"`
+
+	// Also part of the OpenAI official spec. Use it for returning multiple results
+	N int `json:"n"`
+
+	// Common options between all the API calls, part of the OpenAI spec
+	TopP        float64 `json:"top_p" yaml:"top_p"`
+	TopK        int     `json:"top_k" yaml:"top_k"`
+	Temperature float64 `json:"temperature" yaml:"temperature"`
+	Maxtokens   int     `json:"max_tokens" yaml:"max_tokens"`
+	Echo        bool    `json:"echo"`
+
+	// Custom parameters - not present in the OpenAI API
+	Batch         int     `json:"batch" yaml:"batch"`
+	F16           bool    `json:"f16" yaml:"f16"`
+	IgnoreEOS     bool    `json:"ignore_eos" yaml:"ignore_eos"`
+	RepeatPenalty float64 `json:"repeat_penalty" yaml:"repeat_penalty"`
+	Keep          int     `json:"n_keep" yaml:"n_keep"`
+
+	MirostatETA float64 `json:"mirostat_eta" yaml:"mirostat_eta"`
+	MirostatTAU float64 `json:"mirostat_tau" yaml:"mirostat_tau"`
+	Mirostat    int     `json:"mirostat" yaml:"mirostat"`
+
+	FrequencyPenalty float64 `json:"frequency_penalty" yaml:"frequency_penalty"`
+	TFZ              float64 `json:"tfz" yaml:"tfz"`
+
+	TypicalP float64 `json:"typical_p" yaml:"typical_p"`
+	Seed     int     `json:"seed" yaml:"seed"`
+
+	NegativePrompt      string  `json:"negative_prompt" yaml:"negative_prompt"`
+	RopeFreqBase        float32 `json:"rope_freq_base" yaml:"rope_freq_base"`
+	RopeFreqScale       float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"`
+	NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
+	// AutoGPTQ
+	UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`
+
+	// Diffusers
+	ClipSkip int `json:"clip_skip" yaml:"clip_skip"`
+
+	// RWKV (?)
+	Tokenizer string `json:"tokenizer" yaml:"tokenizer"`
+}
--- a/api/localai/backend_monitor.go
+++ b/api/localai/backend_monitor.go
@@ -0,0 +1,163 @@
+package localai
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/pkg/grpc/proto"
+
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
+
+	gopsutil "github.com/shirou/gopsutil/v3/process"
+)
+
+// BackendMonitorRequest is the request body accepted by the backend monitor
+// and shutdown endpoints.
+type BackendMonitorRequest struct {
+	Model string `json:"model" yaml:"model"`
+}
+
+// BackendMonitorResponse carries the process statistics sampled by
+// SampleLocalBackendProcess.
+type BackendMonitorResponse struct {
+	MemoryInfo    *gopsutil.MemoryInfoStat
+	MemoryPercent float32
+	CPUPercent    float64
+}
+
+// BackendMonitor resolves model names to loaded backends and reports on them.
+type BackendMonitor struct {
+	configLoader *config.ConfigLoader
+	options      *options.Option // Taking options in case we need to inspect ExternalGRPCBackends, though that's out of scope for now, hence the name.
+}
+
+// NewBackendMonitor builds a BackendMonitor from the given config registry
+// and application options.
+func NewBackendMonitor(configLoader *config.ConfigLoader, options *options.Option) BackendMonitor {
+	var monitor BackendMonitor
+	monitor.configLoader = configLoader
+	monitor.options = options
+	return monitor
+}
+
+// SampleLocalBackendProcess looks up the gRPC backend process serving model
+// and samples its memory and CPU usage through gopsutil.
+//
+// The backend id is the configured model file (or the raw model name when no
+// config exists), with ".bin" appended if missing. Every failure is logged
+// and returned.
+func (bm *BackendMonitor) SampleLocalBackendProcess(model string) (*BackendMonitorResponse, error) {
+	// Default to the raw name: a backend may happen to be registered under it.
+	backend := model
+	if cfg, found := bm.configLoader.GetConfig(model); found {
+		backend = cfg.Model
+	}
+	if !strings.HasSuffix(backend, ".bin") {
+		backend += ".bin"
+	}
+
+	pid, err := bm.options.Loader.GetGRPCPID(backend)
+	if err != nil {
+		log.Error().Msgf("model %s : failed to find pid %+v", model, err)
+		return nil, err
+	}
+
+	// NewProcess only attaches to an existing process by PID; nothing is spawned.
+	proc, err := gopsutil.NewProcess(int32(pid))
+	if err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting process info %+v", model, pid, err)
+		return nil, err
+	}
+
+	sample := &BackendMonitorResponse{}
+
+	if sample.MemoryInfo, err = proc.MemoryInfo(); err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting memory info %+v", model, pid, err)
+		return nil, err
+	}
+
+	if sample.MemoryPercent, err = proc.MemoryPercent(); err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting memory percent %+v", model, pid, err)
+		return nil, err
+	}
+
+	if sample.CPUPercent, err = proc.CPUPercent(); err != nil {
+		log.Error().Msgf("model %s [PID %d] : error getting cpu percent %+v", model, pid, err)
+		return nil, err
+	}
+
+	return sample, nil
+}
+
+// getModelLoaderIDFromCtx parses a BackendMonitorRequest from the request
+// body and turns its model name into a backend id: the configured model file
+// (or the raw name when no config exists), with ".bin" appended if missing.
+func (bm BackendMonitor) getModelLoaderIDFromCtx(c *fiber.Ctx) (string, error) {
+	var request BackendMonitorRequest
+	if err := c.BodyParser(&request); err != nil {
+		return "", err
+	}
+
+	// Default to the raw name: a backend may happen to be registered under it.
+	backendId := request.Model
+	if cfg, found := bm.configLoader.GetConfig(request.Model); found {
+		backendId = cfg.Model
+	}
+
+	if strings.HasSuffix(backendId, ".bin") {
+		return backendId, nil
+	}
+	return backendId + ".bin", nil
+}
+
+// BackendMonitorEndpoint returns a handler that reports the status of the
+// backend named in the request body.
+//
+// The backend must already be loaded. Its gRPC Status call is tried first;
+// if that fails, the handler falls back to sampling the local backend
+// process and answers with an ERROR state plus the sampled memory figures.
+func BackendMonitorEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+
+		backendId, err := bm.getModelLoaderIDFromCtx(c)
+		if err != nil {
+			return err
+		}
+
+		client := bm.options.Loader.CheckIsLoaded(backendId)
+
+		if client == nil {
+			return fmt.Errorf("backend %s is not currently loaded", backendId)
+		}
+
+		status, rpcErr := client.Status(context.TODO())
+		if rpcErr != nil {
+			log.Warn().Msgf("backend %s experienced an error retrieving status info: %s", backendId, rpcErr.Error())
+			val, slbErr := bm.SampleLocalBackendProcess(backendId)
+			if slbErr != nil {
+				return fmt.Errorf("backend %s experienced an error retrieving status info via rpc: %s, then failed local node process sample: %s", backendId, rpcErr.Error(), slbErr.Error())
+			}
+			// Hand the message over by pointer: generated protobuf messages
+			// embed state that must not be copied. The JSON output is the same.
+			return c.JSON(&proto.StatusResponse{
+				State: proto.StatusResponse_ERROR,
+				Memory: &proto.MemoryUsageData{
+					Total: val.MemoryInfo.VMS,
+					Breakdown: map[string]uint64{
+						"gopsutil-RSS": val.MemoryInfo.RSS,
+					},
+				},
+			})
+		}
+
+		return c.JSON(status)
+	}
+}
+
+// BackendShutdownEndpoint returns a handler that shuts down the backend
+// named in the request body.
+func BackendShutdownEndpoint(bm BackendMonitor) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		id, err := bm.getModelLoaderIDFromCtx(c)
+		if err == nil {
+			return bm.options.Loader.ShutdownModel(id)
+		}
+		return err
+	}
+}
--- a/api/localai/gallery.go
+++ b/api/localai/gallery.go
@@ -0,0 +1,320 @@
+package localai
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"slices"
+	"strings"
+	"sync"
+
+	json "github.com/json-iterator/go"
+	"gopkg.in/yaml.v3"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/pkg/gallery"
+	"github.com/go-skynet/LocalAI/pkg/utils"
+
+	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
+	"github.com/rs/zerolog/log"
+)
+
+// galleryOp is one queued install request handed to the galleryApplier worker.
+type galleryOp struct {
+	req         gallery.GalleryModel
+	id          string
+	galleries   []gallery.Gallery
+	galleryName string
+}
+
+// galleryOpStatus is the progress/outcome record of a galleryOp, as exposed
+// by the job status endpoints.
+// NOTE(review): Error is an error interface value; confirm it serialises to
+// something useful in JSON (Message already carries the error text).
+type galleryOpStatus struct {
+	FileName           string  `json:"file_name"`
+	Error              error   `json:"error"`
+	Processed          bool    `json:"processed"`
+	Message            string  `json:"message"`
+	Progress           float64 `json:"progress"`
+	TotalFileSize      string  `json:"file_size"`
+	DownloadedFileSize string  `json:"downloaded_size"`
+}
+
+// galleryApplier receives galleryOps on C, installs the requested models into
+// modelPath and tracks one status per operation id. The embedded mutex guards
+// statuses.
+type galleryApplier struct {
+	modelPath string
+	sync.Mutex
+	C        chan galleryOp
+	statuses map[string]*galleryOpStatus
+}
+
+// NewGalleryService creates a galleryApplier for modelPath with an unbuffered
+// operation channel and an empty status table. Call Start to begin
+// processing operations.
+func NewGalleryService(modelPath string) *galleryApplier {
+	applier := new(galleryApplier)
+	applier.modelPath = modelPath
+	applier.C = make(chan galleryOp)
+	applier.statuses = map[string]*galleryOpStatus{}
+	return applier
+}
+
+// prepareModel fetches the gallery config at req.URL, appends the request's
+// additional files to it and installs the model into modelPath, reporting
+// download progress through downloadStatus.
+//
+// NOTE(review): cm is accepted but not used here.
+func prepareModel(modelPath string, req gallery.GalleryModel, cm *config.ConfigLoader, downloadStatus func(string, string, string, float64)) error {
+	galleryCfg, err := gallery.GetGalleryConfigFromURL(req.URL)
+	if err != nil {
+		return err
+	}
+
+	galleryCfg.Files = append(galleryCfg.Files, req.AdditionalFiles...)
+	return gallery.InstallModel(modelPath, req.Name, &galleryCfg, req.Overrides, downloadStatus)
+}
+
+// updateStatus replaces the status stored for operation id s with op, under
+// the applier's lock.
+func (g *galleryApplier) updateStatus(s string, op *galleryOpStatus) {
+	g.Lock()
+	defer g.Unlock()
+	g.statuses[s] = op
+}
+
+// getStatus returns the status stored for operation id s, or nil when the id
+// is unknown.
+func (g *galleryApplier) getStatus(s string) *galleryOpStatus {
+	g.Lock()
+	defer g.Unlock()
+
+	return g.statuses[s]
+}
+
+// getAllStatus returns a snapshot of all operation statuses keyed by
+// operation id.
+//
+// A copy of the map is returned rather than the map itself: the caller reads
+// it (e.g. to marshal JSON) after the lock is released, while the worker may
+// be writing to the original through updateStatus. The status values are
+// shared, which is safe because updateStatus swaps in new values instead of
+// mutating stored ones.
+func (g *galleryApplier) getAllStatus() map[string]*galleryOpStatus {
+	g.Lock()
+	defer g.Unlock()
+
+	snapshot := make(map[string]*galleryOpStatus, len(g.statuses))
+	for id, status := range g.statuses {
+		snapshot[id] = status
+	}
+	return snapshot
+}
+
+// Start launches the background worker that consumes operations from g.C
+// until c is cancelled. For every operation it installs the requested model,
+// reloads the configs from the models directory and records progress and the
+// final outcome under the operation id.
+func (g *galleryApplier) Start(c context.Context, cm *config.ConfigLoader) {
+	// handle runs one operation to completion and records its outcome.
+	handle := func(op galleryOp) {
+		utils.ResetDownloadTimers()
+
+		g.updateStatus(op.id, &galleryOpStatus{Message: "processing", Progress: 0})
+
+		// fail marks the operation as finished with an error.
+		fail := func(e error) {
+			g.updateStatus(op.id, &galleryOpStatus{Error: e, Processed: true, Message: "error: " + e.Error()})
+		}
+
+		// onProgress records download progress and forwards it to the console display.
+		onProgress := func(fileName string, current string, total string, percentage float64) {
+			g.updateStatus(op.id, &galleryOpStatus{Message: "processing", FileName: fileName, Progress: percentage, TotalFileSize: total, DownloadedFileSize: current})
+			utils.DisplayDownloadFunction(fileName, current, total, percentage)
+		}
+
+		var err error
+		switch {
+		case op.galleryName == "":
+			// no gallery reference: install straight from the request
+			err = prepareModel(g.modelPath, op.req, cm, onProgress)
+		case strings.Contains(op.galleryName, "@"):
+			err = gallery.InstallModelFromGallery(op.galleries, op.galleryName, g.modelPath, op.req, onProgress)
+		default:
+			err = gallery.InstallModelFromGalleryByName(op.galleries, op.galleryName, g.modelPath, op.req, onProgress)
+		}
+		if err != nil {
+			fail(err)
+			return
+		}
+
+		// Reload models
+		if err = cm.LoadConfigs(g.modelPath); err != nil {
+			fail(err)
+			return
+		}
+
+		g.updateStatus(op.id, &galleryOpStatus{Processed: true, Message: "completed", Progress: 100})
+	}
+
+	go func() {
+		for {
+			select {
+			case <-c.Done():
+				return
+			case op := <-g.C:
+				handle(op)
+			}
+		}
+	}()
+}
+
+// galleryModel is one entry of a gallery request list (see
+// ApplyGalleryFromFile / ApplyGalleryFromString): a gallery model definition
+// plus an optional gallery ID to install from.
+type galleryModel struct {
+	gallery.GalleryModel `yaml:",inline"` // https://github.com/go-yaml/yaml/issues/63
+	ID                   string           `json:"id"`
+}
+
+// processRequests installs every requested model into modelPath.
+//
+// A request without an ID is installed from its own definition; an ID
+// containing "@" is resolved with InstallModelFromGallery, any other ID with
+// InstallModelFromGalleryByName.
+//
+// All requests are attempted even if some fail. The returned error is nil
+// when everything succeeded, the single error when exactly one request
+// failed, and otherwise an error wrapping every failure (so errors.Is/As
+// still match each of them). Previously only the outcome of the last request
+// was reported and earlier failures were lost.
+func processRequests(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery, requests []galleryModel) error {
+	var combined error
+	for _, r := range requests {
+		utils.ResetDownloadTimers()
+
+		var err error
+		if r.ID == "" {
+			err = prepareModel(modelPath, r.GalleryModel, cm, utils.DisplayDownloadFunction)
+		} else {
+			if strings.Contains(r.ID, "@") {
+				err = gallery.InstallModelFromGallery(
+					galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
+			} else {
+				err = gallery.InstallModelFromGalleryByName(
+					galleries, r.ID, modelPath, r.GalleryModel, utils.DisplayDownloadFunction)
+			}
+		}
+
+		switch {
+		case err == nil:
+			// nothing to record
+		case combined == nil:
+			combined = err
+		default:
+			combined = fmt.Errorf("%w; %w", combined, err)
+		}
+	}
+	return combined
+}
+
+// ApplyGalleryFromFile reads the YAML file at path s as a list of gallery
+// model requests and installs them into modelPath.
+func ApplyGalleryFromFile(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
+	raw, err := os.ReadFile(s)
+	if err != nil {
+		return err
+	}
+
+	var requests []galleryModel
+	if err = yaml.Unmarshal(raw, &requests); err != nil {
+		return err
+	}
+	return processRequests(modelPath, s, cm, galleries, requests)
+}
+
+// ApplyGalleryFromString parses s as a JSON list of gallery model requests
+// and installs them into modelPath.
+func ApplyGalleryFromString(modelPath, s string, cm *config.ConfigLoader, galleries []gallery.Gallery) error {
+	var requests []galleryModel
+	if err := json.Unmarshal([]byte(s), &requests); err != nil {
+		return err
+	}
+	return processRequests(modelPath, s, cm, galleries, requests)
+}
+
+/// Endpoint Service
+
+// ModelGalleryService backs the gallery HTTP endpoints: it holds the list of
+// registered galleries, the models directory and the applier that performs
+// installs.
+type ModelGalleryService struct {
+	galleries      []gallery.Gallery
+	modelPath      string
+	galleryApplier *galleryApplier
+}
+
+// GalleryModel is the request body of the apply endpoint: a gallery model
+// definition plus an optional gallery ID to install from.
+type GalleryModel struct {
+	ID string `json:"id"`
+	gallery.GalleryModel
+}
+
+// CreateModelGalleryService assembles a ModelGalleryService from its parts.
+func CreateModelGalleryService(galleries []gallery.Gallery, modelPath string, galleryApplier *galleryApplier) ModelGalleryService {
+	var service ModelGalleryService
+	service.galleries = galleries
+	service.modelPath = modelPath
+	service.galleryApplier = galleryApplier
+	return service
+}
+
+// GetOpStatusEndpoint returns a handler that answers with the status of the
+// operation whose id is given in the "uuid" route parameter, or an error when
+// no such operation is known.
+func (mgs *ModelGalleryService) GetOpStatusEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		if st := mgs.galleryApplier.getStatus(c.Params("uuid")); st != nil {
+			return c.JSON(st)
+		}
+		return fmt.Errorf("could not find any status for ID")
+	}
+}
+
+// GetAllStatusEndpoint returns a handler that answers with the statuses of
+// all known operations, keyed by operation id.
+func (mgs *ModelGalleryService) GetAllStatusEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		return c.JSON(mgs.galleryApplier.getAllStatus())
+	}
+}
+
+// ApplyModelGalleryEndpoint returns a handler that queues a model install.
+// It parses a GalleryModel from the body, hands it to the applier under a
+// fresh job id and answers with that id and the URL to poll for status.
+//
+// The send on the applier channel blocks until the worker picks the job up.
+func (mgs *ModelGalleryService) ApplyModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	type reply struct {
+		ID        string `json:"uuid"`
+		StatusURL string `json:"status"`
+	}
+
+	return func(c *fiber.Ctx) error {
+		var request GalleryModel
+		if err := c.BodyParser(&request); err != nil {
+			return err
+		}
+
+		generated, err := uuid.NewUUID()
+		if err != nil {
+			return err
+		}
+		jobID := generated.String()
+
+		mgs.galleryApplier.C <- galleryOp{
+			req:         request.GalleryModel,
+			id:          jobID,
+			galleryName: request.ID,
+			galleries:   mgs.galleries,
+		}
+
+		return c.JSON(reply{ID: jobID, StatusURL: c.BaseURL() + "/models/jobs/" + jobID})
+	}
+}
+
+// ListModelFromGalleryEndpoint returns a handler that answers with the
+// models available across all registered galleries, as JSON.
+func (mgs *ModelGalleryService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
+
+		available, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
+		if err != nil {
+			return err
+		}
+
+		log.Debug().Msgf("Models found from galleries: %+v", available)
+		for i := range available {
+			log.Debug().Msgf("Model found from galleries: %+v", available[i])
+		}
+
+		payload, err := json.Marshal(available)
+		if err != nil {
+			return err
+		}
+		return c.Send(payload)
+	}
+}
+
+// ListModelGalleriesEndpoint returns a handler that answers with the
+// registered galleries themselves, as JSON.
+// NOTE: unlike ListModelFromGalleryEndpoint this does not list the models
+// inside the galleries, only which galleries have been loaded.
+func (mgs *ModelGalleryService) ListModelGalleriesEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		log.Debug().Msgf("Listing model galleries %+v", mgs.galleries)
+
+		payload, err := json.Marshal(mgs.galleries)
+		if err == nil {
+			return c.Send(payload)
+		}
+		return err
+	}
+}
+
+// AddModelGalleryEndpoint returns a handler that registers the gallery given
+// in the request body and answers with the resulting gallery list as JSON.
+// A gallery whose name is already registered is rejected.
+//
+// The response is built from the list including the new gallery (previously
+// it was marshalled before the append and so never showed the addition), and
+// the list is only updated once marshalling has succeeded.
+//
+// NOTE(review): mgs.galleries is modified without any locking — confirm that
+// concurrent add/remove requests cannot reach this handler.
+func (mgs *ModelGalleryService) AddModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		input := new(gallery.Gallery)
+		// Get input data from the request body
+		if err := c.BodyParser(input); err != nil {
+			return err
+		}
+		if slices.ContainsFunc(mgs.galleries, func(gallery gallery.Gallery) bool {
+			return gallery.Name == input.Name
+		}) {
+			return fmt.Errorf("%s already exists", input.Name)
+		}
+
+		log.Debug().Msgf("Adding %+v to gallery list", *input)
+		updated := append(mgs.galleries, *input)
+		dat, err := json.Marshal(updated)
+		if err != nil {
+			return err
+		}
+		mgs.galleries = updated
+		return c.Send(dat)
+	}
+}
+
+// RemoveModelGalleryEndpoint returns a handler that unregisters every gallery
+// whose name matches the one in the request body. It answers with an empty
+// body on success and an error when no gallery has that name.
+func (mgs *ModelGalleryService) RemoveModelGalleryEndpoint() func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var target gallery.Gallery
+		if err := c.BodyParser(&target); err != nil {
+			return err
+		}
+
+		sameName := func(g gallery.Gallery) bool {
+			return g.Name == target.Name
+		}
+
+		// If nothing matches, DeleteFunc leaves the list untouched, so the
+		// unchanged length doubles as the "not registered" check.
+		before := len(mgs.galleries)
+		mgs.galleries = slices.DeleteFunc(mgs.galleries, sameName)
+		if len(mgs.galleries) == before {
+			return fmt.Errorf("%s is not currently registered", target.Name)
+		}
+		return c.Send(nil)
+	}
+}
--- a/api/localai/localai.go
+++ b/api/localai/localai.go
@@ -0,0 +1,32 @@
+package localai
+
+import (
+	"github.com/go-skynet/LocalAI/api/backend"
+	config "github.com/go-skynet/LocalAI/api/config"
+
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/gofiber/fiber/v2"
+)
+
+// TTSRequest is the request body of the TTS endpoint: the text to speak
+// (Input), the model file to use and the backend to run it with.
+type TTSRequest struct {
+	Model   string `json:"model" yaml:"model"`
+	Input   string `json:"input" yaml:"input"`
+	Backend string `json:"backend" yaml:"backend"`
+}
+
+// TTSEndpoint returns a handler that synthesises the request's input text
+// with backend.ModelTTS and sends the generated audio file as a download.
+//
+// NOTE(review): cm is accepted but not used here.
+func TTSEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		var request TTSRequest
+		if err := c.BodyParser(&request); err != nil {
+			return err
+		}
+
+		audioPath, _, err := backend.ModelTTS(request.Backend, request.Input, request.Model, o.Loader, o)
+		if err == nil {
+			return c.Download(audioPath)
+		}
+		return err
+	}
+}
--- a/api/openai/chat.go
+++ b/api/openai/chat.go
@@ -0,0 +1,393 @@
+package openai
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/go-skynet/LocalAI/api/backend"
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/go-skynet/LocalAI/pkg/grammar"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/pkg/utils"
+	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
+	"github.com/rs/zerolog/log"
+	"github.com/valyala/fasthttp"
+)
+
+// ChatEndpoint returns the fiber handler serving OpenAI-style chat completion
+// requests.
+//
+// The handler reads the request and the model configuration, optionally builds
+// a grammar out of the requested functions, renders the messages into a single
+// prompt and then either streams the answer as server-sent events or returns it
+// as one JSON document. Functions are not processed in stream mode.
+//
+// Each request gets its own response ID and creation timestamp, and the
+// per-request context stored in the parsed request is cancelled once the
+// request has been fully served.
+func ChatEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+	emptyMessage := ""
+
+	// process runs the inference for a streaming request: it pushes an initial
+	// (empty) assistant chunk followed by one chunk per token callback on
+	// responses, and closes the channel once the inference is over.
+	process := func(id string, created int, s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
+		initialMessage := schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Choices: []schema.Choice{{Delta: &schema.Message{Role: "assistant", Content: &emptyMessage}}},
+			Object:  "chat.completion.chunk",
+		}
+		responses <- initialMessage
+
+		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
+			resp := schema.OpenAIResponse{
+				ID:      id,
+				Created: created,
+				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
+				Choices: []schema.Choice{{Delta: &schema.Message{Content: &s}, Index: 0}},
+				Object:  "chat.completion.chunk",
+				Usage: schema.OpenAIUsage{
+					PromptTokens:     usage.Prompt,
+					CompletionTokens: usage.Completion,
+					TotalTokens:      usage.Prompt + usage.Completion,
+				},
+			}
+
+			responses <- resp
+			return true
+		})
+		close(responses)
+	}
+	return func(c *fiber.Ctx) error {
+		// Identifier and timestamp belong to the request, not to the endpoint:
+		// computing them here keeps responses from sharing the same values.
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+
+		processFunctions := false
+		funcs := grammar.Functions{}
+		modelFile, input, err := readInput(c, o, true)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		// Release the request context when the handler returns, unless the
+		// stream writer below took ownership of it. The cancel function is
+		// captured now because input is reassigned by readConfig.
+		cancel := input.Cancel
+		handedOff := false
+		defer func() {
+			if !handedOff {
+				cancel()
+			}
+		}()
+
+		config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+		log.Debug().Msgf("Configuration read: %+v", config)
+
+		// Allow the user to set custom actions via config file
+		// to be "embedded" in each model
+		noActionName := "answer"
+		noActionDescription := "use this action to answer without performing any action"
+
+		if config.FunctionsConfig.NoActionFunctionName != "" {
+			noActionName = config.FunctionsConfig.NoActionFunctionName
+		}
+		if config.FunctionsConfig.NoActionDescriptionName != "" {
+			noActionDescription = config.FunctionsConfig.NoActionDescriptionName
+		}
+
+		if input.ResponseFormat == "json_object" {
+			input.Grammar = grammar.JSONBNF
+		}
+
+		// process functions if we have any defined or if we have a function call string
+		if len(input.Functions) > 0 && config.ShouldUseFunctions() {
+			log.Debug().Msgf("Response needs to process functions")
+
+			processFunctions = true
+
+			noActionGrammar := grammar.Function{
+				Name:        noActionName,
+				Description: noActionDescription,
+				Parameters: map[string]interface{}{
+					"properties": map[string]interface{}{
+						"message": map[string]interface{}{
+							"type":        "string",
+							"description": "The message to reply the user with",
+						}},
+				},
+			}
+
+			// Append the no action function
+			funcs = append(funcs, input.Functions...)
+			if !config.FunctionsConfig.DisableNoAction {
+				funcs = append(funcs, noActionGrammar)
+			}
+
+			// Force picking one of the functions by the request
+			if config.FunctionToCall() != "" {
+				funcs = funcs.Select(config.FunctionToCall())
+			}
+
+			// Update input grammar
+			jsStruct := funcs.ToJSONStructure()
+			config.Grammar = jsStruct.Grammar("")
+		} else if input.JSONFunctionGrammarObject != nil {
+			config.Grammar = input.JSONFunctionGrammarObject.Grammar("")
+		}
+
+		// functions are not supported in stream mode (yet?)
+		toStream := input.Stream && !processFunctions
+
+		log.Debug().Msgf("Parameters: %+v", config)
+
+		var predInput string
+
+		suppressConfigSystemPrompt := false
+		mess := []string{}
+		for messageIndex, i := range input.Messages {
+			var content string
+			role := i.Role
+
+			// if function call, we might want to customize the role so we can display better that the "assistant called a json action"
+			// if an "assistant_function_call" role is defined, we use it, otherwise we use the role that is passed by in the request
+			if i.FunctionCall != nil && i.Role == "assistant" {
+				roleFn := "assistant_function_call"
+				r := config.Roles[roleFn]
+				if r != "" {
+					role = roleFn
+				}
+			}
+			r := config.Roles[role]
+			contentExists := i.Content != nil && i.StringContent != ""
+			// First attempt to populate content via a chat message specific template
+			if config.TemplateConfig.ChatMessage != "" {
+				chatMessageData := model.ChatMessageTemplateData{
+					SystemPrompt: config.SystemPrompt,
+					Role:         r,
+					RoleName:     role,
+					Content:      i.StringContent,
+					MessageIndex: messageIndex,
+				}
+				templatedChatMessage, err := o.Loader.EvaluateTemplateForChatMessage(config.TemplateConfig.ChatMessage, chatMessageData)
+				if err != nil {
+					log.Error().Msgf("error processing message %+v using template \"%s\": %v. Skipping!", chatMessageData, config.TemplateConfig.ChatMessage, err)
+				} else {
+					if templatedChatMessage == "" {
+						log.Warn().Msgf("template \"%s\" produced blank output for %+v. Skipping!", config.TemplateConfig.ChatMessage, chatMessageData)
+						continue // TODO: This continue is here intentionally to skip over the line `mess = append(mess, content)` below, and to prevent the sprintf
+					}
+					log.Debug().Msgf("templated message for chat: %s", templatedChatMessage)
+					content = templatedChatMessage
+				}
+			}
+			// If this model doesn't have such a template, or if that template fails to return a value, template at the message level.
+			if content == "" {
+				if r != "" {
+					if contentExists {
+						content = fmt.Sprint(r, i.StringContent)
+					}
+					if i.FunctionCall != nil {
+						j, err := json.Marshal(i.FunctionCall)
+						if err == nil {
+							if contentExists {
+								content += "\n" + fmt.Sprint(r, " ", string(j))
+							} else {
+								content = fmt.Sprint(r, " ", string(j))
+							}
+						}
+					}
+				} else {
+					if contentExists {
+						content = fmt.Sprint(i.StringContent)
+					}
+					if i.FunctionCall != nil {
+						j, err := json.Marshal(i.FunctionCall)
+						if err == nil {
+							if contentExists {
+								content += "\n" + string(j)
+							} else {
+								content = string(j)
+							}
+						}
+					}
+				}
+				// Special Handling: System. We care if it was printed at all, not the r branch, so check separately
+				if contentExists && role == "system" {
+					suppressConfigSystemPrompt = true
+				}
+			}
+
+			mess = append(mess, content)
+		}
+
+		predInput = strings.Join(mess, "\n")
+		log.Debug().Msgf("Prompt (before templating): %s", predInput)
+
+		if toStream {
+			log.Debug().Msgf("Stream request received")
+			c.Context().SetContentType("text/event-stream")
+			//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
+			//	c.Set("Content-Type", "text/event-stream")
+			c.Set("Cache-Control", "no-cache")
+			c.Set("Connection", "keep-alive")
+			c.Set("Transfer-Encoding", "chunked")
+		}
+
+		templateFile := config.Model
+
+		if config.TemplateConfig.Chat != "" && !processFunctions {
+			templateFile = config.TemplateConfig.Chat
+		}
+
+		if config.TemplateConfig.Functions != "" && processFunctions {
+			templateFile = config.TemplateConfig.Functions
+		}
+
+		// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+		templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.ChatPromptTemplate, templateFile, model.PromptTemplateData{
+			SystemPrompt:         config.SystemPrompt,
+			SuppressSystemPrompt: suppressConfigSystemPrompt,
+			Input:                predInput,
+			Functions:            funcs,
+		})
+		if err == nil {
+			predInput = templatedInput
+			log.Debug().Msgf("Template found, input modified to: %s", predInput)
+		} else {
+			log.Debug().Msgf("Template failed loading: %s", err.Error())
+		}
+
+		log.Debug().Msgf("Prompt (after templating): %s", predInput)
+		if processFunctions {
+			log.Debug().Msgf("Grammar: %+v", config.Grammar)
+		}
+
+		if toStream {
+			responses := make(chan schema.OpenAIResponse)
+
+			go process(id, created, predInput, input, config, o.Loader, responses)
+
+			// From here on the stream writer is responsible for cancelling the context.
+			handedOff = true
+			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
+				defer cancel()
+
+				usage := schema.OpenAIUsage{}
+
+				for ev := range responses {
+					usage = ev.Usage // Remember the latest usage chunk so that the stop message can report it
+					var buf bytes.Buffer
+					enc := json.NewEncoder(&buf)
+					if err := enc.Encode(ev); err != nil {
+						log.Error().Msgf("Encoding chunk failed: %v", err)
+						continue
+					}
+					log.Debug().Msgf("Sending chunk: %s", buf.String())
+					_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
+					if err == nil {
+						err = w.Flush()
+					}
+					if err != nil {
+						log.Debug().Msgf("Sending chunk failed: %v", err)
+						cancel()
+						// The channel is unbuffered: keep receiving until the producer closes it,
+						// otherwise its goroutine would stay blocked on a send forever.
+						for range responses {
+						}
+						return
+					}
+				}
+
+				resp := &schema.OpenAIResponse{
+					ID:      id,
+					Created: created,
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					Choices: []schema.Choice{
+						{
+							FinishReason: "stop",
+							Index:        0,
+							Delta:        &schema.Message{Content: &emptyMessage},
+						}},
+					Object: "chat.completion.chunk",
+					Usage:  usage,
+				}
+				respData, _ := json.Marshal(resp)
+
+				w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
+				w.WriteString("data: [DONE]\n\n")
+				w.Flush()
+			}))
+			return nil
+		}
+
+		result, tokenUsage, err := ComputeChoices(input, predInput, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+			if processFunctions {
+				// As we have to change the result before processing, we can't stream the answer (yet?)
+				ss := map[string]interface{}{}
+				// This prevent newlines to break JSON parsing for clients
+				s = utils.EscapeNewLines(s)
+				if err := json.Unmarshal([]byte(s), &ss); err != nil {
+					// Best effort: carry on with whatever could be decoded, but leave a trace.
+					log.Error().Msgf("Function return is not valid JSON: %v", err)
+				}
+				log.Debug().Msgf("Function return: %s %+v", s, ss)
+
+				// The grammar defines the function name as "function", while OpenAI returns "name"
+				func_name := ss["function"]
+				// Similarly, while here arguments is a map[string]interface{}, OpenAI actually want a stringified object
+				args := ss["arguments"] // arguments needs to be a string, but we return an object from the grammar result (TODO: fix)
+				d, _ := json.Marshal(args)
+
+				ss["arguments"] = string(d)
+				ss["name"] = func_name
+
+				// if do nothing, reply with a message
+				if func_name == noActionName {
+					log.Debug().Msgf("nothing to do, computing a reply")
+
+					// If there is a message that the LLM already sends as part of the JSON reply, use it
+					arguments := map[string]interface{}{}
+					json.Unmarshal([]byte(d), &arguments)
+					m, exists := arguments["message"]
+					if exists {
+						switch message := m.(type) {
+						case string:
+							if message != "" {
+								log.Debug().Msgf("Reply received from LLM: %s", message)
+								message = backend.Finetune(*config, predInput, message)
+								log.Debug().Msgf("Reply received from LLM(finetuned): %s", message)
+
+								*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &message}})
+								return
+							}
+						}
+					}
+
+					log.Debug().Msgf("No action received from LLM, without a message, computing a reply")
+					// Otherwise ask the LLM to understand the JSON output and the context, and return a message
+					// Note: This costs (in term of CPU) another computation
+					config.Grammar = ""
+					images := []string{}
+					for _, m := range input.Messages {
+						images = append(images, m.StringImages...)
+					}
+					predFunc, err := backend.ModelInference(input.Context, predInput, images, o.Loader, *config, o, nil)
+					if err != nil {
+						log.Error().Msgf("inference error: %s", err.Error())
+						return
+					}
+
+					prediction, err := predFunc()
+					if err != nil {
+						log.Error().Msgf("inference error: %s", err.Error())
+						return
+					}
+
+					fineTunedResponse := backend.Finetune(*config, predInput, prediction.Response)
+					*c = append(*c, schema.Choice{Message: &schema.Message{Role: "assistant", Content: &fineTunedResponse}})
+				} else {
+					// otherwise reply with the function call
+					*c = append(*c, schema.Choice{
+						FinishReason: "function_call",
+						Message:      &schema.Message{Role: "assistant", FunctionCall: ss},
+					})
+				}
+
+				return
+			}
+			*c = append(*c, schema.Choice{FinishReason: "stop", Index: 0, Message: &schema.Message{Role: "assistant", Content: &s}})
+		}, nil)
+		if err != nil {
+			return err
+		}
+
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Choices: result,
+			Object:  "chat.completion",
+			Usage: schema.OpenAIUsage{
+				PromptTokens:     tokenUsage.Prompt,
+				CompletionTokens: tokenUsage.Completion,
+				TotalTokens:      tokenUsage.Prompt + tokenUsage.Completion,
+			},
+		}
+		respData, _ := json.Marshal(resp)
+		log.Debug().Msgf("Response: %s", respData)
+
+		// Return the prediction in the response body
+		return c.JSON(resp)
+	}
+}
--- a/api/openai/completion.go
+++ b/api/openai/completion.go
@@ -0,0 +1,191 @@
+package openai
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/go-skynet/LocalAI/api/backend"
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/go-skynet/LocalAI/pkg/grammar"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
+	"github.com/rs/zerolog/log"
+	"github.com/valyala/fasthttp"
+)
+
+// https://platform.openai.com/docs/api-reference/completions
+// CompletionEndpoint returns the fiber handler serving OpenAI-style text
+// completion requests.
+//
+// In stream mode exactly one prompt is accepted and the answer is sent as
+// server-sent events, one chunk per token callback, followed by a stop chunk
+// and a "[DONE]" marker. Otherwise every prompt is completed in turn and the
+// choices are returned together with the accumulated token usage.
+//
+// Each request gets its own response ID and creation timestamp, and the
+// per-request context stored in the parsed request is cancelled once the
+// request has been fully served.
+func CompletionEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+	// process runs the inference for a streaming request, pushing one chunk per
+	// token callback on responses and closing the channel when done.
+	process := func(id string, created int, s string, req *schema.OpenAIRequest, config *config.Config, loader *model.ModelLoader, responses chan schema.OpenAIResponse) {
+		ComputeChoices(req, s, config, o, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
+			resp := schema.OpenAIResponse{
+				ID:      id,
+				Created: created,
+				Model:   req.Model, // we have to return what the user sent here, due to OpenAI spec.
+				Choices: []schema.Choice{
+					{
+						Index: 0,
+						Text:  s,
+					},
+				},
+				Object: "text_completion",
+				Usage: schema.OpenAIUsage{
+					PromptTokens:     usage.Prompt,
+					CompletionTokens: usage.Completion,
+					TotalTokens:      usage.Prompt + usage.Completion,
+				},
+			}
+			log.Debug().Msgf("Sending goroutine: %s", s)
+
+			responses <- resp
+			return true
+		})
+		close(responses)
+	}
+
+	return func(c *fiber.Ctx) error {
+		// Identifier and timestamp belong to the request, not to the endpoint:
+		// computing them here keeps responses from sharing the same values.
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+
+		modelFile, input, err := readInput(c, o, true)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		// Release the request context when the handler returns, unless the
+		// stream writer below took ownership of it. The cancel function is
+		// captured now because input is reassigned by readConfig.
+		cancel := input.Cancel
+		handedOff := false
+		defer func() {
+			if !handedOff {
+				cancel()
+			}
+		}()
+
+		log.Debug().Msgf("`input`: %+v", input)
+
+		config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		if input.ResponseFormat == "json_object" {
+			input.Grammar = grammar.JSONBNF
+		}
+
+		log.Debug().Msgf("Parameter Config: %+v", config)
+
+		if input.Stream {
+			log.Debug().Msgf("Stream request received")
+			c.Context().SetContentType("text/event-stream")
+			//c.Response().Header.SetContentType(fiber.MIMETextHTMLCharsetUTF8)
+			//c.Set("Content-Type", "text/event-stream")
+			c.Set("Cache-Control", "no-cache")
+			c.Set("Connection", "keep-alive")
+			c.Set("Transfer-Encoding", "chunked")
+		}
+
+		templateFile := config.Model
+
+		if config.TemplateConfig.Completion != "" {
+			templateFile = config.TemplateConfig.Completion
+		}
+
+		if input.Stream {
+			if len(config.PromptStrings) > 1 {
+				return errors.New("cannot handle more than 1 `PromptStrings` when Streaming")
+			}
+			// Indexing an empty slice below would panic: reject the request instead.
+			if len(config.PromptStrings) == 0 {
+				return errors.New("cannot stream a completion without a prompt")
+			}
+
+			predInput := config.PromptStrings[0]
+
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			// NOTE(review): unlike the non-streaming path below, SystemPrompt is not passed to the template here — confirm this is intended.
+			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
+				Input: predInput,
+			})
+			if err == nil {
+				predInput = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", predInput)
+			}
+
+			responses := make(chan schema.OpenAIResponse)
+
+			go process(id, created, predInput, input, config, o.Loader, responses)
+
+			// From here on the stream writer is responsible for cancelling the context.
+			handedOff = true
+			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
+				defer cancel()
+
+				for ev := range responses {
+					var buf bytes.Buffer
+					enc := json.NewEncoder(&buf)
+					if err := enc.Encode(ev); err != nil {
+						log.Error().Msgf("Encoding chunk failed: %v", err)
+						continue
+					}
+
+					log.Debug().Msgf("Sending chunk: %s", buf.String())
+					_, err := fmt.Fprintf(w, "data: %v\n", buf.String())
+					if err == nil {
+						err = w.Flush()
+					}
+					if err != nil {
+						log.Debug().Msgf("Sending chunk failed: %v", err)
+						cancel()
+						// The channel is unbuffered: keep receiving until the producer closes it,
+						// otherwise its goroutine would stay blocked on a send forever.
+						for range responses {
+						}
+						return
+					}
+				}
+
+				resp := &schema.OpenAIResponse{
+					ID:      id,
+					Created: created,
+					Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+					Choices: []schema.Choice{
+						{
+							Index:        0,
+							FinishReason: "stop",
+						},
+					},
+					Object: "text_completion",
+				}
+				respData, _ := json.Marshal(resp)
+
+				w.WriteString(fmt.Sprintf("data: %s\n\n", respData))
+				w.WriteString("data: [DONE]\n\n")
+				w.Flush()
+			}))
+			return nil
+		}
+
+		var result []schema.Choice
+
+		totalTokenUsage := backend.TokenUsage{}
+
+		for k, i := range config.PromptStrings {
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.CompletionPromptTemplate, templateFile, model.PromptTemplateData{
+				SystemPrompt: config.SystemPrompt,
+				Input:        i,
+			})
+			if err == nil {
+				i = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", i)
+			}
+
+			r, tokenUsage, err := ComputeChoices(
+				input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+					*c = append(*c, schema.Choice{Text: s, FinishReason: "stop", Index: k})
+				}, nil)
+			if err != nil {
+				return err
+			}
+
+			totalTokenUsage.Prompt += tokenUsage.Prompt
+			totalTokenUsage.Completion += tokenUsage.Completion
+
+			result = append(result, r...)
+		}
+
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Choices: result,
+			Object:  "text_completion",
+			Usage: schema.OpenAIUsage{
+				PromptTokens:     totalTokenUsage.Prompt,
+				CompletionTokens: totalTokenUsage.Completion,
+				TotalTokens:      totalTokenUsage.Prompt + totalTokenUsage.Completion,
+			},
+		}
+
+		jsonResult, _ := json.Marshal(resp)
+		log.Debug().Msgf("Response: %s", jsonResult)
+
+		// Return the prediction in the response body
+		return c.JSON(resp)
+	}
+}
--- a/api/openai/edit.go
+++ b/api/openai/edit.go
@@ -0,0 +1,88 @@
+package openai
+
+import (
+	"encoding/json"
+	"fmt"
+	"time"
+
+	"github.com/go-skynet/LocalAI/api/backend"
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/gofiber/fiber/v2"
+	"github.com/google/uuid"
+
+	"github.com/rs/zerolog/log"
+)
+
+// EditEndpoint returns the fiber handler serving edit requests.
+//
+// Every entry of the configuration's InputStrings is rendered through the edit
+// prompt template (together with the request instruction and the configured
+// system prompt) when such a template can be evaluated, then completed with
+// ComputeChoices. The resulting choices and the accumulated token usage are
+// returned as a single JSON response of object type "edit".
+func EditEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		modelFile, input, err := readInput(c, o, true)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+		// readInput attaches a cancellable context to the request: release it
+		// once the handler is done so it does not pile up on the parent context.
+		defer input.Cancel()
+
+		config, input, err := readConfig(modelFile, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		log.Debug().Msgf("Parameter Config: %+v", config)
+
+		templateFile := config.Model
+
+		if config.TemplateConfig.Edit != "" {
+			templateFile = config.TemplateConfig.Edit
+		}
+
+		var result []schema.Choice
+		totalTokenUsage := backend.TokenUsage{}
+
+		for _, i := range config.InputStrings {
+			// A model can have a "file.bin.tmpl" file associated with a prompt template prefix
+			templatedInput, err := o.Loader.EvaluateTemplateForPrompt(model.EditPromptTemplate, templateFile, model.PromptTemplateData{
+				Input:        i,
+				Instruction:  input.Instruction,
+				SystemPrompt: config.SystemPrompt,
+			})
+			// A template that cannot be evaluated is not an error: the raw input is used instead
+			if err == nil {
+				i = templatedInput
+				log.Debug().Msgf("Template found, input modified to: %s", i)
+			}
+
+			r, tokenUsage, err := ComputeChoices(input, i, config, o, o.Loader, func(s string, c *[]schema.Choice) {
+				*c = append(*c, schema.Choice{Text: s})
+			}, nil)
+			if err != nil {
+				return err
+			}
+
+			totalTokenUsage.Prompt += tokenUsage.Prompt
+			totalTokenUsage.Completion += tokenUsage.Completion
+
+			result = append(result, r...)
+		}
+
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Choices: result,
+			Object:  "edit",
+			Usage: schema.OpenAIUsage{
+				PromptTokens:     totalTokenUsage.Prompt,
+				CompletionTokens: totalTokenUsage.Completion,
+				TotalTokens:      totalTokenUsage.Prompt + totalTokenUsage.Completion,
+			},
+		}
+
+		jsonResult, _ := json.Marshal(resp)
+		log.Debug().Msgf("Response: %s", jsonResult)
+
+		// Return the prediction in the response body
+		return c.JSON(resp)
+	}
+}
--- a/api/openai/embeddings.go
+++ b/api/openai/embeddings.go
@@ -0,0 +1,78 @@
+package openai
+
+import (
+	"encoding/json"
+	"fmt"
+	"time"
+
+	"github.com/go-skynet/LocalAI/api/backend"
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/google/uuid"
+
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
+)
+
+// https://platform.openai.com/docs/api-reference/embeddings
+// EmbeddingsEndpoint returns the fiber handler serving embeddings requests.
+//
+// An embedding is computed with backend.ModelEmbedding for every entry of the
+// configuration's InputToken list and then for every entry of its InputStrings
+// list. Items are indexed by their position in the response, so indices stay
+// unique even when both lists are populated. The first failure aborts the
+// request.
+func EmbeddingsEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		model, input, err := readInput(c, o, true)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+		// readInput attaches a cancellable context to the request: release it
+		// once the handler is done so it does not pile up on the parent context.
+		defer input.Cancel()
+
+		config, input, err := readConfig(model, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		log.Debug().Msgf("Parameter Config: %+v", config)
+		items := []schema.Item{}
+
+		for _, s := range config.InputToken {
+			// get the model function to call for the result
+			embedFn, err := backend.ModelEmbedding("", s, o.Loader, *config, o)
+			if err != nil {
+				return err
+			}
+
+			embeddings, err := embedFn()
+			if err != nil {
+				return err
+			}
+			// len(items) is the position of the new item in the response
+			items = append(items, schema.Item{Embedding: embeddings, Index: len(items), Object: "embedding"})
+		}
+
+		for _, s := range config.InputStrings {
+			// get the model function to call for the result
+			embedFn, err := backend.ModelEmbedding(s, []int{}, o.Loader, *config, o)
+			if err != nil {
+				return err
+			}
+
+			embeddings, err := embedFn()
+			if err != nil {
+				return err
+			}
+			// keep counting after the token inputs rather than restarting from zero
+			items = append(items, schema.Item{Embedding: embeddings, Index: len(items), Object: "embedding"})
+		}
+
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Model:   input.Model, // we have to return what the user sent here, due to OpenAI spec.
+			Data:    items,
+			Object:  "list",
+		}
+
+		jsonResult, _ := json.Marshal(resp)
+		log.Debug().Msgf("Response: %s", jsonResult)
+
+		// Return the prediction in the response body
+		return c.JSON(resp)
+	}
+}
--- a/api/openai/image.go
+++ b/api/openai/image.go
@@ -0,0 +1,194 @@
+package openai
+
+import (
+	"bufio"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/go-skynet/LocalAI/api/schema"
+	"github.com/google/uuid"
+
+	"github.com/go-skynet/LocalAI/api/backend"
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
+)
+
+// https://platform.openai.com/docs/api-reference/images/create
+
+/*
+*
+
+	curl http://localhost:8080/v1/images/generations \
+	  -H "Content-Type: application/json" \
+	  -d '{
+	    "prompt": "A cute baby sea otter",
+	    "n": 1,
+	    "size": "512x512"
+	  }'
+
+*
+*/
+// ImageEndpoint returns the fiber handler serving image generation requests.
+//
+// The model defaults to model.StableDiffusionBackend when the request names
+// none. An optional base64 source image (input.File) is decoded to a temporary
+// file in o.ImageDir that is removed when the handler returns. "size" must be
+// formatted as "<width>x<height>". Each prompt may carry a negative prompt
+// after a "|" separator and is generated input.N times (once when N is 0).
+//
+// With response format "b64_json" the images are returned inline and their
+// temporary files are deleted; otherwise they are kept in o.ImageDir and
+// referenced by URL under "/generated-images/".
+func ImageEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		m, input, err := readInput(c, o, false)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+		// readInput attaches a cancellable context to the request: release it
+		// once the handler is done so it does not pile up on the parent context.
+		defer input.Cancel()
+
+		if m == "" {
+			m = model.StableDiffusionBackend
+		}
+		log.Debug().Msgf("Loading model: %+v", m)
+
+		config, input, err := readConfig(m, input, cm, o.Loader, o.Debug, 0, 0, false)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		src := ""
+		if input.File != "" {
+			//base 64 decode the file and write it somewhere
+			// that we will cleanup
+			decoded, err := base64.StdEncoding.DecodeString(input.File)
+			if err != nil {
+				return err
+			}
+			// Create a temporary file
+			outputFile, err := os.CreateTemp(o.ImageDir, "b64")
+			if err != nil {
+				return err
+			}
+			src = outputFile.Name()
+			// Registered right away so that the file is also removed when writing fails
+			defer os.RemoveAll(src)
+
+			// write the base64 result
+			writer := bufio.NewWriter(outputFile)
+			_, err = writer.Write(decoded)
+			if err == nil {
+				// Without a flush, data still sitting in the buffer never reaches the file
+				err = writer.Flush()
+			}
+			if closeErr := outputFile.Close(); err == nil {
+				err = closeErr
+			}
+			if err != nil {
+				return err
+			}
+		}
+
+		log.Debug().Msgf("Parameter Config: %+v", config)
+
+		// XXX: Only stablediffusion is supported for now
+		if config.Backend == "" {
+			config.Backend = model.StableDiffusionBackend
+		}
+
+		sizeParts := strings.Split(input.Size, "x")
+		if len(sizeParts) != 2 {
+			return fmt.Errorf("Invalid value for 'size'")
+		}
+		width, err := strconv.Atoi(sizeParts[0])
+		if err != nil {
+			return fmt.Errorf("Invalid value for 'size'")
+		}
+		height, err := strconv.Atoi(sizeParts[1])
+		if err != nil {
+			return fmt.Errorf("Invalid value for 'size'")
+		}
+
+		b64JSON := input.ResponseFormat == "b64_json"
+
+		// The values below do not depend on the prompt: compute them once.
+		n := input.N
+		if n == 0 {
+			n = 1
+		}
+
+		mode := input.Mode
+
+		// Request value first, then the configured one, then 15
+		step := config.Step
+		if step == 0 {
+			step = 15
+		}
+		if input.Step != 0 {
+			step = input.Step
+		}
+
+		// Inline images go through the default temporary directory (empty dir
+		// argument of os.CreateTemp), images served by URL are created in o.ImageDir.
+		tempDir := ""
+		if !b64JSON {
+			tempDir = o.ImageDir
+		}
+
+		baseURL := c.BaseURL()
+
+		// src and clip_skip
+		var result []schema.Item
+		for _, i := range config.PromptStrings {
+			prompts := strings.Split(i, "|")
+			positive_prompt := prompts[0]
+			negative_prompt := ""
+			if len(prompts) > 1 {
+				negative_prompt = prompts[1]
+			}
+
+			for j := 0; j < n; j++ {
+				// Create a temporary file
+				outputFile, err := os.CreateTemp(tempDir, "b64")
+				if err != nil {
+					return err
+				}
+				outputFile.Close()
+				output := outputFile.Name() + ".png"
+				// Rename the temporary file
+				err = os.Rename(outputFile.Name(), output)
+				if err != nil {
+					os.RemoveAll(outputFile.Name())
+					return err
+				}
+
+				fn, err := backend.ImageGeneration(height, width, mode, step, input.Seed, positive_prompt, negative_prompt, src, output, o.Loader, *config, o)
+				if err == nil {
+					err = fn()
+				}
+				if err != nil {
+					// Do not leave the placeholder file behind when the generation failed
+					os.RemoveAll(output)
+					return err
+				}
+
+				item := schema.Item{}
+
+				if b64JSON {
+					data, err := os.ReadFile(output)
+					// The file only carried the image to this point: drop it now
+					// rather than keeping every file until the handler returns
+					os.RemoveAll(output)
+					if err != nil {
+						return err
+					}
+					item.B64JSON = base64.StdEncoding.EncodeToString(data)
+				} else {
+					base := filepath.Base(output)
+					item.URL = baseURL + "/generated-images/" + base
+				}
+
+				result = append(result, item)
+			}
+		}
+
+		id := uuid.New().String()
+		created := int(time.Now().Unix())
+		resp := &schema.OpenAIResponse{
+			ID:      id,
+			Created: created,
+			Data:    result,
+		}
+
+		jsonResult, _ := json.Marshal(resp)
+		log.Debug().Msgf("Response: %s", jsonResult)
+
+		// Return the prediction in the response body
+		return c.JSON(resp)
+	}
+}
--- a/api/openai/inference.go
+++ b/api/openai/inference.go
@@ -0,0 +1,55 @@
+package openai
+
+import (
+	"github.com/go-skynet/LocalAI/api/backend"
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+)
+
+func ComputeChoices(
+	req *schema.OpenAIRequest,
+	predInput string,
+	config *config.Config,
+	o *options.Option,
+	loader *model.ModelLoader,
+	cb func(string, *[]schema.Choice),
+	tokenCallback func(string, backend.TokenUsage) bool) ([]schema.Choice, backend.TokenUsage, error) {
+	n := req.N // number of completions to return
+	result := []schema.Choice{}
+
+	if n == 0 {
+		n = 1
+	}
+
+	images := []string{}
+	for _, m := range req.Messages {
+		images = append(images, m.StringImages...)
+	}
+
+	// get the model function to call for the result
+	predFunc, err := backend.ModelInference(req.Context, predInput, images, loader, *config, o, tokenCallback)
+	if err != nil {
+		return result, backend.TokenUsage{}, err
+	}
+
+	tokenUsage := backend.TokenUsage{}
+
+	for i := 0; i < n; i++ {
+		prediction, err := predFunc()
+		if err != nil {
+			return result, backend.TokenUsage{}, err
+		}
+
+		tokenUsage.Prompt += prediction.Usage.Prompt
+		tokenUsage.Completion += prediction.Usage.Completion
+
+		finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
+		cb(finetunedResponse, &result)
+
+		//result = append(result, Choice{Text: prediction})
+
+	}
+	return result, tokenUsage, err
+}
--- a/api/openai/list.go
+++ b/api/openai/list.go
@@ -0,0 +1,69 @@
+package openai
+
+import (
+	"regexp"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/schema"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/gofiber/fiber/v2"
+)
+
+func ListModelsEndpoint(loader *model.ModelLoader, cm *config.ConfigLoader) func(ctx *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		models, err := loader.ListModels()
+		if err != nil {
+			return err
+		}
+		var mm map[string]interface{} = map[string]interface{}{}
+
+		dataModels := []schema.OpenAIModel{}
+
+		var filterFn func(name string) bool
+		filter := c.Query("filter")
+
+		// If filter is not specified, do not filter the list by model name
+		if filter == "" {
+			filterFn = func(_ string) bool { return true }
+		} else {
+			// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
+			rxp, err := regexp.Compile(filter)
+			if err != nil {
+				return err
+			}
+			filterFn = func(name string) bool {
+				return rxp.MatchString(name)
+			}
+		}
+
+		// By default, exclude any loose files that are already referenced by a configuration file.
+		excludeConfigured := c.QueryBool("excludeConfigured", true)
+
+		// Start with the known configurations
+		for _, c := range cm.GetAllConfigs() {
+			if excludeConfigured {
+				mm[c.Model] = nil
+			}
+
+			if filterFn(c.Name) {
+				dataModels = append(dataModels, schema.OpenAIModel{ID: c.Name, Object: "model"})
+			}
+		}
+
+		// Then iterate through the loose files:
+		for _, m := range models {
+			// And only adds them if they shouldn't be skipped.
+			if _, exists := mm[m]; !exists && filterFn(m) {
+				dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
+			}
+		}
+
+		return c.JSON(struct {
+			Object string               `json:"object"`
+			Data   []schema.OpenAIModel `json:"data"`
+		}{
+			Object: "list",
+			Data:   dataModels,
+		})
+	}
+}
--- a/api/openai/request.go
+++ b/api/openai/request.go
@@ -0,0 +1,336 @@
+package openai
+
+import (
+	"context"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+	options "github.com/go-skynet/LocalAI/api/options"
+	"github.com/go-skynet/LocalAI/api/schema"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
+)
+
+// readInput parses the request body into a schema.OpenAIRequest and resolves
+// the model to use.
+//
+// The model is picked with the following precedence (highest first):
+//  1. the bearer token of the Authorization header, when it names an entry
+//     that exists in the model path;
+//  2. the "model" route parameter;
+//  3. the model named in the request body;
+//  4. when randomModel is true, the first model returned by the loader.
+//
+// The returned request carries a cancellable context derived from o.Context.
+// It is cancelled here on every error path; on success input.Cancel is handed
+// to the caller.
+func readInput(c *fiber.Ctx, o *options.Option, randomModel bool) (string, *schema.OpenAIRequest, error) {
+	loader := o.Loader
+	input := new(schema.OpenAIRequest)
+	ctx, cancel := context.WithCancel(o.Context)
+	input.Context = ctx
+	input.Cancel = cancel
+	// Get input data from the request body
+	if err := c.BodyParser(input); err != nil {
+		cancel() // the request is discarded: do not leak its context
+		return "", nil, fmt.Errorf("failed parsing request body: %w", err)
+	}
+
+	modelFile := input.Model
+	if routeModel := c.Params("model"); routeModel != "" {
+		modelFile = routeModel
+	}
+
+	// Debug dump of the request; a serialization failure must be visible
+	// instead of producing an empty log line.
+	if received, err := json.Marshal(input); err == nil {
+		log.Debug().Msgf("Request received: %s", string(received))
+	} else {
+		log.Debug().Msgf("Request received (could not be serialized for logging: %s)", err.Error())
+	}
+
+	// Set model from bearer token, if available.
+	// TrimPrefix removes the literal "Bearer " scheme only; TrimLeft would
+	// treat it as a character set and also eat the leading letters of the token.
+	bearer := strings.TrimPrefix(c.Get("authorization"), "Bearer ")
+	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
+
+	// If no model was specified, take the first available
+	if modelFile == "" && !bearerExists && randomModel {
+		models, _ := loader.ListModels()
+		if len(models) == 0 {
+			log.Debug().Msgf("No model specified, returning error")
+			cancel() // the request is discarded: do not leak its context
+			return "", nil, fmt.Errorf("no model specified")
+		}
+		modelFile = models[0]
+		log.Debug().Msgf("No model specified, using: %s", modelFile)
+	}
+
+	// If a model is found in bearer token takes precedence
+	if bearerExists {
+		log.Debug().Msgf("Using model from bearer token: %s", bearer)
+		modelFile = bearer
+	}
+	return modelFile, input, nil
+}
+
+// getBase64Image returns the base64 payload of the image referenced by s.
+//
+// s may be either:
+//   - an http:// or https:// URL: the image is downloaded in memory and
+//     encoded with standard base64;
+//   - a base64 image data URI ("data:image/<type>;base64,<payload>"): the
+//     header is dropped and the payload is returned as-is.
+//
+// Any other input, or a download answered with a non-2xx status, is an error.
+//
+// NOTE(review): the URL is client-supplied and fetched with the default HTTP
+// client (no timeout, no destination restrictions) - consider a bounded client
+// and an allow-list if this endpoint is exposed to untrusted callers.
+func getBase64Image(s string) (string, error) {
+	if strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://") {
+		// download the image
+		resp, err := http.Get(s)
+		if err != nil {
+			return "", err
+		}
+		defer resp.Body.Close()
+
+		// Do not encode an error page as if it were the image.
+		if resp.StatusCode < 200 || resp.StatusCode > 299 {
+			return "", fmt.Errorf("failed downloading image: unexpected status %s", resp.Status)
+		}
+
+		// read the image data into memory
+		data, err := ioutil.ReadAll(resp.Body)
+		if err != nil {
+			return "", err
+		}
+
+		return base64.StdEncoding.EncodeToString(data), nil
+	}
+
+	// Data URI of any image type: strip everything up to and including the
+	// ";base64," marker and keep the payload.
+	const marker = ";base64,"
+	if strings.HasPrefix(s, "data:image/") {
+		if idx := strings.Index(s, marker); idx != -1 {
+			return s[idx+len(marker):], nil
+		}
+	}
+	return "", fmt.Errorf("not valid string")
+}
+
+// updateConfig overrides the fields of config with the values explicitly set
+// in the request, and normalizes the request messages.
+//
+// Scalar options are copied only when they differ from their zero value, so a
+// request cannot reset a configured option back to zero/false/empty. Stop
+// words, inputs and prompts are appended to the ones already in config.
+//
+// For every message, Content is flattened in place into StringContent and
+// StringImages: each successfully loaded image is stored as base64 and
+// announced by an "[img-N]" placeholder prepended to the message text, where N
+// counts images across the whole request.
+func updateConfig(config *config.Config, input *schema.OpenAIRequest) {
+	if input.Echo {
+		config.Echo = input.Echo
+	}
+	if input.TopK != 0 {
+		config.TopK = input.TopK
+	}
+	if input.TopP != 0 {
+		config.TopP = input.TopP
+	}
+
+	if input.Backend != "" {
+		config.Backend = input.Backend
+	}
+
+	if input.ClipSkip != 0 {
+		config.Diffusers.ClipSkip = input.ClipSkip
+	}
+
+	if input.ModelBaseName != "" {
+		config.AutoGPTQ.ModelBaseName = input.ModelBaseName
+	}
+
+	if input.NegativePromptScale != 0 {
+		config.NegativePromptScale = input.NegativePromptScale
+	}
+
+	if input.UseFastTokenizer {
+		config.UseFastTokenizer = input.UseFastTokenizer
+	}
+
+	if input.NegativePrompt != "" {
+		config.NegativePrompt = input.NegativePrompt
+	}
+
+	if input.RopeFreqBase != 0 {
+		config.RopeFreqBase = input.RopeFreqBase
+	}
+
+	if input.RopeFreqScale != 0 {
+		config.RopeFreqScale = input.RopeFreqScale
+	}
+
+	if input.Grammar != "" {
+		config.Grammar = input.Grammar
+	}
+
+	if input.Temperature != 0 {
+		config.Temperature = input.Temperature
+	}
+
+	if input.Maxtokens != 0 {
+		config.Maxtokens = input.Maxtokens
+	}
+
+	// "stop" is either a single string or a list of strings.
+	switch stop := input.Stop.(type) {
+	case string:
+		if stop != "" {
+			config.StopWords = append(config.StopWords, stop)
+		}
+	case []interface{}:
+		for _, pp := range stop {
+			if s, ok := pp.(string); ok {
+				config.StopWords = append(config.StopWords, s)
+			}
+		}
+	}
+
+	// Decode each request's message content
+	index := 0 // image counter, shared by all the messages of the request
+	for i, m := range input.Messages {
+		switch content := m.Content.(type) {
+		case string:
+			input.Messages[i].StringContent = content
+		case []interface{}:
+			// Round-trip through JSON to turn the generic list into typed parts.
+			dat, _ := json.Marshal(content)
+			parts := []schema.Content{}
+			if err := json.Unmarshal(dat, &parts); err != nil {
+				// Best effort: keep going with whatever could be decoded.
+				log.Error().Err(err).Msg("Failed decoding message content")
+			}
+
+			// Text and placeholders are collected separately and joined at
+			// the end, so that the result does not depend on whether the
+			// text part comes before or after the image parts.
+			text := m.StringContent
+			placeholders := ""
+			for _, part := range parts {
+				switch part.Type {
+				case "text":
+					text = part.Text
+				case "image_url":
+					// Detect if part.ImageURL is an URL, if it is download the image and encode it in base64:
+					img, err := getBase64Image(part.ImageURL.URL)
+					if err != nil {
+						log.Error().Err(err).Msg("Failed encoding image")
+						continue
+					}
+					input.Messages[i].StringImages = append(input.Messages[i].StringImages, img) // TODO: make sure that we only return base64 stuff
+					// set a placeholder for each image
+					placeholders = fmt.Sprintf("[img-%d]", index) + placeholders
+					index++
+				}
+			}
+			input.Messages[i].StringContent = placeholders + text
+		}
+	}
+
+	if input.RepeatPenalty != 0 {
+		config.RepeatPenalty = input.RepeatPenalty
+	}
+
+	if input.Keep != 0 {
+		config.Keep = input.Keep
+	}
+
+	if input.Batch != 0 {
+		config.Batch = input.Batch
+	}
+
+	if input.F16 {
+		config.F16 = input.F16
+	}
+
+	if input.IgnoreEOS {
+		config.IgnoreEOS = input.IgnoreEOS
+	}
+
+	if input.Seed != 0 {
+		config.Seed = input.Seed
+	}
+
+	if input.Mirostat != 0 {
+		config.LLMConfig.Mirostat = input.Mirostat
+	}
+
+	if input.MirostatETA != 0 {
+		config.LLMConfig.MirostatETA = input.MirostatETA
+	}
+
+	if input.MirostatTAU != 0 {
+		config.LLMConfig.MirostatTAU = input.MirostatTAU
+	}
+
+	if input.TypicalP != 0 {
+		config.TypicalP = input.TypicalP
+	}
+
+	// "input" is a string, a list of strings, or a list of token lists.
+	switch inputs := input.Input.(type) {
+	case string:
+		if inputs != "" {
+			config.InputStrings = append(config.InputStrings, inputs)
+		}
+	case []interface{}:
+		for _, pp := range inputs {
+			switch i := pp.(type) {
+			case string:
+				config.InputStrings = append(config.InputStrings, i)
+			case []interface{}:
+				tokens := []int{}
+				for _, ii := range i {
+					// JSON numbers decode as float64; anything else is
+					// malformed client input and is skipped instead of
+					// panicking on the type assertion.
+					n, ok := ii.(float64)
+					if !ok {
+						log.Debug().Msgf("Ignoring non-numeric input token: %v", ii)
+						continue
+					}
+					tokens = append(tokens, int(n))
+				}
+				config.InputToken = append(config.InputToken, tokens)
+			}
+		}
+	}
+
+	// Can be either a string or an object
+	switch fnc := input.FunctionCall.(type) {
+	case string:
+		if fnc != "" {
+			config.SetFunctionCallString(fnc)
+		}
+	case map[string]interface{}:
+		// A missing or non-string "name" results in an empty name.
+		name, _ := fnc["name"].(string)
+		config.SetFunctionCallNameString(name)
+	}
+
+	// "prompt" is either a single string or a list of strings.
+	switch p := input.Prompt.(type) {
+	case string:
+		config.PromptStrings = append(config.PromptStrings, p)
+	case []interface{}:
+		for _, pp := range p {
+			if s, ok := pp.(string); ok {
+				config.PromptStrings = append(config.PromptStrings, s)
+			}
+		}
+	}
+}
+
+// readConfig returns the configuration to use for modelFile, updated with the
+// options carried by the request.
+//
+// The configuration is looked up in this order:
+//  1. a configuration already known to cm under the name modelFile;
+//  2. the file "<ModelPath>/<modelFile>.yaml", loaded into cm on demand;
+//  3. config.DefaultConfig(modelFile) with the given ctx, threads, f16 and
+//     debug values.
+//
+// After the request overrides are applied, Threads is never left at 0 (it
+// falls back to threads, then to 4) and debug, when true, forces Debug on.
+// input is returned as passed in; its messages are normalized by updateConfig.
+func readConfig(modelFile string, input *schema.OpenAIRequest, cm *config.ConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.Config, *schema.OpenAIRequest, error) {
+	// Load a config file if present after the model name
+	modelConfig := filepath.Join(loader.ModelPath, modelFile+".yaml")
+
+	cfgExisting, exists := cm.GetConfig(modelFile)
+	if !exists {
+		if _, err := os.Stat(modelConfig); err == nil {
+			if err := cm.LoadConfig(modelConfig); err != nil {
+				// %w keeps the same message and preserves the error chain.
+				return nil, nil, fmt.Errorf("failed loading model config (%s) %w", modelConfig, err)
+			}
+			// The file may define a configuration under a different name.
+			cfgExisting, exists = cm.GetConfig(modelFile)
+		}
+	}
+
+	var cfg *config.Config
+	if exists {
+		cfg = &cfgExisting
+	} else {
+		cfg = config.DefaultConfig(modelFile)
+		cfg.ContextSize = ctx
+		cfg.Threads = threads
+		cfg.F16 = f16
+		cfg.Debug = debug
+	}
+
+	// Set the parameters for the language model prediction
+	updateConfig(cfg, input)
+
+	// Don't allow 0 as setting
+	if cfg.Threads == 0 {
+		if threads != 0 {
+			cfg.Threads = threads
+		} else {
+			cfg.Threads = 4
+		}
+	}
+
+	// Enforce debug flag if passed from CLI
+	if debug {
+		cfg.Debug = true
+	}
+
+	return cfg, input, nil
+}
--- a/api/openai/transcription.go
+++ b/api/openai/transcription.go
@@ -0,0 +1,71 @@
+package openai
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path"
+	"path/filepath"
+
+	"github.com/go-skynet/LocalAI/api/backend"
+	config "github.com/go-skynet/LocalAI/api/config"
+	"github.com/go-skynet/LocalAI/api/options"
+
+	"github.com/gofiber/fiber/v2"
+	"github.com/rs/zerolog/log"
+)
+
+// TranscriptEndpoint returns the handler of the audio transcription API.
+// https://platform.openai.com/docs/api-reference/audio/create
+//
+// The uploaded "file" form field is copied into a temporary directory, which
+// is removed when the handler returns, and the resulting path is handed to
+// backend.ModelTranscription together with the requested language. The
+// transcription result is returned as JSON.
+func TranscriptEndpoint(cm *config.ConfigLoader, o *options.Option) func(c *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
+		m, input, err := readInput(c, o, false)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+
+		cfg, input, err := readConfig(m, input, cm, o.Loader, o.Debug, o.Threads, o.ContextSize, o.F16)
+		if err != nil {
+			return fmt.Errorf("failed reading parameters from request:%w", err)
+		}
+		// retrieve the file data from the request
+		file, err := c.FormFile("file")
+		if err != nil {
+			return err
+		}
+		f, err := file.Open()
+		if err != nil {
+			return err
+		}
+		defer f.Close()
+
+		dir, err := os.MkdirTemp("", "whisper")
+		if err != nil {
+			return err
+		}
+		defer os.RemoveAll(dir)
+
+		// Only the base name of the client-supplied file name is used, so the
+		// copy always lands inside the temporary directory.
+		dst := filepath.Join(dir, path.Base(file.Filename))
+		dstFile, err := os.Create(dst)
+		if err != nil {
+			return err
+		}
+
+		if _, err := io.Copy(dstFile, f); err != nil {
+			dstFile.Close()
+			log.Debug().Msgf("Audio file copying error %+v - %+v - err %+v", file.Filename, dst, err)
+			return err
+		}
+		// Close before the backend reads the file: releases the descriptor
+		// and surfaces any pending write error.
+		if err := dstFile.Close(); err != nil {
+			return err
+		}
+
+		log.Debug().Msgf("Audio file copied to: %+v", dst)
+
+		tr, err := backend.ModelTranscription(dst, input.Language, o.Loader, *cfg, o)
+		if err != nil {
+			return err
+		}
+
+		log.Debug().Msgf("Trascribed: %+v", tr)
+		// TODO: handle different outputs here
+		return c.Status(http.StatusOK).JSON(tr)
+	}
+}
--- a/api/options/options.go
+++ b/api/options/options.go
@@ -0,0 +1,208 @@
+package options
+
+import (
+	"context"
+	"embed"
+	"encoding/json"
+
+	"github.com/go-skynet/LocalAI/pkg/gallery"
+	model "github.com/go-skynet/LocalAI/pkg/model"
+	"github.com/go-skynet/LocalAI/metrics"
+	"github.com/rs/zerolog/log"
+)
+
+// Option collects the application settings. It is built by NewOptions, which
+// fills in a few defaults and then applies the AppOption functions it is given.
+type Option struct {
+	Context                             context.Context
+	ConfigFile                          string
+	Loader                              *model.ModelLoader
+	UploadLimitMB, Threads, ContextSize int
+	F16                                 bool
+	Debug, DisableMessage               bool
+	ImageDir                            string
+	AudioDir                            string
+	CORS                                bool
+	PreloadJSONModels                   string
+	PreloadModelsFromPath               string
+	CORSAllowOrigins                    string
+	ApiKeys                             []string
+	Metrics                             *metrics.Metrics
+
+	Galleries []gallery.Gallery
+
+	BackendAssets     embed.FS
+	AssetsDestination string
+
+	// ExternalGRPCBackends maps a backend name to its URI (see WithExternalBackend).
+	ExternalGRPCBackends map[string]string
+
+	AutoloadGalleries bool
+
+	SingleBackend bool
+}
+
+// AppOption is a functional option: it mutates the Option it receives.
+type AppOption func(*Option)
+
+// NewOptions builds an Option from the defaults below and then applies the
+// given AppOption functions in order, so later ones override earlier ones.
+func NewOptions(o ...AppOption) *Option {
+	defaults := Option{
+		Context:        context.Background(),
+		UploadLimitMB:  15,
+		Threads:        1,
+		ContextSize:    512,
+		Debug:          true,
+		DisableMessage: true,
+	}
+
+	result := &defaults
+	for i := range o {
+		o[i](result)
+	}
+	return result
+}
+
+// WithCors sets Option.CORS.
+func WithCors(b bool) AppOption {
+	return func(o *Option) {
+		o.CORS = b
+	}
+}
+
+// EnableSingleBackend is an AppOption that sets Option.SingleBackend to true.
+var EnableSingleBackend = func(o *Option) {
+	o.SingleBackend = true
+}
+
+// EnableGalleriesAutoload is an AppOption that sets Option.AutoloadGalleries to true.
+var EnableGalleriesAutoload = func(o *Option) {
+	o.AutoloadGalleries = true
+}
+
+// WithExternalBackend registers uri under name in Option.ExternalGRPCBackends,
+// creating the map on first use. A later registration with the same name
+// replaces the earlier one.
+func WithExternalBackend(name string, uri string) AppOption {
+	return func(o *Option) {
+		if o.ExternalGRPCBackends == nil {
+			o.ExternalGRPCBackends = make(map[string]string)
+		}
+		o.ExternalGRPCBackends[name] = uri
+	}
+}
+
+// WithCorsAllowOrigins sets Option.CORSAllowOrigins.
+func WithCorsAllowOrigins(b string) AppOption {
+	return func(o *Option) {
+		o.CORSAllowOrigins = b
+	}
+}
+
+// WithBackendAssetsOutput sets Option.AssetsDestination.
+func WithBackendAssetsOutput(out string) AppOption {
+	return func(o *Option) {
+		o.AssetsDestination = out
+	}
+}
+
+// WithBackendAssets sets Option.BackendAssets.
+func WithBackendAssets(f embed.FS) AppOption {
+	return func(o *Option) {
+		o.BackendAssets = f
+	}
+}
+
+// WithStringGalleries decodes galls as a JSON array of galleries and appends
+// the result to Option.Galleries.
+//
+// An empty string replaces Option.Galleries with an empty list. A decoding
+// error is logged and not reported to the caller.
+func WithStringGalleries(galls string) AppOption {
+	return func(o *Option) {
+		if galls == "" {
+			log.Debug().Msgf("no galleries to load")
+			o.Galleries = []gallery.Gallery{}
+			return
+		}
+		var galleries []gallery.Gallery
+		if err := json.Unmarshal([]byte(galls), &galleries); err != nil {
+			log.Error().Msgf("failed loading galleries: %s", err.Error())
+		}
+		o.Galleries = append(o.Galleries, galleries...)
+	}
+}
+
+// WithGalleries appends galleries to Option.Galleries.
+func WithGalleries(galleries []gallery.Gallery) AppOption {
+	return func(o *Option) {
+		o.Galleries = append(o.Galleries, galleries...)
+	}
+}
+
+// WithContext sets Option.Context.
+func WithContext(ctx context.Context) AppOption {
+	return func(o *Option) {
+		o.Context = ctx
+	}
+}
+
+// WithYAMLConfigPreload sets Option.PreloadModelsFromPath.
+func WithYAMLConfigPreload(configFile string) AppOption {
+	return func(o *Option) {
+		o.PreloadModelsFromPath = configFile
+	}
+}
+
+// WithJSONStringPreload sets Option.PreloadJSONModels.
+func WithJSONStringPreload(configFile string) AppOption {
+	return func(o *Option) {
+		o.PreloadJSONModels = configFile
+	}
+}
+// WithConfigFile sets Option.ConfigFile.
+func WithConfigFile(configFile string) AppOption {
+	return func(o *Option) {
+		o.ConfigFile = configFile
+	}
+}
+
+// WithModelLoader sets Option.Loader.
+func WithModelLoader(loader *model.ModelLoader) AppOption {
+	return func(o *Option) {
+		o.Loader = loader
+	}
+}
+
+// WithUploadLimitMB sets Option.UploadLimitMB.
+func WithUploadLimitMB(limit int) AppOption {
+	return func(o *Option) {
+		o.UploadLimitMB = limit
+	}
+}
+
+// WithThreads sets Option.Threads.
+func WithThreads(threads int) AppOption {
+	return func(o *Option) {
+		o.Threads = threads
+	}
+}
+
+// WithContextSize sets Option.ContextSize.
+func WithContextSize(ctxSize int) AppOption {
+	return func(o *Option) {
+		o.ContextSize = ctxSize
+	}
+}
+
+// WithF16 sets Option.F16.
+func WithF16(f16 bool) AppOption {
+	return func(o *Option) {
+		o.F16 = f16
+	}
+}
+
+// WithDebug sets Option.Debug.
+func WithDebug(debug bool) AppOption {
+	return func(o *Option) {
+		o.Debug = debug
+	}
+}
+
+// WithDisableMessage sets Option.DisableMessage.
+func WithDisableMessage(disableMessage bool) AppOption {
+	return func(o *Option) {
+		o.DisableMessage = disableMessage
+	}
+}
+
+// WithAudioDir sets Option.AudioDir.
+func WithAudioDir(audioDir string) AppOption {
+	return func(o *Option) {
+		o.AudioDir = audioDir
+	}
+}
+
+// WithImageDir sets Option.ImageDir.
+func WithImageDir(imageDir string) AppOption {
+	return func(o *Option) {
+		o.ImageDir = imageDir
+	}
+}
+
+// WithApiKeys sets Option.ApiKeys, replacing any previous value.
+func WithApiKeys(apiKeys []string) AppOption {
+	return func(o *Option) {
+		o.ApiKeys = apiKeys
+	}
+}
+
+// WithMetrics sets Option.Metrics.
+func WithMetrics(meter *metrics.Metrics) AppOption {
+	return func(o *Option) {
+		o.Metrics = meter
+	}
+}
--- a/api/schema/openai.go
+++ b/api/schema/openai.go
@@ -0,0 +1,129 @@
+package schema
+
+import (
+	"context"
+
+	config "github.com/go-skynet/LocalAI/api/config"
+
+	"github.com/go-skynet/LocalAI/pkg/grammar"
+)
+
+// APIError provides error information returned by the OpenAI API.
+type APIError struct {
+	Code    any     `json:"code,omitempty"`
+	Message string  `json:"message"`
+	Param   *string `json:"param,omitempty"`
+	Type    string  `json:"type"`
+}
+
+// ErrorResponse is the JSON envelope of an APIError ({"error": {...}}).
+type ErrorResponse struct {
+	Error *APIError `json:"error,omitempty"`
+}
+
+// OpenAIUsage carries the token counters of a response.
+type OpenAIUsage struct {
+	PromptTokens     int `json:"prompt_tokens"`
+	CompletionTokens int `json:"completion_tokens"`
+	TotalTokens      int `json:"total_tokens"`
+}
+
+// Item is one element of OpenAIResponse.Data. The same type carries an
+// embedding or, for images, a URL / base64 payload.
+type Item struct {
+	Embedding []float32 `json:"embedding"`
+	Index     int       `json:"index"`
+	Object    string    `json:"object,omitempty"`
+
+	// Images
+	URL     string `json:"url,omitempty"`
+	B64JSON string `json:"b64_json,omitempty"`
+}
+
+// OpenAIResponse is the response body shared by the endpoints: Choices and
+// Data are both optional and omitted from the JSON when empty, while Usage is
+// always serialized.
+type OpenAIResponse struct {
+	Created int      `json:"created,omitempty"`
+	Object  string   `json:"object,omitempty"`
+	ID      string   `json:"id,omitempty"`
+	Model   string   `json:"model,omitempty"`
+	Choices []Choice `json:"choices,omitempty"`
+	Data    []Item   `json:"data,omitempty"`
+
+	Usage OpenAIUsage `json:"usage"`
+}
+
+// Choice is one element of OpenAIResponse.Choices. Message, Delta and Text are
+// alternative payloads; the unset ones are omitted from the JSON.
+type Choice struct {
+	Index        int      `json:"index"`
+	FinishReason string   `json:"finish_reason,omitempty"`
+	Message      *Message `json:"message,omitempty"`
+	Delta        *Message `json:"delta,omitempty"`
+	Text         string   `json:"text,omitempty"`
+}
+
+// Content is one part of a multi-part message content: Type selects whether
+// Text or ImageURL is meaningful.
+type Content struct {
+	Type     string     `json:"type" yaml:"type"`
+	Text     string     `json:"text" yaml:"text"`
+	ImageURL ContentURL `json:"image_url" yaml:"image_url"`
+}
+
+// ContentURL wraps the URL of an image content part.
+type ContentURL struct {
+	URL string `json:"url" yaml:"url"`
+}
+
+// Message is a chat message, used both in requests and in responses.
+type Message struct {
+	// The message role
+	Role string `json:"role,omitempty" yaml:"role"`
+	// The message content: decoded from JSON it is either a string or a list
+	// of content parts, hence the open type.
+	Content interface{} `json:"content" yaml:"content"`
+
+	// Flattened text and images of the message.
+	StringContent string   `json:"string_content,omitempty" yaml:"string_content,omitempty"`
+	StringImages  []string `json:"string_images,omitempty" yaml:"string_images,omitempty"`
+
+	// A result of a function call
+	FunctionCall interface{} `json:"function_call,omitempty" yaml:"function_call,omitempty"`
+}
+
+// OpenAIModel is one entry of a model listing.
+type OpenAIModel struct {
+	ID     string `json:"id"`
+	Object string `json:"object"`
+}
+
+// OpenAIRequest is the request body shared by the OpenAI-compatible endpoints.
+// It embeds the prediction options and adds the endpoint specific fields; each
+// endpoint only reads the subset it needs.
+type OpenAIRequest struct {
+	config.PredictionOptions
+
+	// Context and Cancel are per-request runtime state attached by the
+	// server, not part of the payload. They are excluded from (de)serialization:
+	// encoding/json cannot marshal a func value, so without the "-" tag
+	// marshaling the whole request always fails.
+	Context context.Context    `json:"-" yaml:"-"`
+	Cancel  context.CancelFunc `json:"-" yaml:"-"`
+
+	// whisper
+	File string `json:"file" validate:"required"`
+	//whisper/image
+	ResponseFormat string `json:"response_format"`
+	// image
+	Size string `json:"size"`
+	// Prompt is read only by completion/image API calls.
+	// It is either a string or a list of strings.
+	Prompt interface{} `json:"prompt" yaml:"prompt"`
+
+	// Edit endpoint
+	Instruction string `json:"instruction" yaml:"instruction"`
+	// Input is a string, a list of strings, or a list of token lists.
+	Input interface{} `json:"input" yaml:"input"`
+
+	// Stop is either a string or a list of strings.
+	Stop interface{} `json:"stop" yaml:"stop"`
+
+	// Messages is read only by chat/completion API calls
+	Messages []Message `json:"messages" yaml:"messages"`
+
+	// A list of available functions to call
+	Functions    []grammar.Function `json:"functions" yaml:"functions"`
+	FunctionCall interface{}        `json:"function_call" yaml:"function_call"` // might be a string or an object
+
+	Stream bool `json:"stream"`
+
+	// Image (not supported by OpenAI)
+	Mode int `json:"mode"`
+	Step int `json:"step"`
+
+	// A grammar to constrain the LLM output
+	Grammar string `json:"grammar" yaml:"grammar"`
+
+	JSONFunctionGrammarObject *grammar.JSONFunctionStructure `json:"grammar_json_functions" yaml:"grammar_json_functions"`
+
+	Backend string `json:"backend" yaml:"backend"`
+
+	// AutoGPTQ
+	ModelBaseName string `json:"model_base_name" yaml:"model_base_name"`
+}
--- a/api/schema/whisper.go
+++ b/api/schema/whisper.go
@@ -0,0 +1,16 @@
+package schema
+
+import "time"
+
+// Segment is one timed piece of a transcription.
+// Start and End are time.Duration values, which encoding/json writes as an
+// integer number of nanoseconds.
+type Segment struct {
+	Id     int           `json:"id"`
+	Start  time.Duration `json:"start"`
+	End    time.Duration `json:"end"`
+	Text   string        `json:"text"`
+	Tokens []int         `json:"tokens"`
+}
+
+// Result is a transcription: the individual segments plus a Text field for
+// the transcription as a whole.
+type Result struct {
+	Segments []Segment `json:"segments"`
+	Text     string    `json:"text"`
+}
--- a/assets.go
+++ b/assets.go
@@ -0,0 +1,6 @@
+package main
+
+import "embed"
+
+// backendAssets embeds into the binary, at build time, everything matched by
+// the backend-assets/* pattern. The directory must exist and match at least
+// one file when building, otherwise the embed directive fails to compile.
+//
+//go:embed backend-assets/*
+var backendAssets embed.FS
--- a/backend/cpp/grpc/.gitignore
+++ b/backend/cpp/grpc/.gitignore
@@ -0,0 +1,3 @@
+installed_packages/
+grpc_build/
+grpc_repo/
--- a/backend/cpp/grpc/script/build_grpc.sh
+++ b/backend/cpp/grpc/script/build_grpc.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+# Builds locally from sources the packages needed by the llama cpp backend.
+#
+# Usage: build_grpc.sh <install-dir>
+#
+# gRPC is cloned, built and installed into <install-dir>. If <install-dir>
+# already exists the script does nothing, so that it can be called on every
+# build. Any failure aborts the script and removes the partial installation:
+# otherwise the next run would find the directory and skip the build.
+
+# Makes sure a few base packages exist.
+# sudo apt-get --no-upgrade -y install g++ gcc binutils cmake git build-essential autoconf libtool pkg-config
+
+# Stop at the first failing command (including the ones inside pipelines).
+set -eo pipefail
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+echo "Script directory: $SCRIPT_DIR"
+
+CPP_INSTALLED_PACKAGES_DIR="${1:-}"
+if [ -z "${CPP_INSTALLED_PACKAGES_DIR}" ]; then
+    echo "Installation directory not given as first argument. Don't know where to install: failed."
+    echo
+    exit 1
+fi
+
+if [ -d "${CPP_INSTALLED_PACKAGES_DIR}" ]; then
+  echo "gRPC installation directory already exists. Nothing to do."
+  exit 0
+fi
+
+# From here on the installation directory is owned by this run: on failure it
+# is removed, so that a broken installation is never mistaken for a valid one.
+cleanup_on_failure() {
+  status=$?
+  if [ "${status}" -ne 0 ]; then
+    echo "gRPC build failed (exit code ${status}): removing ${CPP_INSTALLED_PACKAGES_DIR}"
+    rm -rf "${CPP_INSTALLED_PACKAGES_DIR}"
+  fi
+}
+trap cleanup_on_failure EXIT
+
+# The depth when cloning a git repo. 1 speeds up the clone when the repo history is not needed.
+GIT_CLONE_DEPTH=1
+
+# Leave one core free; fall back to a single job where nproc is not available.
+NUM_BUILD_THREADS=$(nproc --ignore=1 2>/dev/null || echo 1)
+
+# Google gRPC --------------------------------------------------------------------------------------
+TAG_LIB_GRPC="v1.59.0"
+GIT_REPO_LIB_GRPC="https://github.com/grpc/grpc.git"
+GRPC_REPO_DIR="${SCRIPT_DIR}/../grpc_repo"
+GRPC_BUILD_DIR="${SCRIPT_DIR}/../grpc_build"
+SRC_DIR_LIB_GRPC="${GRPC_REPO_DIR}/grpc"
+
+echo "SRC_DIR_LIB_GRPC: ${SRC_DIR_LIB_GRPC}"
+echo "GRPC_REPO_DIR: ${GRPC_REPO_DIR}"
+echo "GRPC_BUILD_DIR: ${GRPC_BUILD_DIR}"
+
+mkdir -pv "${GRPC_REPO_DIR}"
+
+# Always start from a clean build directory.
+rm -rf "${GRPC_BUILD_DIR}"
+mkdir -pv "${GRPC_BUILD_DIR}"
+
+mkdir -pv "${CPP_INSTALLED_PACKAGES_DIR}"
+
+if [ -d "${SRC_DIR_LIB_GRPC}" ]; then
+  echo "gRPC source already exists locally. Not cloned again."
+else
+  ( cd "${GRPC_REPO_DIR}" && \
+    git clone --depth "${GIT_CLONE_DEPTH}" -b "${TAG_LIB_GRPC}" "${GIT_REPO_LIB_GRPC}" && \
+    cd "${SRC_DIR_LIB_GRPC}" && \
+    git submodule update --init --recursive --depth "${GIT_CLONE_DEPTH}"
+  )
+fi
+
+( cd "${GRPC_BUILD_DIR}" && \
+  cmake -G "Unix Makefiles" \
+     -DCMAKE_BUILD_TYPE=Release \
+     -DgRPC_INSTALL=ON \
+     -DEXECUTABLE_OUTPUT_PATH="${CPP_INSTALLED_PACKAGES_DIR}/grpc/bin" \
+     -DLIBRARY_OUTPUT_PATH="${CPP_INSTALLED_PACKAGES_DIR}/grpc/lib" \
+     -DgRPC_BUILD_TESTS=OFF \
+     -DgRPC_BUILD_CSHARP_EXT=OFF \
+     -DgRPC_BUILD_GRPC_CPP_PLUGIN=ON \
+     -DgRPC_BUILD_GRPC_CSHARP_PLUGIN=OFF \
+     -DgRPC_BUILD_GRPC_NODE_PLUGIN=OFF \
+     -DgRPC_BUILD_GRPC_OBJECTIVE_C_PLUGIN=OFF \
+     -DgRPC_BUILD_GRPC_PHP_PLUGIN=OFF \
+     -DgRPC_BUILD_GRPC_PYTHON_PLUGIN=ON \
+     -DgRPC_BUILD_GRPC_RUBY_PLUGIN=OFF \
+     -Dprotobuf_WITH_ZLIB=ON \
+     -DRE2_BUILD_TESTING=OFF \
+     -DCMAKE_INSTALL_PREFIX="${CPP_INSTALLED_PACKAGES_DIR}/" \
+     "${SRC_DIR_LIB_GRPC}"  && \
+  cmake --build .  -- -j "${NUM_BUILD_THREADS}" && \
+  cmake --build .  --target install -- -j "${NUM_BUILD_THREADS}"
+)
+
+# The sources and the build tree are not needed once the packages are installed.
+rm -rf "${GRPC_BUILD_DIR}"
+rm -rf "${GRPC_REPO_DIR}"
+
--- a/backend/cpp/llama/CMakeLists.txt
+++ b/backend/cpp/llama/CMakeLists.txt
@@ -0,0 +1,74 @@
+
+## XXX: In some versions of CMake clip wasn't being built before llama.
+## This is a hack for now, but it should be fixed in the future.
+## clip.cpp/clip.h are expected next to this file and are built here as a
+## separate library (myclip) that the grpc-server target links against.
+set(TARGET myclip)
+add_library(${TARGET} clip.cpp clip.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (NOT MSVC)
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+
+set(TARGET grpc-server)
+# END CLIP hack
+set(CMAKE_CXX_STANDARD 17)
+cmake_minimum_required(VERSION 3.15)
+set(TARGET grpc-server)
+set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+set(_REFLECTION grpc++_reflection)
+# Also search the Homebrew prefix on macOS.
+if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    link_directories("/opt/homebrew/lib")
+    include_directories("/opt/homebrew/include")
+endif()
+
+find_package(absl CONFIG REQUIRED)
+find_package(Protobuf CONFIG REQUIRED)
+find_package(gRPC CONFIG REQUIRED)
+
+# Code generators, looked up on the PATH.
+# NOTE(review): the results are not checked; if either program is missing the
+# failure only shows up when the custom command below runs.
+find_program(_PROTOBUF_PROTOC protoc)
+set(_GRPC_GRPCPP grpc++)
+find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+
+# The generated protobuf/gRPC headers live in the binary directory.
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+include_directories(${Protobuf_INCLUDE_DIRS})
+
+message(STATUS "Using protobuf version ${Protobuf_VERSION} | Protobuf_INCLUDE_DIRS: ${Protobuf_INCLUDE_DIRS} | CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}")
+
+# Proto file
+# The relative path is resolved against the directory of this CMakeLists.txt,
+# so it depends on where this file is placed inside the source tree.
+get_filename_component(hw_proto "../../../../../../pkg/grpc/proto/backend.proto" ABSOLUTE)
+get_filename_component(hw_proto_path "${hw_proto}" PATH)
+
+# Generated sources
+set(hw_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.cc")
+set(hw_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.pb.h")
+set(hw_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.cc")
+set(hw_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/backend.grpc.pb.h")
+
+# Runs protoc with the gRPC C++ plugin; re-run whenever the proto file changes.
+add_custom_command(
+      OUTPUT "${hw_proto_srcs}" "${hw_proto_hdrs}" "${hw_grpc_srcs}" "${hw_grpc_hdrs}"
+      COMMAND ${_PROTOBUF_PROTOC}
+      ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+        -I "${hw_proto_path}"
+        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+        "${hw_proto}"
+      DEPENDS "${hw_proto}")
+
+# hw_grpc_proto: library made of the generated protobuf and gRPC sources
+add_library(hw_grpc_proto
+  ${hw_grpc_srcs}
+  ${hw_grpc_hdrs}
+  ${hw_proto_srcs}
+  ${hw_proto_hdrs} )
+
+# The gRPC server executable.
+add_executable(${TARGET} grpc-server.cpp json.hpp )
+target_link_libraries(${TARGET} PRIVATE common llama myclip ${CMAKE_THREAD_LIBS_INIT} absl::flags hw_grpc_proto
+  absl::flags_parse
+  gRPC::${_REFLECTION}
+  gRPC::${_GRPC_GRPCPP}
+  protobuf::${_PROTOBUF_LIBPROTOBUF})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -0,0 +1,50 @@
+
+# Commit of llama.cpp to build against.
+LLAMA_VERSION?=d9b33fe95bd257b36c84ee5769cc048230067d6f
+
+# Extra arguments for the cmake configure step, and the acceleration to enable
+# (cublas, openblas, clblast, hipblas; empty for none).
+CMAKE_ARGS?=
+BUILD_TYPE?=
+# Location of the CLBlast CMake package, only used when BUILD_TYPE=clblast.
+# The default is a placeholder and has to be overridden, e.g.:
+#   make BUILD_TYPE=clblast CLBLAST_DIR=/opt/clblast grpc-server
+CLBLAST_DIR?=/some/path
+
+# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
+# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# to CMAKE_ARGS automatically
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+# If build type is clblast (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=$(CLBLAST_DIR)
+else ifeq ($(BUILD_TYPE),clblast)
+	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=$(CLBLAST_DIR)
+# If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++
+else ifeq ($(BUILD_TYPE),hipblas)
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
+endif
+
+# Clones llama.cpp and checks out the pinned LLAMA_VERSION on a local "build" branch.
+llama.cpp:
+	git clone --recurse-submodules https://github.com/ggerganov/llama.cpp llama.cpp
+	cd llama.cpp && git checkout -b build $(LLAMA_VERSION) && git submodule update --init --recursive --depth 1
+
+# Copies the gRPC server sources into the llama.cpp examples and registers the
+# new directory in the examples CMakeLists.txt, so that it is built as part of
+# llama.cpp. The registration line is appended, so this rule must run only once
+# per checkout.
+llama.cpp/examples/grpc-server:
+	mkdir -p llama.cpp/examples/grpc-server
+	cp -r $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -r $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
+	echo "add_subdirectory(grpc-server)" >> llama.cpp/examples/CMakeLists.txt
+## XXX: In some versions of CMake clip wasn't being built before llama.
+## This is a hack for now, but it should be fixed in the future.
+	cp -rfv llama.cpp/examples/llava/clip.h llama.cpp/examples/grpc-server/clip.h
+	cp -rfv llama.cpp/examples/llava/clip.cpp llama.cpp/examples/grpc-server/clip.cpp
+
+# rebuild and clean are commands, not files: declaring them phony keeps them
+# working even if a file with the same name exists in this directory.
+.PHONY: rebuild clean
+
+# Refreshes the sources copied into llama.cpp and builds the server again.
+rebuild:
+	cp -rfv $(abspath ./)/CMakeLists.txt llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/grpc-server.cpp llama.cpp/examples/grpc-server/
+	cp -rfv $(abspath ./)/json.hpp llama.cpp/examples/grpc-server/
+	rm -rf grpc-server
+	$(MAKE) grpc-server
+
+# Removes the llama.cpp checkout and the built binary.
+clean:
+	rm -rf llama.cpp
+	rm -rf grpc-server
+
+# Configures and builds llama.cpp (which includes the grpc-server example) and
+# copies the resulting binary next to this Makefile.
+grpc-server: llama.cpp llama.cpp/examples/grpc-server
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release
+	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
--- a/backend/cpp/llama/json.hpp
+++ b/backend/cpp/llama/json.hpp
--- a/charts/local-ai/Chart.yaml
+++ b/charts/local-ai/Chart.yaml
@@ -1,6 +0,0 @@
-apiVersion: v2
-appVersion: 0.1.0
-description: A Helm chart for LocalAI
-name: local-ai
-type: application
-version: 1.0.0
--- a/charts/local-ai/templates/_helpers.tpl
+++ b/charts/local-ai/templates/_helpers.tpl
@@ -1,44 +0,0 @@
-{{/*
-Expand the name of the chart.
-*/}}
-{{- define "local-ai.name" -}}
-{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
-{{- end }}
-
-{{/*
-Create a default fully qualified app name.
-We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
-If release name contains chart name it will be used as a full name.
-*/}}
-{{- define "local-ai.fullname" -}}
-{{- if .Values.fullnameOverride }}
-{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
-{{- else }}
-{{- $name := default .Chart.Name .Values.nameOverride }}
-{{- if contains $name .Release.Name }}
-{{- .Release.Name | trunc 63 | trimSuffix "-" }}
-{{- else }}
-{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
-{{- end }}
-{{- end }}
-{{- end }}
-
-{{/*
-Create chart name and version as used by the chart label.
-*/}}
-{{- define "local-ai.chart" -}}
-{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
-{{- end }}
-
-{{/*
-Common labels
-*/}}
-{{- define "local-ai.labels" -}}
-helm.sh/chart: {{ include "local-ai.chart" . }}
-app.kubernetes.io/name: {{ include "local-ai.name" . }}
-app.kubernetes.io/instance: "{{ .Release.Name }}"
-app.kubernetes.io/managed-by: {{ .Release.Service }}
-{{- if .Chart.AppVersion }}
-app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
-{{- end }}
-{{- end }}
--- a/charts/local-ai/templates/data-volume.yaml
+++ b/charts/local-ai/templates/data-volume.yaml
@@ -1,39 +0,0 @@
-{{- if .Values.dataVolume.enabled }}
-apiVersion: cdi.kubevirt.io/v1beta1
-kind: DataVolume
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-spec:
-  contentType: archive
-  source:
-    {{ .Values.dataVolume.source.type }}:
-      url: {{ .Values.dataVolume.source.url }}
-      secretRef: {{ template "local-ai.fullname" . }}
-      {{- if and (eq .Values.dataVolume.source.type "http") .Values.dataVolume.source.secretExtraHeaders }}
-      secretExtraHeaders: {{ .Values.dataVolume.source.secretExtraHeaders }}
-      {{- end }}
-      {{- if .Values.dataVolume.source.caCertConfigMap }}
-      caCertConfigMap: {{ .Values.dataVolume.source.caCertConfigMap }}
-      {{- end }}
-  pvc:
-    accessModes: {{ .Values.dataVolume.pvc.accessModes }}
-    resources:
-      requests:
-        storage: {{ .Values.dataVolume.pvc.size }}
---
-{{- if .Values.dataVolume.secret.enabled }}
-apiVersion: v1
-kind: Secret
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-data:
-  accessKeyId: {{ .Values.dataVolume.secret.username }}
-  secretKey: {{ .Values.dataVolume.secret.password }}
-{{- end }}
-{{- end }}
--- a/charts/local-ai/templates/deployment.yaml
+++ b/charts/local-ai/templates/deployment.yaml
@@ -1,39 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-spec:
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: {{ include "local-ai.name" . }}
-      app.kubernetes.io/instance: {{ .Release.Name }}
-  replicas: 1
-  template:
-    metadata:
-      name: {{ template "local-ai.fullname" . }}
-      labels:
-        app.kubernetes.io/name: {{ include "local-ai.name" . }}
-        app.kubernetes.io/instance: {{ .Release.Name }}
-    spec:
-      containers:
-        - name: {{ template "local-ai.fullname" . }}
-          image: {{ .Values.deployment.image }}
-          env:
-          - name: THREADS
-            value: {{ .Values.deployment.env.threads | quote }}
-          - name: CONTEXT_SIZE
-            value: {{ .Values.deployment.env.contextSize | quote }}
-          - name: MODELS_PATH
-            value: {{ .Values.deployment.env.modelsPath }}
-{{- if .Values.deployment.volume.enabled }}
-          volumeMounts:
-          - mountPath: {{ .Values.deployment.env.modelsPath }}
-            name: models
-      volumes:
-      - name: models
-        persistentVolumeClaim:
-          claimName: {{ template "local-ai.fullname" . }}
-{{- end }}
--- a/charts/local-ai/templates/service.yaml
+++ b/charts/local-ai/templates/service.yaml
@@ -1,19 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
-  name: {{ template "local-ai.fullname" . }}
-  namespace: {{ .Release.Namespace | quote }}
-  labels:
-    {{- include "local-ai.labels" . | nindent 4 }}
-{{- if .Values.service.annotations }}
-  annotations:
-  {{ toYaml .Values.service.annotations | indent 4 }}
-{{- end }}
-spec:
-  selector:
-    app.kubernetes.io/name: {{ include "local-ai.name" . }}
-  type: "{{ .Values.service.type }}"
-  ports:
-    - protocol: TCP
-      port: 8080
-      targetPort: 8080
--- a/charts/local-ai/values.yaml
+++ b/charts/local-ai/values.yaml
@@ -1,38 +0,0 @@
-deployment:
-  image: quay.io/go-skynet/local-ai:latest
-  env:
-    threads: 14
-    contextSize: 512
-    modelsPath: "/models"
-  volume:
-    enabled: false
-
-service:
-  type: ClusterIP
-  annotations: {}
-  # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
-  # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
-
-# Optionally create a PVC containing a model binary, sourced from an arbitrary HTTP server or S3 bucket
-# (requires https://github.com/kubevirt/containerized-data-importer)
-dataVolume:
-  enabled: false
-  source:
-    type: "http" # Source type. One of: [ http | s3 ]
-    url: "http://<model_server>/<model_archive>" # e.g. koala-7B-4bit-128g.GGML.tar
-
-    # CertConfigMap is an optional ConfigMap reference, containing a Certificate Authority (CA) public key
-    # and a base64 encoded pem certificate
-    caCertConfigMap: ""
-
-    # SecretExtraHeaders is an optional list of Secret references, each containing an extra HTTP header
-    # that may include sensitive information. Only applicable for the http source type.
-    secretExtraHeaders: []
-  pvc:
-    accessModes:
-    - ReadWriteOnce
-    size: 5Gi
-  secret:
-    enabled: false
-    username: "" # base64 encoded
-    password: "" # base64 encoded
--- a/cmd/grpc/bert-embeddings/main.go
+++ b/cmd/grpc/bert-embeddings/main.go
@@ -0,0 +1,22 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	bert "github.com/go-skynet/LocalAI/pkg/backend/llm/bert"
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &bert.Embeddings{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/dolly/main.go
+++ b/cmd/grpc/dolly/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &transformers.Dolly{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/falcon-ggml/main.go
+++ b/cmd/grpc/falcon-ggml/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &transformers.Falcon{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/gpt2/main.go
+++ b/cmd/grpc/gpt2/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &transformers.GPT2{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/gpt4all/main.go
+++ b/cmd/grpc/gpt4all/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	gpt4all "github.com/go-skynet/LocalAI/pkg/backend/llm/gpt4all"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &gpt4all.LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/gptj/main.go
+++ b/cmd/grpc/gptj/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &transformers.GPTJ{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/gptneox/main.go
+++ b/cmd/grpc/gptneox/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &transformers.GPTNeoX{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/langchain-huggingface/main.go
+++ b/cmd/grpc/langchain-huggingface/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	langchain "github.com/go-skynet/LocalAI/pkg/backend/llm/langchain"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &langchain.LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/llama-stable/main.go
+++ b/cmd/grpc/llama-stable/main.go
@@ -0,0 +1,21 @@
+package main
+
+import (
+	"flag"
+
+	llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama-stable"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/llama/main.go
+++ b/cmd/grpc/llama/main.go
@@ -0,0 +1,25 @@
+package main
+
+// GRPC llama server
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	llama "github.com/go-skynet/LocalAI/pkg/backend/llm/llama"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &llama.LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/mpt/main.go
+++ b/cmd/grpc/mpt/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &transformers.MPT{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/piper/main.go
+++ b/cmd/grpc/piper/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	tts "github.com/go-skynet/LocalAI/pkg/backend/tts"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &tts.Piper{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/replit/main.go
+++ b/cmd/grpc/replit/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &transformers.Replit{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/rwkv/main.go
+++ b/cmd/grpc/rwkv/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	rwkv "github.com/go-skynet/LocalAI/pkg/backend/llm/rwkv"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &rwkv.LLM{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/stablediffusion/main.go
+++ b/cmd/grpc/stablediffusion/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	image "github.com/go-skynet/LocalAI/pkg/backend/image"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &image.StableDiffusion{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/starcoder/main.go
+++ b/cmd/grpc/starcoder/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	transformers "github.com/go-skynet/LocalAI/pkg/backend/llm/transformers"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &transformers.Starcoder{}); err != nil {
+		panic(err)
+	}
+}
--- a/cmd/grpc/whisper/main.go
+++ b/cmd/grpc/whisper/main.go
@@ -0,0 +1,23 @@
+package main
+
+// Note: this is started internally by LocalAI and a server is allocated for each model
+
+import (
+	"flag"
+
+	transcribe "github.com/go-skynet/LocalAI/pkg/backend/transcribe"
+
+	grpc "github.com/go-skynet/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+func main() {
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &transcribe.Whisper{}); err != nil {
+		panic(err)
+	}
+}
--- a/custom-ca-certs/.keep
+++ b/custom-ca-certs/.keep
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -12,4 +12,5 @@ services:
      - .env
    volumes:
      - ./models:/models:cached
-    command: ["/usr/bin/local-ai" ]
+      - ./images/:/tmp/generated/images/
+    command: ["/usr/bin/local-ai" ]
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+set -e
+
+cd /build
+
+if [ "$REBUILD" != "false" ]; then
+	rm -rf ./local-ai
+	make build -j${BUILD_PARALLELISM:-1}
+else
+	echo "@@@@@"
+	echo "Skipping rebuild"
+	echo "@@@@@"
+	echo "If you are experiencing issues with the pre-compiled builds, try setting REBUILD=true"
+	echo "If you are still experiencing issues with the build, try setting CMAKE_ARGS and disable the instructions set as needed:"
+	echo 'CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"'
+	echo "see the documentation at: https://localai.io/basics/build/index.html"
+	echo "Note: See also https://github.com/go-skynet/LocalAI/issues/288"
+	echo "@@@@@"
+	echo "CPU info:"
+	grep -e "model\sname" /proc/cpuinfo | head -1
+	grep -e "flags" /proc/cpuinfo | head -1
+	if grep -q -e "\savx\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX    found OK"
+	else
+		echo "CPU: no AVX    found"
+	fi
+	if grep -q -e "\savx2\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX2   found OK"
+	else
+		echo "CPU: no AVX2   found"
+	fi
+	if grep -q -e "\savx512" /proc/cpuinfo ; then
+		echo "CPU:    AVX512 found OK"
+	else
+		echo "CPU: no AVX512 found"
+	fi
+	echo "@@@@@"
+fi
+
+./local-ai "$@"
--- a/examples/README.md
+++ b/examples/README.md
@@ -0,0 +1,182 @@
+# Examples
+
+| [ChatGPT OSS alternative](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui)                                                                                                                | [Image generation](https://localai.io/api-endpoints/index.html#image-generation)                                                                                                              |
+|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
+|  ![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)            | ![b6441997879](https://github.com/go-skynet/LocalAI/assets/2420543/d50af51c-51b7-4f39-b6c2-bf04c403894c)                  |
+
+|                                                                    [Telegram bot](https://github.com/go-skynet/LocalAI/tree/master/examples/telegram-bot)   | [Flowise](https://github.com/go-skynet/LocalAI/tree/master/examples/flowise)                                                                                                                     |
+|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|
+| ![Screenshot from 2023-06-09 00-36-26](https://github.com/go-skynet/LocalAI/assets/2420543/e98b4305-fa2d-41cf-9d2f-1bb2d75ca902) | ![Screenshot from 2023-05-30 18-01-03](https://github.com/go-skynet/LocalAI/assets/2420543/02458782-0549-4131-971c-95ee56ec1af8) |
+
+Here is a list of projects that can easily be integrated with the LocalAI backend. 
+
+
+## Projects
+
+### AutoGPT
+
+_by [@mudler](https://github.com/mudler)_
+
+This example shows how to use AutoGPT with LocalAI.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/autoGPT/)
+
+### Chatbot-UI
+
+_by [@mkellerman](https://github.com/mkellerman)_
+
+![Screenshot from 2023-04-26 23-59-55](https://user-images.githubusercontent.com/2420543/234715439-98d12e03-d3ce-4f94-ab54-2b256808e05e.png)
+
+This integration shows how to use LocalAI with [mckaywrigley/chatbot-ui](https://github.com/mckaywrigley/chatbot-ui).
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui/)
+
+There is also a separate example showing how to manually set up a model: [example](https://github.com/go-skynet/LocalAI/tree/master/examples/chatbot-ui-manual/)
+
+### K8sGPT
+
+_by [@mudler](https://github.com/mudler)_
+
+This example shows how to use LocalAI inside Kubernetes with [k8sgpt](https://k8sgpt.ai).
+
+![Screenshot from 2023-06-19 23-58-47](https://github.com/go-skynet/go-ggml-transformers.cpp/assets/2420543/cab87409-ee68-44ae-8d53-41627fb49509)
+
+### Flowise
+
+_by [@mudler](https://github.com/mudler)_
+
+This example shows how to use [FlowiseAI/Flowise](https://github.com/FlowiseAI/Flowise) with LocalAI.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/flowise/)
+
+### Discord bot
+
+_by [@mudler](https://github.com/mudler)_
+
+Run a discord bot which lets you talk directly with a model
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/discord-bot/), or for a live demo you can talk with our bot in #random-bot in our discord server.
+
+### Langchain
+
+_by [@dave-gray101](https://github.com/dave-gray101)_
+
+A ready-to-use example showing end-to-end how to integrate LocalAI with langchain
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/langchain/)
+
+### Langchain Python
+
+_by [@mudler](https://github.com/mudler)_
+
+A ready-to-use example showing end-to-end how to integrate LocalAI with langchain
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/langchain-python/)
+
+### LocalAI functions
+
+_by [@mudler](https://github.com/mudler)_
+
+A ready-to-use example showing how to use OpenAI functions with LocalAI
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/functions/)
+
+### LocalAI WebUI
+
+_by [@dhruvgera](https://github.com/dhruvgera)_
+
+![image](https://user-images.githubusercontent.com/42107491/235344183-44b5967d-ba22-4331-804c-8da7004a5d35.png)
+
+A light, community-maintained web interface for LocalAI
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/localai-webui/)
+
+### How to run rwkv models
+
+_by [@mudler](https://github.com/mudler)_
+
+A full example on how to run RWKV models with LocalAI
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/rwkv/)
+
+### PrivateGPT
+
+_by [@mudler](https://github.com/mudler)_
+
+A full example on how to run PrivateGPT with LocalAI
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/privateGPT/)
+
+### Slack bot
+
+_by [@mudler](https://github.com/mudler)_
+
+Run a slack bot which lets you talk directly with a model
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/slack-bot/)
+
+### Slack bot (Question answering)
+
+_by [@mudler](https://github.com/mudler)_
+
+Run a slack bot, ideally for teams, which lets you ask questions on a documentation website or a GitHub repository.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/slack-qa-bot/)
+
+### Question answering on documents with llama-index
+
+_by [@mudler](https://github.com/mudler)_
+
+Shows how to integrate with [Llama-Index](https://gpt-index.readthedocs.io/en/stable/getting_started/installation.html) to enable question answering on a set of documents.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/query_data/)
+
+### Question answering on documents with langchain and chroma
+
+_by [@mudler](https://github.com/mudler)_
+
+Shows how to integrate with `Langchain` and `Chroma` to enable question answering on a set of documents.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/langchain-chroma/)
+
+### Telegram bot
+
+_by [@mudler](https://github.com/mudler)_
+
+![Screenshot from 2023-06-09 00-36-26](https://github.com/go-skynet/LocalAI/assets/2420543/e98b4305-fa2d-41cf-9d2f-1bb2d75ca902)
+
+Use LocalAI to power a Telegram bot assistant, with Image generation and audio support!
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/telegram-bot/)
+
+### Template for Runpod.io
+
+_by [@fHachenberg](https://github.com/fHachenberg)_
+
+Allows running any LocalAI-compatible model as a backend on the servers of https://runpod.io
+
+[Check it out here](https://runpod.io/gsc?template=uv9mtqnrd0&ref=984wlcra)
+
+### Continue
+
+_by [@gruberdev](https://github.com/gruberdev)_
+
+<img src="continue/img/screen.png" width="600" height="200" alt="Screenshot">
+
+Demonstrates how to integrate an open-source copilot alternative that enhances code analysis, completion, and improvements. This approach seamlessly integrates with any LocalAI model, offering a more user-friendly experience.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/continue/)
+
+### Streamlit bot
+
+_by [@majoshi1](https://github.com/majoshi1)_
+
+![Screenshot](streamlit-bot/streamlit-bot.png)
+
+A chat bot made using `Streamlit` & LocalAI.
+
+[Check it out here](https://github.com/go-skynet/LocalAI/tree/master/examples/streamlit-bot/)
+
+## Want to contribute?
+
+Create an issue, and put `Example: <description>` in the title! We will post your examples here.
--- a/examples/autoGPT/.env.example
+++ b/examples/autoGPT/.env.example
@@ -0,0 +1,9 @@
+# CPU .env docs: https://localai.io/howtos/easy-setup-docker-cpu/
+# GPU .env docs: https://localai.io/howtos/easy-setup-docker-gpu/
+
+OPENAI_API_KEY=sk---anystringhere
+OPENAI_API_BASE=http://api:8080/v1
+# Models to preload at start
+# Here we configure gpt4all as gpt-3.5-turbo and bert as embeddings,
+# see other options in the model gallery at https://github.com/go-skynet/model-gallery
+PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
--- a/examples/autoGPT/README.md
+++ b/examples/autoGPT/README.md
@@ -0,0 +1,36 @@
+# AutoGPT
+
+Example of integration with [AutoGPT](https://github.com/Significant-Gravitas/Auto-GPT).
+
+## Run
+
+```bash
+# Clone LocalAI
+git clone https://github.com/go-skynet/LocalAI
+
+cd LocalAI/examples/autoGPT
+
+cp -rfv .env.example .env
+
+# Edit the .env file to set a different model by editing `PRELOAD_MODELS`.
+vim .env
+
+docker-compose run --rm auto-gpt
+```
+
+Note: The example automatically downloads the `gpt4all` model as it is under a permissive license. The GPT4All model does not seem to be enough to run AutoGPT. WizardLM-7b-uncensored seems to perform better (with `f16: true`).
+
+
+## Without docker
+
+Run AutoGPT with `OPENAI_API_BASE` pointing to the LocalAI endpoint. If you run it locally for instance:
+
+```
+OPENAI_API_BASE=http://localhost:8080 python ...
+```
+
+Note: you need two models, named `gpt-3.5-turbo` and `text-embedding-ada-002`. You can preload them in LocalAI at start by setting the following in the env:
+
+```
+PRELOAD_MODELS=[{"url": "github:go-skynet/model-gallery/gpt4all-j.yaml", "name": "gpt-3.5-turbo"}, { "url": "github:go-skynet/model-gallery/bert-embeddings.yaml", "name": "text-embedding-ada-002"}]
+```
--- a/examples/autoGPT/docker-compose.yaml
+++ b/examples/autoGPT/docker-compose.yaml
@@ -0,0 +1,42 @@
+version: "3.9"
+services:
+  api:
+    image: quay.io/go-skynet/local-ai:latest
+    ports:
+      - 8080:8080
+    env_file:
+      - .env
+    environment:
+      - DEBUG=true
+      - MODELS_PATH=/models
+    volumes:
+      - ./models:/models:cached
+    command: ["/usr/bin/local-ai" ]
+  auto-gpt:
+    image: significantgravitas/auto-gpt
+    depends_on:
+      api:
+        condition: service_healthy
+      redis:
+        condition: service_started
+    env_file:
+      - .env
+    environment:
+      MEMORY_BACKEND: ${MEMORY_BACKEND:-redis}
+      REDIS_HOST: ${REDIS_HOST:-redis}
+    profiles: ["exclude-from-up"]
+    volumes:
+      - ./auto_gpt_workspace:/app/autogpt/auto_gpt_workspace
+      - ./data:/app/data
+      ## allow auto-gpt to write logs to disk
+      - ./logs:/app/logs
+      ## uncomment following lines if you want to make use of these files
+      ## you must have them existing in the same folder as this docker-compose.yml
+      #- type: bind
+      #  source: ./azure.yaml
+      #  target: /app/azure.yaml
+      #- type: bind
+      #  source: ./ai_settings.yaml
+      #  target: /app/ai_settings.yaml
+  redis:
+    image: "redis/redis-stack-server:latest"
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: backend monitor
+  type: http
+  seq: 4
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
+  body: none
+  auth: none
+}
--- a/monitor/backend-shutdown.bru
+++ b/monitor/backend-shutdown.bru
@@ -0,0 +1,21 @@
+meta {
+  name: backend-shutdown
+  type: http
+  seq: 3
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,5 @@
+{
+  "version": "1",
+  "name": "LocalAI Test Requests",
+  "type": "collection"
+}
--- a/Requests/environments/localhost.bru
+++ b/Requests/environments/localhost.bru
@@ -0,0 +1,6 @@
+vars {
+  HOST: localhost
+  PORT: 8080
+  DEFAULT_MODEL: gpt-3.5-turbo
+  PROTOCOL: http://
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,11 @@
+meta {
+  name: get models list
+  type: http
+  seq: 2
+}
+
+get {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
+  body: none
+  auth: none
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,25 @@
+meta {
+  name: Generate image
+  type: http
+  seq: 1
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+    "prompt": "<positive prompt>|<negative prompt>",
+    "model": "model-name",
+    "step": 51,
+    "size": "1024x1024",
+    "image": ""
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,24 @@
+meta {
+  name: -completions
+  type: http
+  seq: 4
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "prompt": "function downloadFile(string url, string outputPath) {",
+      "max_tokens": 256,
+      "temperature": 0.5
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,23 @@
+meta {
+  name: -edits
+  type: http
+  seq: 5
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "What day of the wek is it?",
+      "instruction": "Fix the spelling mistakes"
+  }
+}
--- a/examples/bruno/LocalAI
+++ b/examples/bruno/LocalAI
@@ -0,0 +1,22 @@
+meta {
+  name: -embeddings
+  type: http
+  seq: 6
+}
+
+post {
+  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
+  body: json
+  auth: none
+}
+
+headers {
+  Content-Type: application/json
+}
+
+body:json {
+  {
+      "model": "{{DEFAULT_MODEL}}",
+      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
+  }
+}
--- a/Show More
+++ b/Show More